Diffstat (limited to 'src/video_core')
64 files changed, 2537 insertions, 490 deletions
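The centerpiece of this change is the new guest query cache (query_cache.h plus OpenGL and Vulkan backends), which defers SamplesPassed counter queries and flushes them when the guest reads the target memory. A minimal sketch of how a counter query gets recorded, assuming a concrete QueryCache instantiation and using the Query() signature introduced below (the helper name is hypothetical):

    // Counter queries are cached rather than resolved immediately; the cache
    // flushes them once the guest reads the memory they target.
    template <class QueryCache>
    void RecordSamplesPassed(QueryCache& cache, GPUVAddr gpu_addr, bool long_query,
                             u64 gpu_ticks) {
        // Short queries carry no timestamp; long queries also stamp GetTicks().
        const std::optional<u64> timestamp =
            long_query ? std::optional<u64>{gpu_ticks} : std::nullopt;
        cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, timestamp);
    }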
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index ccfed4f2e..4b0c6346f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -29,12 +29,15 @@ add_library(video_core STATIC gpu_synch.h gpu_thread.cpp gpu_thread.h + guest_driver.cpp + guest_driver.h macro_interpreter.cpp macro_interpreter.h memory_manager.cpp memory_manager.h morton.cpp morton.h + query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h rasterizer_cache.cpp @@ -72,6 +75,8 @@ add_library(video_core STATIC renderer_opengl/gl_stream_buffer.h renderer_opengl/gl_texture_cache.cpp renderer_opengl/gl_texture_cache.h + renderer_opengl/gl_query_cache.cpp + renderer_opengl/gl_query_cache.h renderer_opengl/maxwell_to_gl.h renderer_opengl/renderer_opengl.cpp renderer_opengl/renderer_opengl.h @@ -154,6 +159,7 @@ if (ENABLE_VULKAN) renderer_vulkan/maxwell_to_vk.cpp renderer_vulkan/maxwell_to_vk.h renderer_vulkan/renderer_vulkan.h + renderer_vulkan/renderer_vulkan.cpp renderer_vulkan/vk_blit_screen.cpp renderer_vulkan/vk_blit_screen.h renderer_vulkan/vk_buffer_cache.cpp @@ -174,6 +180,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_memory_manager.h renderer_vulkan/vk_pipeline_cache.cpp renderer_vulkan/vk_pipeline_cache.h + renderer_vulkan/vk_query_cache.cpp + renderer_vulkan/vk_query_cache.h renderer_vulkan/vk_rasterizer.cpp renderer_vulkan/vk_rasterizer.h renderer_vulkan/vk_renderpass_cache.cpp diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 0510ed777..186aca61d 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -101,7 +101,10 @@ public: void TickFrame() { ++epoch; while (!pending_destruction.empty()) { - if (pending_destruction.front()->GetEpoch() + 1 > epoch) { + // Delay at least 4 frames before destruction. + // This is due to triple buffering happening on some drivers. 
+ static constexpr u64 epochs_to_destroy = 5; + if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) { break; } pending_destruction.pop_front(); diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index 44b8b8d22..d56a47710 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -9,6 +9,7 @@ #include "common/common_types.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/engines/shader_type.h" +#include "video_core/guest_driver.h" #include "video_core/textures/texture.h" namespace Tegra::Engines { @@ -106,6 +107,9 @@ public: virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const = 0; virtual u32 GetBoundBuffer() const = 0; + + virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; + virtual const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const = 0; }; } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 110406f2f..4b824aa4e 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -94,6 +94,14 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con return result; } +VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() { + return rasterizer.AccessGuestDriverProfile(); +} + +const VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() const { + return rasterizer.AccessGuestDriverProfile(); +} + void KeplerCompute::ProcessLaunch() { const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address(); memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description, diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 4ef3e0613..eeb79c56f 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -218,6 +218,10 @@ public: return regs.tex_cb_index; } + VideoCore::GuestDriverProfile& AccessGuestDriverProfile() override; + + const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; + private: Core::System& system; VideoCore::RasterizerInterface& rasterizer; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 58dfa8033..b28de1092 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -4,17 +4,21 @@ #include <cinttypes> #include <cstring> +#include <optional> #include "common/assert.h" #include "core/core.h" #include "core/core_timing.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/textures/texture.h" namespace Tegra::Engines { +using VideoCore::QueryType; + /// First register id that is actually a Macro call. 
constexpr u32 MacroRegistersStart = 0xE00; @@ -399,6 +403,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessQueryCondition(); break; } + case MAXWELL3D_REG_INDEX(counter_reset): { + ProcessCounterReset(); + break; + } case MAXWELL3D_REG_INDEX(sync_info): { ProcessSyncPoint(); break; @@ -481,7 +489,7 @@ void Maxwell3D::FlushMMEInlineDraw() { const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed; if (ShouldExecute()) { - rasterizer.DrawMultiBatch(is_indexed); + rasterizer.Draw(is_indexed, true); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -519,61 +527,51 @@ void Maxwell3D::ProcessFirmwareCall4() { regs.reg_array[0xd00] = 1; } -void Maxwell3D::ProcessQueryGet() { +void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { + struct LongQueryResult { + u64_le value; + u64_le timestamp; + }; + static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); const GPUVAddr sequence_address{regs.query.QueryAddress()}; - // Since the sequence address is given as a GPU VAddr, we have to convert it to an application - // VAddr before writing. + if (long_query) { + // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast + // GPU, this command may actually take a while to complete in real hardware due to GPU + // wait queues. + LongQueryResult query_result{payload, system.GPU().GetTicks()}; + memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); + } else { + memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload)); + } +} +void Maxwell3D::ProcessQueryGet() { // TODO(Subv): Support the other query units. ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, "Units other than CROP are unimplemented"); - u64 result = 0; - - // TODO(Subv): Support the other query variables - switch (regs.query.query_get.select) { - case Regs::QuerySelect::Zero: - // This seems to actually write the query sequence to the query address. - result = regs.query.query_sequence; + switch (regs.query.query_get.operation) { + case Regs::QueryOperation::Release: + StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0); break; - default: - result = 1; - UNIMPLEMENTED_MSG("Unimplemented query select type {}", - static_cast<u32>(regs.query.query_get.select.Value())); - } - - // TODO(Subv): Research and implement how query sync conditions work. - - struct LongQueryResult { - u64_le value; - u64_le timestamp; - }; - static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); - - switch (regs.query.query_get.mode) { - case Regs::QueryMode::Write: - case Regs::QueryMode::Write2: { - u32 sequence = regs.query.query_sequence; - if (regs.query.query_get.short_query) { - // Write the current query sequence to the sequence address. - // TODO(Subv): Find out what happens if you use a long query type but mark it as a short - // query. - memory_manager.Write<u32>(sequence_address, sequence); - } else { - // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast - // GPU, this command may actually take a while to complete in real hardware due to GPU - // wait queues. 
- LongQueryResult query_result{}; - query_result.value = result; - // TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming - query_result.timestamp = system.CoreTiming().GetTicks(); - memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); + case Regs::QueryOperation::Acquire: + // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that + // matches the current payload. + UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); + break; + case Regs::QueryOperation::Counter: + if (const std::optional<u64> result = GetQueryResult()) { + // If the query returns an empty optional it means it's cached and deferred. + // In this case we have a non-empty result, so we stamp it immediately. + StampQueryResult(*result, regs.query.query_get.short_query == 0); } break; - } + case Regs::QueryOperation::Trap: + UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); + break; default: - UNIMPLEMENTED_MSG("Query mode {} not implemented", - static_cast<u32>(regs.query.query_get.mode.Value())); + UNIMPLEMENTED_MSG("Unknown query operation"); + break; } } @@ -590,20 +588,20 @@ void Maxwell3D::ProcessQueryCondition() { } case Regs::ConditionMode::ResNonZero: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U; break; } case Regs::ConditionMode::Equal: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode; break; } case Regs::ConditionMode::NotEqual: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode; break; @@ -616,6 +614,18 @@ void Maxwell3D::ProcessQueryCondition() { } } +void Maxwell3D::ProcessCounterReset() { + switch (regs.counter_reset) { + case Regs::CounterReset::SampleCnt: + rasterizer.ResetCounter(QueryType::SamplesPassed); + break; + default: + LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}", + static_cast<int>(regs.counter_reset)); + break; + } +} + void Maxwell3D::ProcessSyncPoint() { const u32 sync_point = regs.sync_info.sync_point.Value(); const u32 increment = regs.sync_info.increment.Value(); @@ -644,7 +654,7 @@ void Maxwell3D::DrawArrays() { const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count}; if (ShouldExecute()) { - rasterizer.DrawBatch(is_indexed); + rasterizer.Draw(is_indexed, false); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -658,6 +668,22 @@ void Maxwell3D::DrawArrays() { } } +std::optional<u64> Maxwell3D::GetQueryResult() { + switch (regs.query.query_get.select) { + case Regs::QuerySelect::Zero: + return 0; + case Regs::QuerySelect::SamplesPassed: + // Deferred. 
+ rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, + system.GPU().GetTicks()); + return {}; + default: + UNIMPLEMENTED_MSG("Unimplemented query select type {}", + static_cast<u32>(regs.query.query_get.select.Value())); + return 1; + } +} + void Maxwell3D::ProcessCBBind(std::size_t stage_index) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage. auto& shader = state.shader_stages[stage_index]; @@ -784,4 +810,12 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b return result; } +VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() { + return rasterizer.AccessGuestDriverProfile(); +} + +const VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() const { + return rasterizer.AccessGuestDriverProfile(); +} + } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index ee79260fc..26939be3f 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@ #include <array> #include <bitset> +#include <optional> #include <type_traits> #include <unordered_map> #include <vector> @@ -71,12 +72,11 @@ public: static constexpr std::size_t MaxConstBuffers = 18; static constexpr std::size_t MaxConstBufferSize = 0x10000; - enum class QueryMode : u32 { - Write = 0, - Sync = 1, - // TODO(Subv): It is currently unknown what the difference between method 2 and method 0 - // is. - Write2 = 2, + enum class QueryOperation : u32 { + Release = 0, + Acquire = 1, + Counter = 2, + Trap = 3, }; enum class QueryUnit : u32 { @@ -410,6 +410,27 @@ public: Linear = 1, }; + enum class CounterReset : u32 { + SampleCnt = 0x01, + Unk02 = 0x02, + Unk03 = 0x03, + Unk04 = 0x04, + EmittedPrimitives = 0x10, // Not tested + Unk11 = 0x11, + Unk12 = 0x12, + Unk13 = 0x13, + Unk15 = 0x15, + Unk16 = 0x16, + Unk17 = 0x17, + Unk18 = 0x18, + Unk1A = 0x1A, + Unk1B = 0x1B, + Unk1C = 0x1C, + Unk1D = 0x1D, + Unk1E = 0x1E, + GeneratedPrimitives = 0x1F, + }; + struct Cull { enum class FrontFace : u32 { ClockWise = 0x0900, @@ -704,8 +725,8 @@ public: INSERT_UNION_PADDING_WORDS(0x15); s32 stencil_back_func_ref; - u32 stencil_back_func_mask; u32 stencil_back_mask; + u32 stencil_back_func_mask; INSERT_UNION_PADDING_WORDS(0xC); @@ -858,11 +879,19 @@ public: BitField<7, 1, u32> c7; } clip_distance_enabled; - INSERT_UNION_PADDING_WORDS(0x1); + u32 samplecnt_enable; float point_size; - INSERT_UNION_PADDING_WORDS(0x7); + INSERT_UNION_PADDING_WORDS(0x1); + + u32 point_sprite_enable; + + INSERT_UNION_PADDING_WORDS(0x3); + + CounterReset counter_reset; + + INSERT_UNION_PADDING_WORDS(0x1); u32 zeta_enable; @@ -1077,7 +1106,7 @@ public: u32 query_sequence; union { u32 raw; - BitField<0, 2, QueryMode> mode; + BitField<0, 2, QueryOperation> operation; BitField<4, 1, u32> fence; BitField<12, 4, QueryUnit> unit; BitField<16, 1, QuerySyncCondition> sync_cond; @@ -1306,6 +1335,10 @@ public: return regs.tex_cb_index; } + VideoCore::GuestDriverProfile& AccessGuestDriverProfile() override; + + const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; + /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than /// we've seen used. using MacroMemory = std::array<u32, 0x40000>; @@ -1405,9 +1438,15 @@ private: /// Handles a write to the QUERY_GET register. void ProcessQueryGet(); - // Handles Conditional Rendering + /// Writes the query result accordingly. 
+ void StampQueryResult(u64 payload, bool long_query); + + /// Handles conditional rendering. void ProcessQueryCondition(); + /// Handles counter resets. + void ProcessCounterReset(); + /// Handles writes to syncing register. void ProcessSyncPoint(); @@ -1424,6 +1463,9 @@ private: // Handles a instance drawcall from MME void StepInstance(MMEDrawMode expected_mode, u32 count); + + /// Returns a query's value or an empty object if the value will be deferred through a cache. + std::optional<u64> GetQueryResult(); }; #define ASSERT_REG_POSITION(field_name, position) \ @@ -1454,8 +1496,8 @@ ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372); ASSERT_REG_POSITION(patch_vertices, 0x373); ASSERT_REG_POSITION(scissor_test, 0x380); ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5); -ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D6); -ASSERT_REG_POSITION(stencil_back_mask, 0x3D7); +ASSERT_REG_POSITION(stencil_back_mask, 0x3D6); +ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7); ASSERT_REG_POSITION(color_mask_common, 0x3E4); ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); ASSERT_REG_POSITION(depth_bounds, 0x3E7); @@ -1489,7 +1531,10 @@ ASSERT_REG_POSITION(screen_y_control, 0x4EB); ASSERT_REG_POSITION(vb_element_base, 0x50D); ASSERT_REG_POSITION(vb_base_instance, 0x50E); ASSERT_REG_POSITION(clip_distance_enabled, 0x544); +ASSERT_REG_POSITION(samplecnt_enable, 0x545); ASSERT_REG_POSITION(point_size, 0x546); +ASSERT_REG_POSITION(point_sprite_enable, 0x548); +ASSERT_REG_POSITION(counter_reset, 0x54C); ASSERT_REG_POSITION(zeta_enable, 0x54E); ASSERT_REG_POSITION(multisample_control, 0x54F); ASSERT_REG_POSITION(condition, 0x554); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 6f98bd827..c9bc83cd7 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -227,6 +227,28 @@ enum class AtomicOp : u64 { Exch = 8, }; +enum class GlobalAtomicOp : u64 { + Add = 0, + Min = 1, + Max = 2, + Inc = 3, + Dec = 4, + And = 5, + Or = 6, + Xor = 7, + Exch = 8, + SafeAdd = 10, +}; + +enum class GlobalAtomicType : u64 { + U32 = 0, + S32 = 1, + U64 = 2, + F32_FTZ_RN = 3, + F16x2_FTZ_RN = 4, + S64 = 5, +}; + enum class UniformType : u64 { UnsignedByte = 0, SignedByte = 1, @@ -602,6 +624,19 @@ enum class ShuffleOperation : u64 { Bfly = 3, // shuffleXorNV }; +enum class ShfType : u64 { + Bits32 = 0, + U64 = 2, + S64 = 3, +}; + +enum class ShfXmode : u64 { + None = 0, + HI = 1, + X = 2, + XHI = 3, +}; + union Instruction { constexpr Instruction& operator=(const Instruction& instr) { value = instr.value; @@ -754,6 +789,13 @@ union Instruction { } shr; union { + BitField<37, 2, ShfType> type; + BitField<48, 2, ShfXmode> xmode; + BitField<50, 1, u64> wrap; + BitField<20, 6, u64> immediate; + } shf; + + union { BitField<39, 5, u64> shift_amount; BitField<48, 1, u64> negate_b; BitField<49, 1, u64> negate_a; @@ -958,6 +1000,12 @@ union Instruction { } stg; union { + BitField<52, 4, GlobalAtomicOp> operation; + BitField<49, 3, GlobalAtomicType> type; + BitField<28, 20, s64> offset; + } atom; + + union { BitField<52, 4, AtomicOp> operation; BitField<28, 2, AtomicType> type; BitField<30, 22, s64> offset; @@ -1096,6 +1144,11 @@ union Instruction { } fset; union { + BitField<47, 1, u64> ftz; + BitField<48, 4, PredCondition> cond; + } fcmp; + + union { BitField<49, 1, u64> bf; BitField<35, 3, PredCondition> cond; BitField<50, 1, u64> ftz; @@ -1624,11 +1677,11 @@ union Instruction { } xmad; union { - BitField<20, 14, u64> offset; + 
BitField<20, 14, u64> shifted_offset; BitField<34, 5, u64> index; u64 GetOffset() const { - return offset * 4; + return shifted_offset * 4; } } cbuf34; @@ -1675,6 +1728,7 @@ public: BFE_C, BFE_R, BFE_IMM, + BFI_RC, BFI_IMM_R, BRA, BRX, @@ -1690,6 +1744,7 @@ public: ST_S, ST, // Store in generic memory STG, // Store in global memory + ATOM, // Atomic operation on global memory ATOMS, // Atomic operation on shared memory AL2P, // Transforms attribute memory into physical memory TEX, @@ -1771,6 +1826,7 @@ public: ICMP_R, ICMP_CR, ICMP_IMM, + FCMP_R, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -1994,6 +2050,7 @@ private: INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"), INST("101-------------", Id::ST, Type::Memory, "ST"), INST("1110111011011---", Id::STG, Type::Memory, "STG"), + INST("11101101--------", Id::ATOM, Type::Memory, "ATOM"), INST("11101100--------", Id::ATOMS, Type::Memory, "ATOMS"), INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"), INST("110000----111---", Id::TEX, Type::Texture, "TEX"), @@ -2074,6 +2131,7 @@ private: INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"), INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"), INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), + INST("010110111010----", Id::FCMP_R, Type::Arithmetic, "FCMP_R"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), @@ -2098,6 +2156,7 @@ private: INST("0100110000000---", Id::BFE_C, Type::Bfe, "BFE_C"), INST("0101110000000---", Id::BFE_R, Type::Bfe, "BFE_R"), INST("0011100-00000---", Id::BFE_IMM, Type::Bfe, "BFE_IMM"), + INST("0101001111110---", Id::BFI_RC, Type::Bfi, "BFI_RC"), INST("0011011-11110---", Id::BFI_IMM_R, Type::Bfi, "BFI_IMM_R"), INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"), INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index b9c5c41a2..7d7137109 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -6,6 +6,7 @@ #include "common/microprofile.h" #include "core/core.h" #include "core/core_timing.h" +#include "core/core_timing_util.h" #include "core/memory.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_compute.h" @@ -122,6 +123,19 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { return true; } +u64 GPU::GetTicks() const { + // These values were reverse engineered by fincs from NVN. + // The GPU clock is reported at a ratio of 384/625 ticks per nanosecond (614.4 MHz). + constexpr u64 gpu_ticks_num = 384; + constexpr u64 gpu_ticks_den = 625; + + const u64 cpu_ticks = system.CoreTiming().GetTicks(); + const u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count(); + const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; + const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; + return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; +} + void GPU::FlushCommands() { renderer.Rasterizer().FlushCommands(); } @@ -340,7 +354,7 @@ void GPU::ProcessSemaphoreTriggerMethod() { block.sequence = regs.semaphore_sequence; // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of // CoreTiming - block.timestamp = system.CoreTiming().GetTicks(); + block.timestamp = GetTicks(); memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(),
&block, sizeof(block)); } else { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index b648317bb..07727210c 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -192,6 +192,8 @@ public: bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + u64 GetTicks() const; + std::unique_lock<std::mutex> LockSync() { return std::unique_lock{sync_mutex}; } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 08dc96bb3..882e2d9c7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -86,7 +86,7 @@ struct CommandDataContainer { struct SynchState final { std::atomic_bool is_running{true}; - using CommandQueue = Common::SPSCQueue<CommandDataContainer>; + using CommandQueue = Common::MPSCQueue<CommandDataContainer>; CommandQueue queue; u64 last_fence{}; std::atomic<u64> signaled_fence{}; diff --git a/src/video_core/guest_driver.cpp b/src/video_core/guest_driver.cpp new file mode 100644 index 000000000..6adef459e --- /dev/null +++ b/src/video_core/guest_driver.cpp @@ -0,0 +1,36 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <limits> + +#include "video_core/guest_driver.h" + +namespace VideoCore { + +void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets) { + if (texture_handler_size_deduced) { + return; + } + const std::size_t size = bound_offsets.size(); + if (size < 2) { + return; + } + std::sort(bound_offsets.begin(), bound_offsets.end(), std::less{}); + u32 min_val = std::numeric_limits<u32>::max(); + for (std::size_t i = 1; i < size; ++i) { + if (bound_offsets[i] == bound_offsets[i - 1]) { + continue; + } + const u32 new_min = bound_offsets[i] - bound_offsets[i - 1]; + min_val = std::min(min_val, new_min); + } + if (min_val > 2) { + return; + } + texture_handler_size_deduced = true; + texture_handler_size = min_texture_handler_size * min_val; +} + +} // namespace VideoCore diff --git a/src/video_core/guest_driver.h b/src/video_core/guest_driver.h new file mode 100644 index 000000000..fc1917347 --- /dev/null +++ b/src/video_core/guest_driver.h @@ -0,0 +1,41 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include "common/common_types.h" + +namespace VideoCore { + +/** + * The GuestDriverProfile class is used to learn about the guest GPU driver's behavior and + * collect information necessary for HLE methods that are impossible to avoid, such as shader + * tracking, since they are instances of the Entscheidungsproblem (i.e. undecidable). + */ +class GuestDriverProfile { +public: + void DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets); + + u32 GetTextureHandlerSize() const { + return texture_handler_size; + } + + bool TextureHandlerSizeKnown() const { + return texture_handler_size_deduced; + } + +private: + // Minimum size of texture handler any driver can use. + static constexpr u32 min_texture_handler_size = 4; + // This matches the Vulkan and OpenGL standards, but Nvidia GPUs can easily + // use 4 bytes instead. Thus, certain drivers may shrink the size.
+ static constexpr u32 default_texture_handler_size = 8; + + u32 texture_handler_size = default_texture_handler_size; + bool texture_handler_size_deduced = false; +}; + +} // namespace VideoCore diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 11848fbce..f5d33f27a 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -9,6 +9,7 @@ #include "core/hle/kernel/process.h" #include "core/hle/kernel/vm_manager.h" #include "core/memory.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" @@ -84,7 +85,9 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { const auto cpu_addr = GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); - rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size); + // Flush and invalidate through the GPU interface, to be asynchronous if possible. + system.GPU().FlushAndInvalidateRegion(cache_addr, aligned_size); + UnmapRange(gpu_addr, aligned_size); ASSERT(system.CurrentProcess() ->VMManager() @@ -242,6 +245,8 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { const u8* src_ptr{page_table.pointers[page_index] + page_offset}; + // Flush must happen on the rasterizer interface, such that memory is always synchronous + // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu. rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); std::memcpy(dest_buffer, src_ptr, copy_amount); break; @@ -292,6 +297,8 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { u8* dest_ptr{page_table.pointers[page_index] + page_offset}; + // Invalidate must happen on the rasterizer interface, such that memory is always + // synchronous when it is written (even when in asynchronous GPU mode). rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount); std::memcpy(dest_ptr, src_buffer, copy_amount); break; @@ -339,6 +346,8 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std:: switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { + // Flush must happen on the rasterizer interface, such that memory is always synchronous + // when it is copied (even when in asynchronous GPU mode). const u8* src_ptr{page_table.pointers[page_index] + page_offset}; rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); WriteBlock(dest_addr, src_ptr, copy_amount); diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h new file mode 100644 index 000000000..e66054ed0 --- /dev/null +++ b/src/video_core/query_cache.h @@ -0,0 +1,359 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <algorithm> +#include <array> +#include <cstring> +#include <iterator> +#include <memory> +#include <mutex> +#include <optional> +#include <unordered_map> +#include <vector> + +#include "common/assert.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class QueryCache, class HostCounter> +class CounterStreamBase { +public: + explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type) + : cache{cache}, type{type} {} + + /// Updates the state of the stream, enabling or disabling as needed. + void Update(bool enabled) { + if (enabled) { + Enable(); + } else { + Disable(); + } + } + + /// Resets the stream to zero. It doesn't disable the query after resetting. + void Reset() { + if (current) { + current->EndQuery(); + + // Immediately start a new query to avoid disabling its state. + current = cache.Counter(nullptr, type); + } + last = nullptr; + } + + /// Returns the current counter, slicing it as needed so the stream continues on a new counter. + std::shared_ptr<HostCounter> Current() { + if (!current) { + return nullptr; + } + current->EndQuery(); + last = std::move(current); + current = cache.Counter(last, type); + return last; + } + + /// Returns true when the counter stream is enabled. + bool IsEnabled() const { + return current != nullptr; + } + +private: + /// Enables the stream. + void Enable() { + if (current) { + return; + } + current = cache.Counter(last, type); + } + + /// Disables the stream. + void Disable() { + if (current) { + current->EndQuery(); + } + last = std::exchange(current, nullptr); + } + + QueryCache& cache; + const VideoCore::QueryType type; + + std::shared_ptr<HostCounter> current; + std::shared_ptr<HostCounter> last; +}; + +template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter, + class QueryPool> +class QueryCacheBase { +public: + explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer) + : system{system}, rasterizer{rasterizer}, streams{{CounterStream{ + static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} {} + + void InvalidateRegion(CacheAddr addr, std::size_t size) { + std::unique_lock lock{mutex}; + FlushAndRemoveRegion(addr, size); + } + + void FlushRegion(CacheAddr addr, std::size_t size) { + std::unique_lock lock{mutex}; + FlushAndRemoveRegion(addr, size); + } + + /** + * Records a query in GPU mapped memory, potentially marked with a timestamp. + * @param gpu_addr GPU address to flush to when the mapped memory is read. + * @param type Query type, e.g. SamplesPassed. + * @param timestamp Timestamp; when empty, the flushed query is assumed to be short. + */ + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) { + std::unique_lock lock{mutex}; + auto& memory_manager = system.GPU().MemoryManager(); + const auto host_ptr = memory_manager.GetPointer(gpu_addr); + + CachedQuery* query = TryGet(ToCacheAddr(host_ptr)); + if (!query) { + const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + ASSERT_OR_EXECUTE(cpu_addr, return;); + + query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); + } + + query->BindCounter(Stream(type).Current(), timestamp); + } + + /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch.
+ void UpdateCounters() { + std::unique_lock lock{mutex}; + const auto& regs = system.GPU().Maxwell3D().regs; + Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable); + } + + /// Resets a counter to zero. It doesn't disable the query after resetting. + void ResetCounter(VideoCore::QueryType type) { + std::unique_lock lock{mutex}; + Stream(type).Reset(); + } + + /// Disables all active streams. Expected to be called at the end of a command buffer. + void DisableStreams() { + std::unique_lock lock{mutex}; + for (auto& stream : streams) { + stream.Update(false); + } + } + + /// Returns a new host counter. + std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type) { + return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency), + type); + } + + /// Returns the counter stream of the specified type. + CounterStream& Stream(VideoCore::QueryType type) { + return streams[static_cast<std::size_t>(type)]; + } + + /// Returns the counter stream of the specified type. + const CounterStream& Stream(VideoCore::QueryType type) const { + return streams[static_cast<std::size_t>(type)]; + } + +protected: + std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; + +private: + /// Flushes a memory range to guest memory and removes it from the cache. + void FlushAndRemoveRegion(CacheAddr addr, std::size_t size) { + const u64 addr_begin = static_cast<u64>(addr); + const u64 addr_end = addr_begin + static_cast<u64>(size); + const auto in_range = [addr_begin, addr_end](CachedQuery& query) { + const u64 cache_begin = query.GetCacheAddr(); + const u64 cache_end = cache_begin + query.SizeInBytes(); + return cache_begin < addr_end && addr_begin < cache_end; + }; + + const u64 page_end = addr_end >> PAGE_SHIFT; + for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { + const auto& it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + continue; + } + auto& contents = it->second; + for (auto& query : contents) { + if (!in_range(query)) { + continue; + } + rasterizer.UpdatePagesCachedCount(query.CpuAddr(), query.SizeInBytes(), -1); + query.Flush(); + } + contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range), + std::end(contents)); + } + } + + /// Registers the passed parameters as cached and returns a pointer to the stored cached query. + CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) { + rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1); + const u64 page = static_cast<u64>(ToCacheAddr(host_ptr)) >> PAGE_SHIFT; + return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr, + host_ptr); + } + + /// Tries to get a cached query. Returns nullptr on failure. + CachedQuery* TryGet(CacheAddr addr) { + const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT; + const auto it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + return nullptr; + } + auto& contents = it->second; + const auto found = + std::find_if(std::begin(contents), std::end(contents), + [addr](auto& query) { return query.GetCacheAddr() == addr; }); + return found != std::end(contents) ?
&*found : nullptr; + } + + static constexpr std::uintptr_t PAGE_SIZE = 4096; + static constexpr unsigned PAGE_SHIFT = 12; + + Core::System& system; + VideoCore::RasterizerInterface& rasterizer; + + std::recursive_mutex mutex; + + std::unordered_map<u64, std::vector<CachedQuery>> cached_queries; + + std::array<CounterStream, VideoCore::NumQueryTypes> streams; +}; + +template <class QueryCache, class HostCounter> +class HostCounterBase { +public: + explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_) + : dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} { + // Avoid nesting too many dependencies so that deleting them does not overflow the stack. + constexpr u64 depth_threshold = 96; + if (depth > depth_threshold) { + depth = 0; + base_result = dependency->Query(); + dependency = nullptr; + } + } + virtual ~HostCounterBase() = default; + + /// Returns the current value of the query. + u64 Query() { + if (result) { + return *result; + } + + u64 value = BlockingQuery() + base_result; + if (dependency) { + value += dependency->Query(); + dependency = nullptr; + } + + result = value; + return *result; + } + + /// Returns true when flushing this query will potentially wait. + bool WaitPending() const noexcept { + return result.has_value(); + } + + u64 Depth() const noexcept { + return depth; + } + +protected: + /// Returns the value of the query from the backend API, blocking as needed. + virtual u64 BlockingQuery() const = 0; + +private: + std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value. + std::optional<u64> result; ///< Filled with the already returned value. + u64 depth; ///< Number of nested dependencies. + u64 base_result = 0; ///< Accumulated value of already-resolved dependencies. +}; + +template <class HostCounter> +class CachedQueryBase { +public: + explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr) + : cpu_addr{cpu_addr}, host_ptr{host_ptr} {} + virtual ~CachedQueryBase() = default; + + CachedQueryBase(CachedQueryBase&&) noexcept = default; + CachedQueryBase(const CachedQueryBase&) = delete; + + CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default; + CachedQueryBase& operator=(const CachedQueryBase&) = delete; + + /// Flushes the query to guest memory. + virtual void Flush() { + // When counter is nullptr it means that it's just been reset. We are supposed to write a + // zero in these cases. + const u64 value = counter ? counter->Query() : 0; + std::memcpy(host_ptr, &value, sizeof(u64)); + + if (timestamp) { + std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64)); + } + } + + /// Binds a counter to this query. + void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) { + if (counter) { + // If there's an old counter set it means the query is being rewritten by the game. + // To avoid losing the data forever, flush here. + Flush(); + } + counter = std::move(counter_); + timestamp = timestamp_; + } + + VAddr CpuAddr() const noexcept { + return cpu_addr; + } + + CacheAddr GetCacheAddr() const noexcept { + return ToCacheAddr(host_ptr); + } + + u64 SizeInBytes() const noexcept { + return SizeInBytes(timestamp.has_value()); + } + + static constexpr u64 SizeInBytes(bool with_timestamp) noexcept { + return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE; + } + +protected: + /// Returns true when querying the counter may potentially block.
+ bool WaitPending() const noexcept { + return counter && counter->WaitPending(); + } + +private: + static constexpr std::size_t SMALL_QUERY_SIZE = 8; // Query size without timestamp. + static constexpr std::size_t LARGE_QUERY_SIZE = 16; // Query size with timestamp. + static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query. + + VAddr cpu_addr; ///< Guest CPU address. + u8* host_ptr; ///< Writable host pointer. + std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree. + std::optional<u64> timestamp; ///< Timestamp to flush to guest memory. +}; + +} // namespace VideoCommon diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 5b0eca9e2..f18eaf4bc 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -6,9 +6,11 @@ #include <atomic> #include <functional> +#include <optional> #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" +#include "video_core/guest_driver.h" namespace Tegra { class MemoryManager; @@ -16,6 +18,11 @@ class MemoryManager; namespace VideoCore { +enum class QueryType { + SamplesPassed, +}; +constexpr std::size_t NumQueryTypes = 1; + enum class LoadCallbackStage { Prepare, Decompile, @@ -28,11 +35,8 @@ class RasterizerInterface { public: virtual ~RasterizerInterface() {} - /// Draw the current batch of vertex arrays - virtual bool DrawBatch(bool is_indexed) = 0; - - /// Draw the current batch of multiple instances of vertex arrays - virtual bool DrawMultiBatch(bool is_indexed) = 0; + /// Dispatches a draw invocation + virtual void Draw(bool is_indexed, bool is_instanced) = 0; /// Clear the current framebuffer virtual void Clear() = 0; @@ -40,6 +44,12 @@ public: /// Dispatches a compute shader invocation virtual void DispatchCompute(GPUVAddr code_addr) = 0; + /// Resets the counter of a query + virtual void ResetCounter(QueryType type) = 0; + + /// Records a GPU query and caches it + virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; + /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; @@ -78,5 +88,18 @@ public: /// Initialize disk cached resources for the game being emulated virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, const DiskResourceLoadCallback& callback = {}) {} + + /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. + GuestDriverProfile& AccessGuestDriverProfile() { + return guest_driver_profile; + } + + /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. + const GuestDriverProfile& AccessGuestDriverProfile() const { + return guest_driver_profile; + } + +private: + GuestDriverProfile guest_driver_profile{}; }; } // namespace VideoCore diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp new file mode 100644 index 000000000..f12e9f55f --- /dev/null +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -0,0 +1,120 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <cstring> +#include <memory> +#include <unordered_map> +#include <utility> +#include <vector> + +#include <glad/glad.h> + +#include "common/assert.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_query_cache.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" + +namespace OpenGL { + +namespace { + +constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED}; + +constexpr GLenum GetTarget(VideoCore::QueryType type) { + return QueryTargets[static_cast<std::size_t>(type)]; +} + +} // Anonymous namespace + +QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer) + : VideoCommon::QueryCacheBase< + QueryCache, CachedQuery, CounterStream, HostCounter, + std::vector<OGLQuery>>{system, + static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)}, + gl_rasterizer{gl_rasterizer} {} + +QueryCache::~QueryCache() = default; + +OGLQuery QueryCache::AllocateQuery(VideoCore::QueryType type) { + auto& reserve = query_pools[static_cast<std::size_t>(type)]; + OGLQuery query; + if (reserve.empty()) { + query.Create(GetTarget(type)); + return query; + } + + query = std::move(reserve.back()); + reserve.pop_back(); + return query; +} + +void QueryCache::Reserve(VideoCore::QueryType type, OGLQuery&& query) { + query_pools[static_cast<std::size_t>(type)].push_back(std::move(query)); +} + +bool QueryCache::AnyCommandQueued() const noexcept { + return gl_rasterizer.AnyCommandQueued(); +} + +HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type) + : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache}, + type{type}, query{cache.AllocateQuery(type)} { + glBeginQuery(GetTarget(type), query.handle); +} + +HostCounter::~HostCounter() { + cache.Reserve(type, std::move(query)); +} + +void HostCounter::EndQuery() { + if (!cache.AnyCommandQueued()) { + // There is a chance a query is waited on without any queued commands (glDraw, glClear, + // glDispatch). Not having any of these causes a deadlock, so insert a flush into the + // OpenGL command stream; glFlush is considered a command, making the wait safe. + glFlush(); + } + glEndQuery(GetTarget(type)); +} + +u64 HostCounter::BlockingQuery() const { + GLint64 value; + glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value); + return static_cast<u64>(value); +} + +CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) + : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} + +CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept + : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} + +CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { + VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs)); + cache = rhs.cache; + type = rhs.type; + return *this; +} + +void CachedQuery::Flush() { + // Waiting for a query while another query of the same target is enabled locks Nvidia's driver. + // To avoid this, disable and re-enable the stream while keeping the dependency chain. + // We only have to do this when there are pending waits to be done.
+ auto& stream = cache->Stream(type); + const bool slice_counter = WaitPending() && stream.IsEnabled(); + if (slice_counter) { + stream.Update(false); + } + + VideoCommon::CachedQueryBase<HostCounter>::Flush(); + + if (slice_counter) { + stream.Update(true); + } +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h new file mode 100644 index 000000000..d8e7052a1 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -0,0 +1,78 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <memory> +#include <vector> + +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" + +namespace Core { +class System; +} + +namespace OpenGL { + +class CachedQuery; +class HostCounter; +class QueryCache; +class RasterizerOpenGL; + +using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; + +class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, + HostCounter, std::vector<OGLQuery>> { +public: + explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); + ~QueryCache(); + + OGLQuery AllocateQuery(VideoCore::QueryType type); + + void Reserve(VideoCore::QueryType type, OGLQuery&& query); + + bool AnyCommandQueued() const noexcept; + +private: + RasterizerOpenGL& gl_rasterizer; +}; + +class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { +public: + explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type); + ~HostCounter(); + + void EndQuery(); + +private: + u64 BlockingQuery() const override; + + QueryCache& cache; + const VideoCore::QueryType type; + OGLQuery query; +}; + +class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { +public: + explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, + u8* host_ptr); + CachedQuery(CachedQuery&& rhs) noexcept; + CachedQuery(const CachedQuery&) = delete; + + CachedQuery& operator=(CachedQuery&& rhs) noexcept; + CachedQuery& operator=(const CachedQuery&) = delete; + + void Flush() override; + +private: + QueryCache* cache; + VideoCore::QueryType type; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index c428f06e4..e1965fb21 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -25,6 +25,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -55,16 +56,20 @@ namespace { template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - Tegra::Engines::ShaderType shader_type) { + Tegra::Engines::ShaderType shader_type, + std::size_t index = 0) { if (entry.IsBindless()) { const Tegra::Texture::TextureHandle tex_handle = engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset()); return 
engine.GetTextureInfo(tex_handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); + const u32 offset = + entry.GetOffset() + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { - return engine.GetStageTexture(shader_type, entry.GetOffset()); + return engine.GetStageTexture(shader_type, offset); } else { - return engine.GetTexture(entry.GetOffset()); + return engine.GetTexture(offset); } } @@ -88,8 +93,8 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, ScreenInfo& info) : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device}, - shader_cache{*this, system, emu_window, device}, system{system}, screen_info{info}, - buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { + shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, + screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); @@ -244,9 +249,6 @@ void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { } GLintptr RasterizerOpenGL::SetupIndexBuffer() { - if (accelerate_draw != AccelDraw::Indexed) { - return 0; - } MICROPROFILE_SCOPE(OpenGL_Index); const auto& regs = system.GPU().Maxwell3D().regs; const std::size_t size = CalculateIndexBufferSize(); @@ -540,10 +542,16 @@ void RasterizerOpenGL::Clear() { } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); } + + ++num_queued_commands; } -void RasterizerOpenGL::DrawPrelude() { +void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { + MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + query_cache.UpdateCounters(); SyncRasterizeEnable(state); SyncColorMask(); @@ -563,9 +571,6 @@ void RasterizerOpenGL::DrawPrelude() { buffer_cache.Acquire(); - // Draw the vertex batch - const bool is_indexed = accelerate_draw == AccelDraw::Indexed; - std::size_t buffer_size = CalculateVertexArraysSize(); // Add space for index buffer @@ -592,7 +597,11 @@ void RasterizerOpenGL::DrawPrelude() { // Upload vertex and index data. SetupVertexBuffer(vao); SetupVertexInstances(vao); - index_buffer_offset = SetupIndexBuffer(); + + GLintptr index_buffer_offset; + if (is_indexed) { + index_buffer_offset = SetupIndexBuffer(); + } // Prepare packed bindings. bind_ubo_pushbuffer.Setup(); @@ -608,7 +617,7 @@ void RasterizerOpenGL::DrawPrelude() { // Setup shaders and their used resources. texture_cache.GuardSamplers(true); - const auto primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); SetupShaders(primitive_mode); texture_cache.GuardSamplers(false); @@ -626,6 +635,7 @@ void RasterizerOpenGL::DrawPrelude() { // As all cached buffers are invalidated, we need to recheck their state. 
gpu.dirty.ResetVertexArrays(); } + gpu.dirty.memory_general = false; shader_program_manager->ApplyTo(state); state.Apply(); @@ -633,107 +643,46 @@ void RasterizerOpenGL::DrawPrelude() { if (texture_cache.TextureBarrier()) { glTextureBarrier(); } -} -struct DrawParams { - bool is_indexed{}; - bool is_instanced{}; - GLenum primitive_mode{}; - GLint count{}; - GLint base_vertex{}; - - // Indexed settings - GLenum index_format{}; - GLintptr index_buffer_offset{}; - - // Instanced setting - GLint num_instances{}; - GLint base_instance{}; - - void DispatchDraw() { - if (is_indexed) { - const auto index_buffer_ptr = reinterpret_cast<const void*>(index_buffer_offset); - if (is_instanced) { - glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, count, index_format, - index_buffer_ptr, num_instances, - base_vertex, base_instance); - } else { - glDrawElementsBaseVertex(primitive_mode, count, index_format, index_buffer_ptr, - base_vertex); - } + ++num_queued_commands; + + const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); + const GLsizei num_instances = + static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); + if (is_indexed) { + const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); + const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); + const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); + const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { + glDrawElements(primitive_mode, num_vertices, format, offset); + } else if (num_instances == 1 && base_instance == 0) { + glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex); + } else if (base_vertex == 0 && base_instance == 0) { + glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances); + } else if (base_vertex == 0) { + glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset, + num_instances, base_instance); + } else if (base_instance == 0) { + glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset, + num_instances, base_vertex); } else { - if (is_instanced) { - glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, count, num_instances, - base_instance); - } else { - glDrawArrays(primitive_mode, base_vertex, count); - } + glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format, + offset, num_instances, base_vertex, + base_instance); } - } -}; - -bool RasterizerOpenGL::DrawBatch(bool is_indexed) { - accelerate_draw = is_indexed ? 
AccelDraw::Indexed : AccelDraw::Arrays; - - MICROPROFILE_SCOPE(OpenGL_Drawing); - - DrawPrelude(); - - auto& maxwell3d = system.GPU().Maxwell3D(); - const auto& regs = maxwell3d.regs; - const auto current_instance = maxwell3d.state.current_instance; - DrawParams draw_call{}; - draw_call.is_indexed = is_indexed; - draw_call.num_instances = static_cast<GLint>(1); - draw_call.base_instance = static_cast<GLint>(current_instance); - draw_call.is_instanced = current_instance > 0; - draw_call.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); - if (draw_call.is_indexed) { - draw_call.count = static_cast<GLint>(regs.index_array.count); - draw_call.base_vertex = static_cast<GLint>(regs.vb_element_base); - draw_call.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); - draw_call.index_buffer_offset = index_buffer_offset; } else { - draw_call.count = static_cast<GLint>(regs.vertex_buffer.count); - draw_call.base_vertex = static_cast<GLint>(regs.vertex_buffer.first); - } - draw_call.DispatchDraw(); - - maxwell3d.dirty.memory_general = false; - accelerate_draw = AccelDraw::Disabled; - return true; -} - -bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) { - accelerate_draw = is_indexed ? AccelDraw::Indexed : AccelDraw::Arrays; - - MICROPROFILE_SCOPE(OpenGL_Drawing); - - DrawPrelude(); - - auto& maxwell3d = system.GPU().Maxwell3D(); - const auto& regs = maxwell3d.regs; - const auto& draw_setup = maxwell3d.mme_draw; - DrawParams draw_call{}; - draw_call.is_indexed = is_indexed; - draw_call.num_instances = static_cast<GLint>(draw_setup.instance_count); - draw_call.base_instance = static_cast<GLint>(regs.vb_base_instance); - draw_call.is_instanced = draw_setup.instance_count > 1; - draw_call.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology); - if (draw_call.is_indexed) { - draw_call.count = static_cast<GLint>(regs.index_array.count); - draw_call.base_vertex = static_cast<GLint>(regs.vb_element_base); - draw_call.index_format = MaxwellToGL::IndexFormat(regs.index_array.format); - draw_call.index_buffer_offset = index_buffer_offset; - } else { - draw_call.count = static_cast<GLint>(regs.vertex_buffer.count); - draw_call.base_vertex = static_cast<GLint>(regs.vertex_buffer.first); + const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); + const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); + if (num_instances == 1 && base_instance == 0) { + glDrawArrays(primitive_mode, base_vertex, num_vertices); + } else if (base_instance == 0) { + glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances); + } else { + glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, + num_instances, base_instance); + } } - draw_call.DispatchDraw(); - - maxwell3d.dirty.memory_general = false; - accelerate_draw = AccelDraw::Disabled; - return true; } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { @@ -776,6 +725,16 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { state.ApplyProgramPipeline(); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); + ++num_queued_commands; +} + +void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { + query_cache.ResetCounter(type); +} + +void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional<u64> timestamp) { + query_cache.Query(gpu_addr, type, timestamp); } void RasterizerOpenGL::FlushAll() {} @@ -787,6 +746,7 @@ void 
RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { } texture_cache.FlushRegion(addr, size); buffer_cache.FlushRegion(addr, size); + query_cache.FlushRegion(addr, size); } void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { @@ -797,6 +757,7 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { texture_cache.InvalidateRegion(addr, size); shader_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); + query_cache.InvalidateRegion(addr, size); } void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { @@ -807,10 +768,18 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { } void RasterizerOpenGL::FlushCommands() { + // Only flush when we have commands queued to OpenGL. + if (num_queued_commands == 0) { + return; + } + num_queued_commands = 0; glFlush(); } void RasterizerOpenGL::TickFrame() { + // Ticking a frame means that buffers will be swapped, calling glFlush implicitly. + num_queued_commands = 0; + buffer_cache.TickFrame(); } @@ -942,8 +911,15 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& u32 binding = device.GetBaseBindings(stage_index).sampler; for (const auto& entry : shader->GetShaderEntries().samplers) { const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); - const auto texture = GetTextureInfo(maxwell3d, entry, shader_type); - SetupTexture(binding++, texture, entry); + if (!entry.IsIndexed()) { + const auto texture = GetTextureInfo(maxwell3d, entry, shader_type); + SetupTexture(binding++, texture, entry); + } else { + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); + SetupTexture(binding++, texture, entry); + } + } } } @@ -952,8 +928,17 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : kernel->GetShaderEntries().samplers) { - const auto texture = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute); - SetupTexture(binding++, texture, entry); + if (!entry.IsIndexed()) { + const auto texture = + GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute); + SetupTexture(binding++, texture, entry); + } else { + for (std::size_t i = 0; i < entry.Size(); ++i) { + const auto texture = + GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute, i); + SetupTexture(binding++, texture, entry); + } + } } } @@ -1273,6 +1258,7 @@ void RasterizerOpenGL::SyncPointState() { // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). 
state.point.program_control = regs.vp_point_size.enable != 0; + state.point.sprite = regs.point_sprite_enable != 0; state.point.size = std::max(1.0f, regs.point_size); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 6a27cf497..68abe9a21 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -24,6 +24,7 @@ #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" +#include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" @@ -57,10 +58,11 @@ public: ScreenInfo& info); ~RasterizerOpenGL() override; - bool DrawBatch(bool is_indexed) override; - bool DrawMultiBatch(bool is_indexed) override; + void Draw(bool is_indexed, bool is_instanced) override; void Clear() override; void DispatchCompute(GPUVAddr code_addr) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -75,6 +77,11 @@ public: void LoadDiskResources(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; + /// Returns true when there are commands queued to the OpenGL server. + bool AnyCommandQueued() const { + return num_queued_commands > 0; + } + private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); @@ -102,9 +109,6 @@ private: void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size); - /// Syncs all the state, shaders, render targets and textures setting before a draw call. - void DrawPrelude(); - /// Configures the current textures to use for the draw command. 
void SetupDrawTextures(std::size_t stage_index, const Shader& shader); @@ -180,10 +184,23 @@ private: /// Syncs the alpha test state to match the guest state void SyncAlphaTest(); - /// Check for extension that are not strictly required - /// but are needed for correct emulation + /// Check for extensions that are not strictly required but are needed for correct emulation void CheckExtensions(); + std::size_t CalculateVertexArraysSize() const; + + std::size_t CalculateIndexBufferSize() const; + + /// Updates and returns a vertex array object representing current vertex format + GLuint SetupVertexFormat(); + + void SetupVertexBuffer(GLuint vao); + void SetupVertexInstances(GLuint vao); + + GLintptr SetupIndexBuffer(); + + void SetupShaders(GLenum primitive_mode); + const Device device; OpenGLState state; @@ -191,6 +208,7 @@ private: ShaderCacheOpenGL shader_cache; SamplerCacheOpenGL sampler_cache; FramebufferCacheOpenGL framebuffer_cache; + QueryCache query_cache; Core::System& system; ScreenInfo& screen_info; @@ -208,24 +226,8 @@ private: BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; - std::size_t CalculateVertexArraysSize() const; - - std::size_t CalculateIndexBufferSize() const; - - /// Updates and returns a vertex array object representing current vertex format - GLuint SetupVertexFormat(); - - void SetupVertexBuffer(GLuint vao); - void SetupVertexInstances(GLuint vao); - - GLintptr SetupIndexBuffer(); - - GLintptr index_buffer_offset; - - void SetupShaders(GLenum primitive_mode); - - enum class AccelDraw { Disabled, Arrays, Indexed }; - AccelDraw accelerate_draw = AccelDraw::Disabled; + /// Number of commands queued to the OpenGL driver. Reset on flush.
+ std::size_t num_queued_commands = 0; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 5c96c1d46..f0ddfb276 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -207,4 +207,21 @@ void OGLFramebuffer::Release() { handle = 0; } +void OGLQuery::Create(GLenum target) { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glCreateQueries(target, 1, &handle); +} + +void OGLQuery::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteQueries(1, &handle); + handle = 0; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 3a85a1d4c..514d1d165 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -266,4 +266,29 @@ public: GLuint handle = 0; }; +class OGLQuery : private NonCopyable { +public: + OGLQuery() = default; + + OGLQuery(OGLQuery&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLQuery() { + Release(); + } + + OGLQuery& operator=(OGLQuery&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + + /// Creates a new internal OpenGL resource and stores the handle + void Create(GLenum target); + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 3c5bdd377..489eb143c 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -214,6 +214,7 @@ std::unique_ptr<ConstBufferLocker> MakeLocker(Core::System& system, ShaderType s } void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) { + locker.SetBoundBuffer(usage.bound_buffer); for (const auto& key : usage.keys) { const auto [buffer, offset] = key.first; locker.InsertKey(buffer, offset, key.second); @@ -418,7 +419,8 @@ bool CachedShader::EnsureValidLockerVariant() { ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant, const ConstBufferLocker& locker) const { - return ShaderDiskCacheUsage{unique_identifier, variant, locker.GetKeys(), + return ShaderDiskCacheUsage{unique_identifier, variant, + locker.GetBoundBuffer(), locker.GetKeys(), locker.GetBoundSamplers(), locker.GetBindlessSamplers()}; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 2996aaf08..4735000b5 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -391,6 +391,7 @@ public: DeclareVertex(); DeclareGeometry(); DeclareRegisters(); + DeclareCustomVariables(); DeclarePredicates(); DeclareLocalMemory(); DeclareInternalFlags(); @@ -503,6 +504,16 @@ private: } } + void DeclareCustomVariables() { + const u32 num_custom_variables = ir.GetNumCustomVariables(); + for (u32 i = 0; i < num_custom_variables; ++i) { + code.AddLine("float {} = 0.0f;", GetCustomVariable(i)); + } + if (num_custom_variables > 0) { + code.AddNewLine(); + } + } + void DeclarePredicates() { const auto& predicates = ir.GetPredicates(); for (const auto pred : predicates) { @@ -655,7 +666,8 @@ private: u32 
binding = device.GetBaseBindings(stage).sampler; for (const auto& sampler : ir.GetSamplers()) { const std::string name = GetSampler(sampler); - const std::string description = fmt::format("layout (binding = {}) uniform", binding++); + const std::string description = fmt::format("layout (binding = {}) uniform", binding); + binding += sampler.IsIndexed() ? sampler.Size() : 1; std::string sampler_type = [&]() { if (sampler.IsBuffer()) { @@ -682,7 +694,11 @@ private: sampler_type += "Shadow"; } - code.AddLine("{} {} {};", description, sampler_type, name); + if (!sampler.IsIndexed()) { + code.AddLine("{} {} {};", description, sampler_type, name); + } else { + code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.Size()); + } } if (!ir.GetSamplers().empty()) { code.AddNewLine(); @@ -775,6 +791,11 @@ private: return {GetRegister(index), Type::Float}; } + if (const auto cv = std::get_if<CustomVarNode>(&*node)) { + const u32 index = cv->GetIndex(); + return {GetCustomVariable(index), Type::Float}; + } + if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { const u32 value = immediate->GetValue(); if (value < 10) { @@ -1019,7 +1040,6 @@ private: } return {{"gl_ViewportIndex", Type::Int}}; case 3: - UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader"); return {{"gl_PointSize", Type::Float}}; } return {}; @@ -1099,7 +1119,11 @@ private: } else if (!meta->ptp.empty()) { expr += "Offsets"; } - expr += '(' + GetSampler(meta->sampler) + ", "; + if (!meta->sampler.IsIndexed()) { + expr += '(' + GetSampler(meta->sampler) + ", "; + } else { + expr += '(' + GetSampler(meta->sampler) + '[' + Visit(meta->index).AsUint() + "], "; + } expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow && !separate_dc ? 
1 : 0) - 1); expr += '('; @@ -1311,6 +1335,8 @@ private: const std::string final_offset = fmt::format("({} - {}) >> 2", real, base); target = {fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset), Type::Uint}; + } else if (const auto cv = std::get_if<CustomVarNode>(&*dest)) { + target = {GetCustomVariable(cv->GetIndex()), Type::Float}; } else { UNREACHABLE_MSG("Assign called without a proper target"); } @@ -1858,10 +1884,7 @@ private: template <const std::string_view& opname, Type type> Expression Atomic(Operation operation) { - ASSERT(stage == ShaderType::Compute); - auto& smem = std::get<SmemNode>(*operation[0]); - - return {fmt::format("atomic{}(smem[{} >> 2], {})", opname, Visit(smem.GetAddress()).AsInt(), + return {fmt::format("atomic{}({}, {})", opname, Visit(operation[0]).GetCode(), Visit(operation[1]).As(type)), type}; } @@ -2241,6 +2264,10 @@ private: return GetDeclarationWithSuffix(index, "gpr"); } + std::string GetCustomVariable(u32 index) const { + return GetDeclarationWithSuffix(index, "custom_var"); + } + std::string GetPredicate(Tegra::Shader::Pred pred) const { return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred"); } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index cf874a09a..1fc204f6f 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -53,7 +53,7 @@ struct BindlessSamplerKey { Tegra::Engines::SamplerDescriptor sampler{}; }; -constexpr u32 NativeVersion = 11; +constexpr u32 NativeVersion = 12; // Making sure sizes don't change by accident static_assert(sizeof(ProgramVariant) == 20); @@ -186,7 +186,8 @@ ShaderDiskCacheOpenGL::LoadTransferable() { u32 num_bound_samplers{}; u32 num_bindless_samplers{}; if (file.ReadArray(&usage.unique_identifier, 1) != 1 || - file.ReadArray(&usage.variant, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || + file.ReadArray(&usage.variant, 1) != 1 || + file.ReadArray(&usage.bound_buffer, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { LOG_ERROR(Render_OpenGL, error_loading); @@ -281,7 +282,9 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { u32 num_bindless_samplers{}; ShaderDiskCacheUsage usage; if (!LoadObjectFromPrecompiled(usage.unique_identifier) || - !LoadObjectFromPrecompiled(usage.variant) || !LoadObjectFromPrecompiled(num_keys) || + !LoadObjectFromPrecompiled(usage.variant) || + !LoadObjectFromPrecompiled(usage.bound_buffer) || + !LoadObjectFromPrecompiled(num_keys) || !LoadObjectFromPrecompiled(num_bound_samplers) || !LoadObjectFromPrecompiled(num_bindless_samplers)) { return {}; @@ -393,6 +396,7 @@ void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) { if (file.WriteObject(TransferableEntryKind::Usage) != 1 || file.WriteObject(usage.unique_identifier) != 1 || file.WriteObject(usage.variant) != 1 || + file.WriteObject(usage.bound_buffer) != 1 || file.WriteObject(static_cast<u32>(usage.keys.size())) != 1 || file.WriteObject(static_cast<u32>(usage.bound_samplers.size())) != 1 || file.WriteObject(static_cast<u32>(usage.bindless_samplers.size())) != 1) { @@ -447,7 +451,7 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p }; if (!SaveObjectToPrecompiled(usage.unique_identifier) || - !SaveObjectToPrecompiled(usage.variant) || + !SaveObjectToPrecompiled(usage.variant) ||
!SaveObjectToPrecompiled(usage.bound_buffer) || !SaveObjectToPrecompiled(static_cast<u32>(usage.keys.size())) || !SaveObjectToPrecompiled(static_cast<u32>(usage.bound_samplers.size())) || !SaveObjectToPrecompiled(static_cast<u32>(usage.bindless_samplers.size()))) { diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index 69a2fbdda..ef2371f6d 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -79,6 +79,7 @@ static_assert(std::is_trivially_copyable_v<ProgramVariant>); struct ShaderDiskCacheUsage { u64 unique_identifier{}; ProgramVariant variant; + u32 bound_buffer{}; VideoCommon::Shader::KeyMap keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index cc185e9e1..ab1f7983c 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -128,6 +128,7 @@ void OpenGLState::ApplyClipDistances() { void OpenGLState::ApplyPointSize() { Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control); + Enable(GL_POINT_SPRITE, cur_state.point.sprite, point.sprite); if (UpdateValue(cur_state.point.size, point.size)) { glPointSize(point.size); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 678e5cd89..4953eeda2 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -132,6 +132,7 @@ public: struct { bool program_control = false; // GL_PROGRAM_POINT_SIZE + bool sprite = false; // GL_POINT_SPRITE GLfloat size = 1.0f; // GL_POINT_SIZE } point; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index e95eb069e..d4b81cd87 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -176,6 +176,19 @@ GLint GetSwizzleSource(SwizzleSource source) { return GL_NONE; } +GLenum GetComponent(PixelFormat format, bool is_first) { + switch (format) { + case PixelFormat::Z24S8: + case PixelFormat::Z32FS8: + return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; + case PixelFormat::S8Z24: + return is_first ? 
GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; + default: + UNREACHABLE(); + return GL_DEPTH_COMPONENT; + } +} + void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { if (params.IsBuffer()) { return; @@ -184,7 +197,7 @@ void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTextureParameteri(texture, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, params.num_levels - 1); + glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, static_cast<GLint>(params.num_levels - 1)); if (params.num_levels == 1) { glTextureParameterf(texture, GL_TEXTURE_LOD_BIAS, 1000.0f); } @@ -416,11 +429,21 @@ void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_sou if (new_swizzle == swizzle) return; swizzle = new_swizzle; - const std::array<GLint, 4> gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), - GetSwizzleSource(z_source), - GetSwizzleSource(w_source)}; + const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), + GetSwizzleSource(z_source), GetSwizzleSource(w_source)}; const GLuint handle = GetTexture(); - glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + const PixelFormat format = surface.GetSurfaceParams().pixel_format; + switch (format) { + case PixelFormat::Z24S8: + case PixelFormat::Z32FS8: + case PixelFormat::S8Z24: + glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, + GetComponent(format, x_source == SwizzleSource::R)); + break; + default: + glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + break; + } } OGLTextureView CachedSurfaceView::CreateTextureView() const { @@ -529,8 +552,11 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Common::Rectangle<u32>& dst_rect = copy_config.dst_rect; const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; - glBlitFramebuffer(src_rect.left, src_rect.top, src_rect.right, src_rect.bottom, dst_rect.left, - dst_rect.top, dst_rect.right, dst_rect.bottom, buffers, + glBlitFramebuffer(static_cast<GLint>(src_rect.left), static_cast<GLint>(src_rect.top), + static_cast<GLint>(src_rect.right), static_cast<GLint>(src_rect.bottom), + static_cast<GLint>(dst_rect.left), static_cast<GLint>(dst_rect.top), + static_cast<GLint>(dst_rect.right), static_cast<GLint>(dst_rect.bottom), + buffers, is_linear && (buffers == GL_COLOR_BUFFER_BIT) ? 
GL_LINEAR : GL_NEAREST); } diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index ea4f35663..7ed505628 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -47,8 +47,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } case Maxwell::VertexAttribute::Type::SignedInt: @@ -72,8 +71,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } case Maxwell::VertexAttribute::Type::Float: @@ -89,13 +87,19 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + return {}; + } + case Maxwell::VertexAttribute::Type::UnsignedScaled: + switch (attrib.size) { + case Maxwell::VertexAttribute::Size::Size_8_8: + return GL_UNSIGNED_BYTE; + default: + LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); return {}; } default: - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - UNREACHABLE(); + LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); return {}; } } diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 331808113..5403c3ab7 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -164,7 +164,7 @@ struct FormatTuple { {vk::Format::eUndefined, {}}, // ASTC_2D_5X4 {vk::Format::eUndefined, {}}, // BGRA8_SRGB {vk::Format::eBc1RgbaSrgbBlock, {}}, // DXT1_SRGB - {vk::Format::eUndefined, {}}, // DXT23_SRGB + {vk::Format::eBc2SrgbBlock, {}}, // DXT23_SRGB {vk::Format::eBc3SrgbBlock, {}}, // DXT45_SRGB {vk::Format::eBc7SrgbBlock, {}}, // BC7U_SRGB {vk::Format::eR4G4B4A4UnormPack16, Attachable}, // R4G4B4A4U @@ -363,6 +363,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR8G8B8A8Uint; case Maxwell::VertexAttribute::Size::Size_32: return vk::Format::eR32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32_32_32: + return vk::Format::eR32G32B32A32Uint; default: break; } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp new file mode 100644 index 000000000..d5032b432 --- /dev/null +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -0,0 +1,265 @@ +// Copyright 2018 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <memory> +#include <optional> +#include <vector> + +#include <fmt/format.h> + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/telemetry.h" +#include "core/core.h" +#include "core/core_timing.h" +#include "core/frontend/emu_window.h" +#include "core/memory.h" +#include "core/perf_stats.h" +#include "core/settings.h" +#include "core/telemetry_session.h" +#include "video_core/gpu.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/renderer_vulkan.h" +#include "video_core/renderer_vulkan/vk_blit_screen.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_swapchain.h" + +namespace Vulkan { + +namespace { + +VkBool32 DebugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity_, + VkDebugUtilsMessageTypeFlagsEXT type, + const VkDebugUtilsMessengerCallbackDataEXT* data, + [[maybe_unused]] void* user_data) { + const vk::DebugUtilsMessageSeverityFlagBitsEXT severity{severity_}; + const char* message{data->pMessage}; + + if (severity & vk::DebugUtilsMessageSeverityFlagBitsEXT::eError) { + LOG_CRITICAL(Render_Vulkan, "{}", message); + } else if (severity & vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning) { + LOG_WARNING(Render_Vulkan, "{}", message); + } else if (severity & vk::DebugUtilsMessageSeverityFlagBitsEXT::eInfo) { + LOG_INFO(Render_Vulkan, "{}", message); + } else if (severity & vk::DebugUtilsMessageSeverityFlagBitsEXT::eVerbose) { + LOG_DEBUG(Render_Vulkan, "{}", message); + } + return VK_FALSE; +} + +std::string GetReadableVersion(u32 version) { + return fmt::format("{}.{}.{}", VK_VERSION_MAJOR(version), VK_VERSION_MINOR(version), + VK_VERSION_PATCH(version)); +} + +std::string GetDriverVersion(const VKDevice& device) { + // Extracted from + // https://github.com/SaschaWillems/vulkan.gpuinfo.org/blob/5dddea46ea1120b0df14eef8f15ff8e318e35462/functions.php#L308-L314 + const u32 version = device.GetDriverVersion(); + + if (device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) { + const u32 major = (version >> 22) & 0x3ff; + const u32 minor = (version >> 14) & 0x0ff; + const u32 secondary = (version >> 6) & 0x0ff; + const u32 tertiary = version & 0x003f; + return fmt::format("{}.{}.{}.{}", major, minor, secondary, tertiary); + } + if (device.GetDriverID() == vk::DriverIdKHR::eIntelProprietaryWindows) { + const u32 major = version >> 14; + const u32 minor = version & 0x3fff; + return fmt::format("{}.{}", major, minor); + } + + return GetReadableVersion(version); +} + +std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_extensions) { + std::sort(std::begin(available_extensions), std::end(available_extensions)); + + static constexpr std::size_t AverageExtensionSize = 64; + std::string separated_extensions; + separated_extensions.reserve(available_extensions.size() * AverageExtensionSize); + + const auto end = std::end(available_extensions); + for (auto extension = std::begin(available_extensions); extension != end; ++extension) { + if (const bool is_last = extension + 1 == end; is_last) { + separated_extensions += *extension; + } else { + separated_extensions += fmt::format("{},", *extension); + } + } + return separated_extensions; +} + +} // Anonymous namespace + 
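For reference, GetDriverVersion above undoes NVIDIA's 10/8/8/6 bit packing of driverVersion. A minimal standalone sketch of the same decode, fed a hypothetical packed value rather than a real device query:

// Decode an NVIDIA-packed driver version (10-bit major, 8-bit minor,
// 8-bit secondary, 6-bit tertiary). The input is fabricated for
// illustration; it is not read from any device.
#include <cstdint>
#include <cstdio>

int main() {
    const std::uint32_t version = (440u << 22) | (64u << 14) | (0u << 6) | 0u;
    const unsigned major = (version >> 22) & 0x3ffu;
    const unsigned minor = (version >> 14) & 0x0ffu;
    const unsigned secondary = (version >> 6) & 0x0ffu;
    const unsigned tertiary = version & 0x003fu;
    std::printf("%u.%u.%u.%u\n", major, minor, secondary, tertiary); // 440.64.0.0
    return 0;
}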
+RendererVulkan::RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system) + : RendererBase(window), system{system} {} + +RendererVulkan::~RendererVulkan() { + ShutDown(); +} + +void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + const auto& layout = render_window.GetFramebufferLayout(); + if (framebuffer && layout.width > 0 && layout.height > 0 && render_window.IsShown()) { + const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; + const bool use_accelerated = + rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); + const bool is_srgb = use_accelerated && screen_info.is_srgb; + if (swapchain->HasFramebufferChanged(layout) || swapchain->GetSrgbState() != is_srgb) { + swapchain->Create(layout.width, layout.height, is_srgb); + blit_screen->Recreate(); + } + + scheduler->WaitWorker(); + + swapchain->AcquireNextImage(); + const auto [fence, render_semaphore] = blit_screen->Draw(*framebuffer, use_accelerated); + + scheduler->Flush(false, render_semaphore); + + if (swapchain->Present(render_semaphore, fence)) { + blit_screen->Recreate(); + } + + render_window.SwapBuffers(); + rasterizer->TickFrame(); + } + + render_window.PollEvents(); +} + +bool RendererVulkan::Init() { + PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr{}; + render_window.RetrieveVulkanHandlers(&vkGetInstanceProcAddr, &instance, &surface); + const vk::DispatchLoaderDynamic dldi(instance, vkGetInstanceProcAddr); + + std::optional<vk::DebugUtilsMessengerEXT> callback; + if (Settings::values.renderer_debug && dldi.vkCreateDebugUtilsMessengerEXT) { + callback = CreateDebugCallback(dldi); + if (!callback) { + return false; + } + } + + if (!PickDevices(dldi)) { + if (callback) { + instance.destroy(*callback, nullptr, dldi); + } + return false; + } + debug_callback = UniqueDebugUtilsMessengerEXT( + *callback, vk::ObjectDestroy<vk::Instance, vk::DispatchLoaderDynamic>( + instance, nullptr, device->GetDispatchLoader())); + + Report(); + + memory_manager = std::make_unique<VKMemoryManager>(*device); + + resource_manager = std::make_unique<VKResourceManager>(*device); + + const auto& framebuffer = render_window.GetFramebufferLayout(); + swapchain = std::make_unique<VKSwapchain>(surface, *device); + swapchain->Create(framebuffer.width, framebuffer.height, false); + + scheduler = std::make_unique<VKScheduler>(*device, *resource_manager); + + rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device, + *resource_manager, *memory_manager, *scheduler); + + blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device, + *resource_manager, *memory_manager, *swapchain, + *scheduler, screen_info); + + return true; +} + +void RendererVulkan::ShutDown() { + if (!device) { + return; + } + const auto dev = device->GetLogical(); + const auto& dld = device->GetDispatchLoader(); + if (dev && dld.vkDeviceWaitIdle) { + dev.waitIdle(dld); + } + + rasterizer.reset(); + blit_screen.reset(); + scheduler.reset(); + swapchain.reset(); + memory_manager.reset(); + resource_manager.reset(); + device.reset(); +} + +std::optional<vk::DebugUtilsMessengerEXT> RendererVulkan::CreateDebugCallback( + const vk::DispatchLoaderDynamic& dldi) { + const vk::DebugUtilsMessengerCreateInfoEXT callback_ci( + {}, + vk::DebugUtilsMessageSeverityFlagBitsEXT::eError | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eInfo | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eVerbose, + 
vk::DebugUtilsMessageTypeFlagBitsEXT::eGeneral | + vk::DebugUtilsMessageTypeFlagBitsEXT::eValidation | + vk::DebugUtilsMessageTypeFlagBitsEXT::ePerformance, + &DebugCallback, nullptr); + vk::DebugUtilsMessengerEXT callback; + if (instance.createDebugUtilsMessengerEXT(&callback_ci, nullptr, &callback, dldi) != + vk::Result::eSuccess) { + LOG_ERROR(Render_Vulkan, "Failed to create debug callback"); + return {}; + } + return callback; +} + +bool RendererVulkan::PickDevices(const vk::DispatchLoaderDynamic& dldi) { + const auto devices = instance.enumeratePhysicalDevices(dldi); + + // TODO(Rodrigo): Choose device from config file + const s32 device_index = Settings::values.vulkan_device; + if (device_index < 0 || device_index >= static_cast<s32>(devices.size())) { + LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); + return false; + } + const vk::PhysicalDevice physical_device = devices[device_index]; + + if (!VKDevice::IsSuitable(dldi, physical_device, surface)) { + return false; + } + + device = std::make_unique<VKDevice>(dldi, physical_device, surface); + return device->Create(dldi, instance); +} + +void RendererVulkan::Report() const { + const std::string vendor_name{device->GetVendorName()}; + const std::string model_name{device->GetModelName()}; + const std::string driver_version = GetDriverVersion(*device); + const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); + + const std::string api_version = GetReadableVersion(device->GetApiVersion()); + + const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); + + LOG_INFO(Render_Vulkan, "Driver: {}", driver_name); + LOG_INFO(Render_Vulkan, "Device: {}", model_name); + LOG_INFO(Render_Vulkan, "Vulkan: {}", api_version); + + auto& telemetry_session = system.TelemetrySession(); + constexpr auto field = Telemetry::FieldType::UserSystem; + telemetry_session.AddField(field, "GPU_Vendor", vendor_name); + telemetry_session.AddField(field, "GPU_Model", model_name); + telemetry_session.AddField(field, "GPU_Vulkan_Driver", driver_name); + telemetry_session.AddField(field, "GPU_Vulkan_Version", api_version); + telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); +} + +} // namespace Vulkan
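A note on BuildCommaSeparatedExtensions in the anonymous namespace above: since {fmt} is already a dependency here, the manual join loop could plausibly be expressed with fmt::join instead. A sketch, assuming the iterator overload of fmt::join available in {fmt} versions of this era:

#include <algorithm>
#include <string>
#include <vector>

#include <fmt/format.h> // fmt::join(begin, end, sep); newer {fmt} releases move it to fmt/ranges.h

// Produces the same result as BuildCommaSeparatedExtensions: a sorted,
// comma-separated list with no trailing separator.
std::string JoinExtensions(std::vector<std::string> extensions) {
    std::sort(extensions.begin(), extensions.end());
    return fmt::format("{}", fmt::join(extensions.begin(), extensions.end(), ","));
}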
\ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 939eebe83..d1da4f9d3 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -104,8 +104,11 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan features.depthBiasClamp = true; features.geometryShader = true; features.tessellationShader = true; + features.occlusionQueryPrecise = true; features.fragmentStoresAndAtomics = true; features.shaderImageGatherExtended = true; + features.shaderStorageImageReadWithoutFormat = + is_shader_storage_img_read_without_format_supported; features.shaderStorageImageWriteWithoutFormat = true; features.textureCompressionASTC_LDR = is_optimal_astc_supported; @@ -117,6 +120,10 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan bit8_storage.uniformAndStorageBuffer8BitAccess = true; SetNext(next, bit8_storage); + vk::PhysicalDeviceHostQueryResetFeaturesEXT host_query_reset; + host_query_reset.hostQueryReset = true; + SetNext(next, host_query_reset); + vk::PhysicalDeviceFloat16Int8FeaturesKHR float16_int8; if (is_float16_supported) { float16_int8.shaderFloat16 = true; @@ -273,6 +280,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, + VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME, }; std::bitset<required_extensions.size()> available_extensions{}; @@ -340,6 +348,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev std::make_pair(features.depthBiasClamp, "depthBiasClamp"), std::make_pair(features.geometryShader, "geometryShader"), std::make_pair(features.tessellationShader, "tessellationShader"), + std::make_pair(features.occlusionQueryPrecise, "occlusionQueryPrecise"), std::make_pair(features.fragmentStoresAndAtomics, "fragmentStoresAndAtomics"), std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"), std::make_pair(features.shaderStorageImageWriteWithoutFormat, @@ -376,7 +385,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } }; - extensions.reserve(13); + extensions.reserve(14); extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME); extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME); @@ -384,6 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami extensions.push_back(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME); extensions.push_back(VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME); extensions.push_back(VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME); + extensions.push_back(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME); [[maybe_unused]] const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); @@ -400,8 +410,10 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); - Test(extension, nv_device_diagnostic_checkpoints, - VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME, true); + if (Settings::values.renderer_debug) { + Test(extension, nv_device_diagnostic_checkpoints, + VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME, true); + } } if 
(khr_shader_float16_int8) { @@ -455,6 +467,8 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { const auto supported_features{physical.getFeatures(dldi)}; + is_shader_storage_img_read_without_format_supported = + supported_features.shaderStorageImageReadWithoutFormat; is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); } @@ -528,6 +542,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eBc6HUfloatBlock, vk::Format::eBc6HSfloatBlock, vk::Format::eBc1RgbaSrgbBlock, + vk::Format::eBc2SrgbBlock, vk::Format::eBc3SrgbBlock, vk::Format::eBc7SrgbBlock, vk::Format::eAstc4x4SrgbBlock, diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 72603f9f6..2c27ad730 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -122,6 +122,11 @@ public: return properties.limits.maxPushConstantsSize; } + /// Returns true if shader storage image read without format is supported. + bool IsShaderStorageImageReadWithoutFormatSupported() const { + return is_shader_storage_img_read_without_format_supported; + } + /// Returns true if ASTC is natively supported. bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -227,6 +232,8 @@ private: bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. + bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage + ///< image read without format // Telemetry parameters std::string vendor_name; ///< Device's driver name. diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 48e23d4cd..7ddf7d3ee 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -325,9 +325,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { specialization.tessellation.primitive = fixed_state.tessellation.primitive; specialization.tessellation.spacing = fixed_state.tessellation.spacing; specialization.tessellation.clockwise = fixed_state.tessellation.clockwise; - for (const auto& rt : key.renderpass_params.color_attachments) { - specialization.enabled_rendertargets.set(rt.index); - } SPIRVProgram program; std::vector<vk::DescriptorSetLayoutBinding> bindings; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp new file mode 100644 index 000000000..ffbf60dda --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -0,0 +1,122 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
+ +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <utility> +#include <vector> + +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" + +namespace Vulkan { + +namespace { + +constexpr std::array QUERY_TARGETS = {vk::QueryType::eOcclusion}; + +constexpr vk::QueryType GetTarget(VideoCore::QueryType type) { + return QUERY_TARGETS[static_cast<std::size_t>(type)]; +} + +} // Anonymous namespace + +QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {} + +QueryPool::~QueryPool() = default; + +void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) { + device = &device_; + type = type_; +} + +std::pair<vk::QueryPool, std::uint32_t> QueryPool::Commit(VKFence& fence) { + std::size_t index; + do { + index = CommitResource(fence); + } while (usage[index]); + usage[index] = true; + + return {*pools[index / GROW_STEP], static_cast<std::uint32_t>(index % GROW_STEP)}; +} + +void QueryPool::Allocate(std::size_t begin, std::size_t end) { + usage.resize(end); + + const auto dev = device->GetLogical(); + const u32 size = static_cast<u32>(end - begin); + const vk::QueryPoolCreateInfo query_pool_ci({}, GetTarget(type), size, {}); + pools.push_back(dev.createQueryPoolUnique(query_pool_ci, nullptr, device->GetDispatchLoader())); +} + +void QueryPool::Reserve(std::pair<vk::QueryPool, std::uint32_t> query) { + const auto it = + std::find_if(std::begin(pools), std::end(pools), + [query_pool = query.first](auto& pool) { return query_pool == *pool; }); + ASSERT(it != std::end(pools)); + + const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); + usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; +} + +VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKScheduler& scheduler) + : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, + QueryPool>{system, rasterizer}, + device{device}, scheduler{scheduler} { + for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) { + query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i)); + } +} + +VKQueryCache::~VKQueryCache() = default; + +std::pair<vk::QueryPool, std::uint32_t> VKQueryCache::AllocateQuery(VideoCore::QueryType type) { + return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence()); +} + +void VKQueryCache::Reserve(VideoCore::QueryType type, + std::pair<vk::QueryPool, std::uint32_t> query) { + query_pools[static_cast<std::size_t>(type)].Reserve(query); +} + +HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type) + : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, + type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} { + const auto dev = cache.Device().GetLogical(); + cache.Scheduler().Record([dev, query = query](vk::CommandBuffer cmdbuf, auto& dld) { + dev.resetQueryPoolEXT(query.first, query.second, 1, dld); + cmdbuf.beginQuery(query.first, query.second, vk::QueryControlFlagBits::ePrecise, dld); + }); +} + +HostCounter::~HostCounter() { + cache.Reserve(type, query); +} + +void HostCounter::EndQuery() { + cache.Scheduler().Record([query = query](auto cmdbuf, auto& 
dld) { + cmdbuf.endQuery(query.first, query.second, dld); + }); +} + +u64 HostCounter::BlockingQuery() const { + if (ticks >= cache.Scheduler().Ticks()) { + cache.Scheduler().Flush(); + } + + const auto dev = cache.Device().GetLogical(); + const auto& dld = cache.Device().GetDispatchLoader(); + u64 value; + dev.getQueryPoolResults(query.first, query.second, 1, sizeof(value), &value, sizeof(value), + vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait, dld); + return value; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h new file mode 100644 index 000000000..c3092ee96 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -0,0 +1,104 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class CachedQuery; +class HostCounter; +class VKDevice; +class VKQueryCache; +class VKScheduler; + +using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>; + +class QueryPool final : public VKFencedPool { +public: + explicit QueryPool(); + ~QueryPool() override; + + void Initialize(const VKDevice& device, VideoCore::QueryType type); + + std::pair<vk::QueryPool, std::uint32_t> Commit(VKFence& fence); + + void Reserve(std::pair<vk::QueryPool, std::uint32_t> query); + +protected: + void Allocate(std::size_t begin, std::size_t end) override; + +private: + static constexpr std::size_t GROW_STEP = 512; + + const VKDevice* device = nullptr; + VideoCore::QueryType type = {}; + + std::vector<UniqueQueryPool> pools; + std::vector<bool> usage; +}; + +class VKQueryCache final + : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, + QueryPool> { +public: + explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKScheduler& scheduler); + ~VKQueryCache(); + + std::pair<vk::QueryPool, std::uint32_t> AllocateQuery(VideoCore::QueryType type); + + void Reserve(VideoCore::QueryType type, std::pair<vk::QueryPool, std::uint32_t> query); + + const VKDevice& Device() const noexcept { + return device; + } + + VKScheduler& Scheduler() const noexcept { + return scheduler; + } + +private: + const VKDevice& device; + VKScheduler& scheduler; +}; + +class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> { +public: + explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type); + ~HostCounter(); + + void EndQuery(); + +private: + u64 BlockingQuery() const override; + + VKQueryCache& cache; + const VideoCore::QueryType type; + const std::pair<vk::QueryPool, std::uint32_t> query; + const u64 ticks; +}; + +class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { +public: + explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr) + : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {} +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp 
b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d2c6b1189..31c078f6a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -289,25 +289,19 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind staging_pool), pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue), buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), - sampler_cache(device) {} - -RasterizerVulkan::~RasterizerVulkan() = default; - -bool RasterizerVulkan::DrawBatch(bool is_indexed) { - Draw(is_indexed, false); - return true; + sampler_cache(device), query_cache(system, *this, device, scheduler) { + scheduler.SetQueryCache(query_cache); } -bool RasterizerVulkan::DrawMultiBatch(bool is_indexed) { - Draw(is_indexed, true); - return true; -} +RasterizerVulkan::~RasterizerVulkan() = default; void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Drawing); FlushWork(); + query_cache.UpdateCounters(); + const auto& gpu = system.GPU().Maxwell3D(); GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)}; @@ -362,6 +356,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); + query_cache.UpdateCounters(); + const auto& gpu = system.GPU().Maxwell3D(); if (!system.GPU().Maxwell3D().ShouldExecute()) { return; @@ -429,6 +425,8 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { sampled_views.clear(); image_views.clear(); + query_cache.UpdateCounters(); + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; const ComputePipelineCacheKey key{ code_addr, @@ -471,17 +469,28 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { }); } +void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { + query_cache.ResetCounter(type); +} + +void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional<u64> timestamp) { + query_cache.Query(gpu_addr, type, timestamp); +} + void RasterizerVulkan::FlushAll() {} void RasterizerVulkan::FlushRegion(CacheAddr addr, u64 size) { texture_cache.FlushRegion(addr, size); buffer_cache.FlushRegion(addr, size); + query_cache.FlushRegion(addr, size); } void RasterizerVulkan::InvalidateRegion(CacheAddr addr, u64 size) { texture_cache.InvalidateRegion(addr, size); pipeline_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); + query_cache.InvalidateRegion(addr, size); } void RasterizerVulkan::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { @@ -571,7 +580,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true); } if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) { - texceptions.set(rt); + texceptions[rt] = true; } } @@ -579,7 +588,7 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { zeta_attachment = texture_cache.GetDepthBufferSurface(true); } if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) { - texceptions.set(ZETA_TEXCEPTION_INDEX); + texceptions[ZETA_TEXCEPTION_INDEX] = true; } texture_cache.GuardRenderTargets(false); @@ -1122,11 +1131,12 @@ RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions) for (std::size_t rt = 0; rt < static_cast<std::size_t>(regs.rt_control.count); ++rt) { const auto& rendertarget = regs.rt[rt]; - if (rendertarget.Address() 
== 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) + if (rendertarget.Address() == 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) { continue; + } renderpass_params.color_attachments.push_back(RenderPassParams::ColorAttachment{ static_cast<u32>(rt), PixelFormatFromRenderTargetFormat(rendertarget.format), - texceptions.test(rt)}); + texceptions[rt]}); } renderpass_params.has_zeta = regs.zeta_enable; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 7be71e734..138903d60 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -24,6 +24,7 @@ #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" @@ -96,7 +97,7 @@ struct ImageView { vk::ImageLayout* layout = nullptr; }; -class RasterizerVulkan : public VideoCore::RasterizerAccelerated { +class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, VKScreenInfo& screen_info, const VKDevice& device, @@ -104,10 +105,11 @@ public: VKScheduler& scheduler); ~RasterizerVulkan() override; - bool DrawBatch(bool is_indexed) override; - bool DrawMultiBatch(bool is_indexed) override; + void Draw(bool is_indexed, bool is_instanced) override; void Clear() override; void DispatchCompute(GPUVAddr code_addr) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -140,8 +142,6 @@ private: static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8; - void Draw(bool is_indexed, bool is_instanced); - void FlushWork(); Texceptions UpdateAttachments(); @@ -247,6 +247,7 @@ private: VKPipelineCache pipeline_cache; VKBufferCache buffer_cache; VKSamplerCache sampler_cache; + VKQueryCache query_cache; std::array<View, Maxwell::NumRenderTargets> color_attachments; View zeta_attachment; diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index 0a8ec8398..204b7c39c 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -23,7 +23,14 @@ static std::optional<vk::BorderColor> TryConvertBorderColor(std::array<float, 4> } else if (color == std::array<float, 4>{1, 1, 1, 1}) { return vk::BorderColor::eFloatOpaqueWhite; } else { - return {}; + if (color[0] + color[1] + color[2] > 1.35f) { + // If color elements are brighter than roughly 0.5 average, use white border + return vk::BorderColor::eFloatOpaqueWhite; + } + if (color[3] > 0.5f) { + return vk::BorderColor::eFloatOpaqueBlack; + } + return vk::BorderColor::eFloatTransparentBlack; } } @@ -37,8 +44,6 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const auto border_color{tsc.GetBorderColor()}; const auto vk_border_color{TryConvertBorderColor(border_color)}; - UNIMPLEMENTED_IF_MSG(!vk_border_color, "Unimplemented 
border color {} {} {} {}", - border_color[0], border_color[1], border_color[2], border_color[3]); constexpr bool unnormalized_coords{false}; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index d66133ad1..92bd6c344 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -6,6 +6,7 @@ #include "common/microprofile.h" #include "video_core/renderer_vulkan/declarations.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -139,6 +140,8 @@ void VKScheduler::SubmitExecution(vk::Semaphore semaphore) { } void VKScheduler::AllocateNewContext() { + ++ticks; + std::unique_lock lock{mutex}; current_fence = next_fence; next_fence = &resource_manager.CommitFence(); @@ -146,6 +149,10 @@ void VKScheduler::AllocateNewContext() { current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence); current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit}, device.GetDispatchLoader()); + // Enable counters once again. These are disabled when a command buffer is finished. + if (query_cache) { + query_cache->UpdateCounters(); + } } void VKScheduler::InvalidateState() { @@ -159,6 +166,7 @@ void VKScheduler::InvalidateState() { } void VKScheduler::EndPendingOperations() { + query_cache->DisableStreams(); EndRenderPass(); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index bcdffbba0..62fd7858b 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -4,6 +4,7 @@ #pragma once +#include <atomic> #include <condition_variable> #include <memory> #include <optional> @@ -18,6 +19,7 @@ namespace Vulkan { class VKDevice; class VKFence; +class VKQueryCache; class VKResourceManager; class VKFenceView { @@ -67,6 +69,11 @@ public: /// Binds a pipeline to the current execution context. void BindGraphicsPipeline(vk::Pipeline pipeline); + /// Assigns the query cache. + void SetQueryCache(VKQueryCache& query_cache_) { + query_cache = &query_cache_; + } + /// Returns true when viewports have been set in the current command buffer. bool TouchViewports() { return std::exchange(state.viewports, true); @@ -112,6 +119,11 @@ public: return current_fence; } + /// Returns the current command buffer tick. 
+ u64 Ticks() const { + return ticks; + } + private: class Command { public: @@ -205,6 +217,8 @@ private: const VKDevice& device; VKResourceManager& resource_manager; + VKQueryCache* query_cache = nullptr; + vk::CommandBuffer current_cmdbuf; VKFence* current_fence = nullptr; VKFence* next_fence = nullptr; @@ -227,6 +241,7 @@ private: Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve; std::mutex mutex; std::condition_variable cv; + std::atomic<u64> ticks = 0; bool quit = false; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index dd6d2ef03..6d0bf6aa1 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -275,12 +275,14 @@ public: AddCapability(spv::Capability::ImageGatherExtended); AddCapability(spv::Capability::SampledBuffer); AddCapability(spv::Capability::StorageImageWriteWithoutFormat); + AddCapability(spv::Capability::DrawParameters); AddCapability(spv::Capability::SubgroupBallotKHR); AddCapability(spv::Capability::SubgroupVoteKHR); AddExtension("SPV_KHR_shader_ballot"); AddExtension("SPV_KHR_subgroup_vote"); AddExtension("SPV_KHR_storage_buffer_storage_class"); AddExtension("SPV_KHR_variable_pointers"); + AddExtension("SPV_KHR_shader_draw_parameters"); if (ir.UsesViewportIndex()) { AddCapability(spv::Capability::MultiViewport); @@ -290,6 +292,10 @@ public: } } + if (device.IsShaderStorageImageReadWithoutFormatSupported()) { + AddCapability(spv::Capability::StorageImageReadWithoutFormat); + } + if (device.IsFloat16Supported()) { AddCapability(spv::Capability::Float16); } @@ -353,6 +359,7 @@ private: DeclareFragment(); DeclareCompute(); DeclareRegisters(); + DeclareCustomVariables(); DeclarePredicates(); DeclareLocalMemory(); DeclareSharedMemory(); @@ -491,9 +498,11 @@ private: interfaces.push_back(AddGlobalVariable(Name(out_vertex, "out_vertex"))); // Declare input attributes - vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_uint, "vertex_index"); + vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_int, "vertex_index"); instance_index = - DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_uint, "instance_index"); + DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_int, "instance_index"); + base_vertex = DeclareInputBuiltIn(spv::BuiltIn::BaseVertex, t_in_int, "base_vertex"); + base_instance = DeclareInputBuiltIn(spv::BuiltIn::BaseInstance, t_in_int, "base_instance"); } void DeclareTessControl() { @@ -542,11 +551,10 @@ private: return; } - for (u32 rt = 0; rt < static_cast<u32>(frag_colors.size()); ++rt) { - if (!specialization.enabled_rendertargets[rt]) { + for (u32 rt = 0; rt < static_cast<u32>(std::size(frag_colors)); ++rt) { + if (!IsRenderTargetEnabled(rt)) { continue; } - const Id id = AddGlobalVariable(OpVariable(t_out_float4, spv::StorageClass::Output)); Name(id, fmt::format("frag_color{}", rt)); Decorate(id, spv::Decoration::Location, rt); @@ -587,6 +595,15 @@ private: } } + void DeclareCustomVariables() { + const u32 num_custom_variables = ir.GetNumCustomVariables(); + for (u32 i = 0; i < num_custom_variables; ++i) { + const Id id = OpVariable(t_prv_float, spv::StorageClass::Private, v_float_zero); + Name(id, fmt::format("custom_var_{}", i)); + custom_variables.emplace(i, AddGlobalVariable(id)); + } + } + void DeclarePredicates() { for (const auto pred : ir.GetPredicates()) { const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false); @@ -852,6 
+869,15 @@ private: return binding; } + bool IsRenderTargetEnabled(u32 rt) const { + for (u32 component = 0; component < 4; ++component) { + if (header.ps.IsColorComponentOutputEnabled(rt, component)) { + return true; + } + } + return false; + } + bool IsInputAttributeArray() const { return stage == ShaderType::TesselationControl || stage == ShaderType::TesselationEval || stage == ShaderType::Geometry; @@ -974,6 +1000,11 @@ private: return {OpLoad(t_float, registers.at(index)), Type::Float}; } + if (const auto cv = std::get_if<CustomVarNode>(&*node)) { + const u32 index = cv->GetIndex(); + return {OpLoad(t_float, custom_variables.at(index)), Type::Float}; + } + if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { return {Constant(t_uint, immediate->GetValue()), Type::Uint}; } @@ -1045,9 +1076,12 @@ private: return {OpLoad(t_float, AccessElement(t_in_float, tess_coord, element)), Type::Float}; case 2: - return {OpLoad(t_uint, instance_index), Type::Uint}; + return { + OpISub(t_int, OpLoad(t_int, instance_index), OpLoad(t_int, base_instance)), + Type::Int}; case 3: - return {OpLoad(t_uint, vertex_index), Type::Uint}; + return {OpISub(t_int, OpLoad(t_int, vertex_index), OpLoad(t_int, base_vertex)), + Type::Int}; } UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); return {Constant(t_uint, 0U), Type::Uint}; @@ -1115,15 +1149,7 @@ private: } if (const auto gmem = std::get_if<GmemNode>(&*node)) { - const Id gmem_buffer = global_buffers.at(gmem->GetDescriptor()); - const Id real = AsUint(Visit(gmem->GetRealAddress())); - const Id base = AsUint(Visit(gmem->GetBaseAddress())); - - Id offset = OpISub(t_uint, real, base); - offset = OpUDiv(t_uint, offset, Constant(t_uint, 4U)); - return {OpLoad(t_float, - OpAccessChain(t_gmem_float, gmem_buffer, Constant(t_uint, 0U), offset)), - Type::Float}; + return {OpLoad(t_uint, GetGlobalMemoryPointer(*gmem)), Type::Uint}; } if (const auto lmem = std::get_if<LmemNode>(&*node)) { @@ -1134,10 +1160,7 @@ private: } if (const auto smem = std::get_if<SmemNode>(&*node)) { - Id address = AsUint(Visit(smem->GetAddress())); - address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); - const Id pointer = OpAccessChain(t_smem_uint, shared_memory, address); - return {OpLoad(t_uint, pointer), Type::Uint}; + return {OpLoad(t_uint, GetSharedMemoryPointer(*smem)), Type::Uint}; } if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { @@ -1331,20 +1354,13 @@ private: target = {OpAccessChain(t_prv_float, local_memory, address), Type::Float}; } else if (const auto smem = std::get_if<SmemNode>(&*dest)) { - ASSERT(stage == ShaderType::Compute); - Id address = AsUint(Visit(smem->GetAddress())); - address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); - target = {OpAccessChain(t_smem_uint, shared_memory, address), Type::Uint}; + target = {GetSharedMemoryPointer(*smem), Type::Uint}; } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { - const Id real = AsUint(Visit(gmem->GetRealAddress())); - const Id base = AsUint(Visit(gmem->GetBaseAddress())); - const Id diff = OpISub(t_uint, real, base); - const Id offset = OpShiftRightLogical(t_uint, diff, Constant(t_uint, 2)); + target = {GetGlobalMemoryPointer(*gmem), Type::Uint}; - const Id gmem_buffer = global_buffers.at(gmem->GetDescriptor()); - target = {OpAccessChain(t_gmem_float, gmem_buffer, Constant(t_uint, 0), offset), - Type::Float}; + } else if (const auto cv = std::get_if<CustomVarNode>(&*dest)) { + target = 
{custom_variables.at(cv->GetIndex()), Type::Float}; } else { UNIMPLEMENTED(); @@ -1743,8 +1759,16 @@ private: } Expression ImageLoad(Operation operation) { - UNIMPLEMENTED(); - return {}; + if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { + return {v_float_zero, Type::Float}; + } + + const auto& meta{std::get<MetaImage>(operation.GetMeta())}; + + const Id coords = GetCoordinates(operation, Type::Int); + const Id texel = OpImageRead(t_uint4, GetImage(operation), coords); + + return {OpCompositeExtract(t_uint, texel, meta.element), Type::Uint}; } Expression ImageStore(Operation operation) { @@ -1796,11 +1820,16 @@ private: return {}; } - Expression UAtomicAdd(Operation operation) { - const auto& smem = std::get<SmemNode>(*operation[0]); - Id address = AsUint(Visit(smem.GetAddress())); - address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); - const Id pointer = OpAccessChain(t_smem_uint, shared_memory, address); + Expression AtomicAdd(Operation operation) { + Id pointer; + if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { + pointer = GetSharedMemoryPointer(*smem); + } else if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { + pointer = GetGlobalMemoryPointer(*gmem); + } else { + UNREACHABLE(); + return {Constant(t_uint, 0), Type::Uint}; + } const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); const Id semantics = Constant(t_uint, 0U); @@ -1889,19 +1918,14 @@ private: // rendertargets/components are skipped in the register assignment. u32 current_reg = 0; for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { - if (!specialization.enabled_rendertargets[rt]) { - // Skip rendertargets that are not enabled - continue; - } // TODO(Subv): Figure out how dual-source blending is configured in the Switch. for (u32 component = 0; component < 4; ++component) { - const Id pointer = AccessElement(t_out_float, frag_colors.at(rt), component); - if (header.ps.IsColorComponentOutputEnabled(rt, component)) { - OpStore(pointer, SafeGetRegister(current_reg)); - ++current_reg; - } else { - OpStore(pointer, component == 3 ? 
v_float_one : v_float_zero); + if (!header.ps.IsColorComponentOutputEnabled(rt, component)) { + continue; } + const Id pointer = AccessElement(t_out_float, frag_colors[rt], component); + OpStore(pointer, SafeGetRegister(current_reg)); + ++current_reg; } } if (header.ps.omap.depth) { @@ -2240,6 +2264,22 @@ private: return {}; } + Id GetGlobalMemoryPointer(const GmemNode& gmem) { + const Id real = AsUint(Visit(gmem.GetRealAddress())); + const Id base = AsUint(Visit(gmem.GetBaseAddress())); + const Id diff = OpISub(t_uint, real, base); + const Id offset = OpShiftRightLogical(t_uint, diff, Constant(t_uint, 2)); + const Id buffer = global_buffers.at(gmem.GetDescriptor()); + return OpAccessChain(t_gmem_uint, buffer, Constant(t_uint, 0), offset); + } + + Id GetSharedMemoryPointer(const SmemNode& smem) { + ASSERT(stage == ShaderType::Compute); + Id address = AsUint(Visit(smem.GetAddress())); + address = OpShiftRightLogical(t_uint, address, Constant(t_uint, 2U)); + return OpAccessChain(t_smem_uint, shared_memory, address); + } + static constexpr std::array operation_decompilers = { &SPIRVDecompiler::Assign, @@ -2386,7 +2426,7 @@ private: &SPIRVDecompiler::AtomicImageXor, &SPIRVDecompiler::AtomicImageExchange, - &SPIRVDecompiler::UAtomicAdd, + &SPIRVDecompiler::AtomicAdd, &SPIRVDecompiler::Branch, &SPIRVDecompiler::BranchIndirect, @@ -2482,9 +2522,9 @@ private: Id t_smem_uint{}; - const Id t_gmem_float = TypePointer(spv::StorageClass::StorageBuffer, t_float); + const Id t_gmem_uint = TypePointer(spv::StorageClass::StorageBuffer, t_uint); const Id t_gmem_array = - Name(Decorate(TypeRuntimeArray(t_float), spv::Decoration::ArrayStride, 4U), "GmemArray"); + Name(Decorate(TypeRuntimeArray(t_uint), spv::Decoration::ArrayStride, 4U), "GmemArray"); const Id t_gmem_struct = MemberDecorate( Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); @@ -2505,6 +2545,7 @@ private: Id out_vertex{}; Id in_vertex{}; std::map<u32, Id> registers; + std::map<u32, Id> custom_variables; std::map<Tegra::Shader::Pred, Id> predicates; std::map<u32, Id> flow_variables; Id local_memory{}; @@ -2520,6 +2561,8 @@ private: Id instance_index{}; Id vertex_index{}; + Id base_instance{}; + Id base_vertex{}; std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id frag_depth{}; Id frag_coord{}; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index 10794be1c..f5dc14d9e 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -102,9 +102,6 @@ struct Specialization final { Maxwell::TessellationSpacing spacing{}; bool clockwise{}; } tessellation; - - // Fragment specific - std::bitset<8> enabled_rendertargets; }; // Old gcc versions don't consider this trivially copyable. 
// static_assert(std::is_trivially_copyable_v<Specialization>); diff --git a/src/video_core/shader/ast.h b/src/video_core/shader/ast.h index a2f0044ba..cca13bcde 100644 --- a/src/video_core/shader/ast.h +++ b/src/video_core/shader/ast.h @@ -65,8 +65,8 @@ public: void DetachSegment(ASTNode start, ASTNode end); void Remove(ASTNode node); - ASTNode first{}; - ASTNode last{}; + ASTNode first; + ASTNode last; }; class ASTProgram { @@ -299,9 +299,9 @@ private: friend class ASTZipper; ASTData data; - ASTNode parent{}; - ASTNode next{}; - ASTNode previous{}; + ASTNode parent; + ASTNode next; + ASTNode previous; ASTZipper* manager{}; }; diff --git a/src/video_core/shader/const_buffer_locker.cpp b/src/video_core/shader/const_buffer_locker.cpp index a4a0319eb..0638be8cb 100644 --- a/src/video_core/shader/const_buffer_locker.cpp +++ b/src/video_core/shader/const_buffer_locker.cpp @@ -66,6 +66,18 @@ std::optional<Tegra::Engines::SamplerDescriptor> ConstBufferLocker::ObtainBindle return value; } +std::optional<u32> ConstBufferLocker::ObtainBoundBuffer() { + if (bound_buffer_saved) { + return bound_buffer; + } + if (!engine) { + return std::nullopt; + } + bound_buffer_saved = true; + bound_buffer = engine->GetBoundBuffer(); + return bound_buffer; +} + void ConstBufferLocker::InsertKey(u32 buffer, u32 offset, u32 value) { keys.insert_or_assign({buffer, offset}, value); } @@ -78,6 +90,11 @@ void ConstBufferLocker::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDes bindless_samplers.insert_or_assign({buffer, offset}, sampler); } +void ConstBufferLocker::SetBoundBuffer(u32 buffer) { + bound_buffer_saved = true; + bound_buffer = buffer; +} + bool ConstBufferLocker::IsConsistent() const { if (!engine) { return false; diff --git a/src/video_core/shader/const_buffer_locker.h b/src/video_core/shader/const_buffer_locker.h index d32e2d657..d3ea11087 100644 --- a/src/video_core/shader/const_buffer_locker.h +++ b/src/video_core/shader/const_buffer_locker.h @@ -10,6 +10,7 @@ #include "common/hash.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/shader_type.h" +#include "video_core/guest_driver.h" namespace VideoCommon::Shader { @@ -40,6 +41,8 @@ public: std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); + std::optional<u32> ObtainBoundBuffer(); + /// Inserts a key. void InsertKey(u32 buffer, u32 offset, u32 value); @@ -49,6 +52,9 @@ public: /// Inserts a bindless sampler key. void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler); + /// Sets the bound buffer for this locker. + void SetBoundBuffer(u32 buffer); + /// Checks keys and samplers against the engine's current const buffers. Returns true if they are /// the same value, false otherwise. bool IsConsistent() const; @@ -71,12 +77,27 @@ public: return bindless_samplers; } + /// Gets the bound buffer used by this shader. + u32 GetBoundBuffer() const { + return bound_buffer; + } + + /// Obtains access to the guest driver's profile. 
+ VideoCore::GuestDriverProfile* AccessGuestDriverProfile() const { + if (engine) { + return &engine->AccessGuestDriverProfile(); + } + return nullptr; + } + private: const Tegra::Engines::ShaderType stage; Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; KeyMap keys; BoundSamplerMap bound_samplers; BindlessSamplerMap bindless_samplers; + bool bound_buffer_saved{}; + u32 bound_buffer{}; }; } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index 22c3e5120..6b697ed5d 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <cstring> +#include <limits> #include <set> #include <fmt/format.h> @@ -33,6 +34,52 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { return (absolute_offset % SchedPeriod) == 0; } +void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver, + const std::list<Sampler>& used_samplers) { + if (gpu_driver == nullptr) { + LOG_CRITICAL(HW_GPU, "GPU driver profile has not been created yet"); + return; + } + if (gpu_driver->TextureHandlerSizeKnown() || used_samplers.size() <= 1) { + return; + } + u32 count{}; + std::vector<u32> bound_offsets; + for (const auto& sampler : used_samplers) { + if (sampler.IsBindless()) { + continue; + } + ++count; + bound_offsets.emplace_back(sampler.GetOffset()); + } + if (count > 1) { + gpu_driver->DeduceTextureHandlerSize(std::move(bound_offsets)); + } +} + +std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, + VideoCore::GuestDriverProfile* gpu_driver, + const std::list<Sampler>& used_samplers) { + if (gpu_driver == nullptr) { + LOG_CRITICAL(HW_GPU, "GPU driver profile has not been created yet"); + return std::nullopt; + } + const u32 base_offset = sampler_to_deduce.GetOffset(); + u32 max_offset{std::numeric_limits<u32>::max()}; + for (const auto& sampler : used_samplers) { + if (sampler.IsBindless()) { + continue; + } + if (sampler.GetOffset() > base_offset) { + max_offset = std::min(sampler.GetOffset(), max_offset); + } + } + if (max_offset == std::numeric_limits<u32>::max()) { + return std::nullopt; + } + return ((max_offset - base_offset) * 4) / gpu_driver->GetTextureHandlerSize(); +} + } // Anonymous namespace class ASTDecoder { @@ -315,4 +362,25 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) { return pc + 1; } +void ShaderIR::PostDecode() { + // Deduce texture handler size if needed + auto gpu_driver = locker.AccessGuestDriverProfile(); + DeduceTextureHandlerSize(gpu_driver, used_samplers); + // Deduce indexed samplers + if (!uses_indexed_samplers) { + return; + } + for (auto& sampler : used_samplers) { + if (!sampler.IsIndexed()) { + continue; + } + if (const auto size = TryDeduceSamplerSize(sampler, gpu_driver, used_samplers)) { + sampler.SetSize(*size); + } else { + LOG_CRITICAL(HW_GPU, "Failed to deduce size of indexed sampler"); + sampler.SetSize(1); + } + } +} + } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index fcedd2af6..90240c765 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -21,7 +21,7 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { Node op_a = GetRegister(instr.gpr8); - Node op_b = [&]() -> Node { + Node op_b = [&] { if (instr.is_b_imm) { return GetImmediate19(instr); } else if (instr.is_b_gpr) { @@ -141,6 +141,15 @@ u32 
ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { SetRegister(bb, instr.gpr0, value); break; } + case OpCode::Id::FCMP_R: { + UNIMPLEMENTED_IF(instr.fcmp.ftz == 0); + Node op_c = GetRegister(instr.gpr39); + Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f)); + SetRegister( + bb, instr.gpr0, + Operation(OperationCode::Select, std::move(comp), std::move(op_a), std::move(op_b))); + break; + } case OpCode::Id::RRO_C: case OpCode::Id::RRO_R: case OpCode::Id::RRO_IMM: { diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index 371fae127..21366869d 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -166,13 +166,13 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { const auto [op_rhs, test] = [&]() -> std::pair<Node, Node> { switch (opcode->get().GetId()) { case OpCode::Id::ICMP_CR: - return {GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset), + return {GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()), GetRegister(instr.gpr39)}; case OpCode::Id::ICMP_R: return {GetRegister(instr.gpr20), GetRegister(instr.gpr39)}; case OpCode::Id::ICMP_RC: return {GetRegister(instr.gpr39), - GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset)}; + GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::ICMP_IMM: return {Immediate(instr.alu.GetSignedImm20_20()), GetRegister(instr.gpr39)}; default: @@ -297,7 +297,7 @@ void ShaderIR::WriteLop3Instruction(NodeBlock& bb, Register dest, Node op_a, Nod const Node one = Immediate(1); const Node two = Immediate(2); - Node value{}; + Node value; for (u32 i = 0; i < lop_iterations; ++i) { const Node shift_amount = Immediate(i); diff --git a/src/video_core/shader/decode/bfi.cpp b/src/video_core/shader/decode/bfi.cpp index 8be1119df..70d1c055b 100644 --- a/src/video_core/shader/decode/bfi.cpp +++ b/src/video_core/shader/decode/bfi.cpp @@ -17,10 +17,13 @@ u32 ShaderIR::DecodeBfi(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - const auto [base, packed_shift] = [&]() -> std::tuple<Node, Node> { + const auto [packed_shift, base] = [&]() -> std::pair<Node, Node> { switch (opcode->get().GetId()) { + case OpCode::Id::BFI_RC: + return {GetRegister(instr.gpr39), + GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::BFI_IMM_R: - return {GetRegister(instr.gpr39), Immediate(instr.alu.GetSignedImm20_20())}; + return {Immediate(instr.alu.GetSignedImm20_20()), GetRegister(instr.gpr39)}; default: UNREACHABLE(); return {Immediate(0), Immediate(0)}; diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index 0eeb75559..6ead42070 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -83,14 +83,14 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { const bool input_signed = instr.conversion.is_input_signed; - if (instr.conversion.src_size == Register::Size::Byte) { - const u32 offset = static_cast<u32>(instr.conversion.int_src.selector) * 8; - if (offset > 0) { - value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, - std::move(value), Immediate(offset)); + if (const u32 offset = static_cast<u32>(instr.conversion.int_src.selector); offset > 0) { + ASSERT(instr.conversion.src_size == Register::Size::Byte || + instr.conversion.src_size == 
Register::Size::Short); + if (instr.conversion.src_size == Register::Size::Short) { + ASSERT(offset == 0 || offset == 2); } - } else { - UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0); + value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, + std::move(value), Immediate(offset * 8)); } value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index 7591a715f..b5fbc4d58 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -19,9 +19,12 @@ namespace VideoCommon::Shader { using Tegra::Shader::AtomicOp; using Tegra::Shader::AtomicType; using Tegra::Shader::Attribute; +using Tegra::Shader::GlobalAtomicOp; +using Tegra::Shader::GlobalAtomicType; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; using Tegra::Shader::Register; +using Tegra::Shader::StoreType; namespace { @@ -61,6 +64,27 @@ u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) { } } +Node ExtractUnaligned(Node value, Node address, u32 mask, u32 size) { + Node offset = Operation(OperationCode::UBitwiseAnd, address, Immediate(mask)); + offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3)); + return Operation(OperationCode::UBitfieldExtract, std::move(value), std::move(offset), + Immediate(size)); +} + +Node InsertUnaligned(Node dest, Node value, Node address, u32 mask, u32 size) { + Node offset = Operation(OperationCode::UBitwiseAnd, std::move(address), Immediate(mask)); + offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3)); + return Operation(OperationCode::UBitfieldInsert, std::move(dest), std::move(value), + std::move(offset), Immediate(size)); +} + +Node Sign16Extend(Node value) { + Node sign = Operation(OperationCode::UBitwiseAnd, value, Immediate(1U << 15)); + Node is_sign = Operation(OperationCode::LogicalUEqual, std::move(sign), Immediate(1U << 15)); + Node extend = Operation(OperationCode::Select, is_sign, Immediate(0xFFFF0000), Immediate(0)); + return Operation(OperationCode::UBitwiseOr, std::move(value), std::move(extend)); +} + } // Anonymous namespace u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { @@ -136,26 +160,31 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown)); [[fallthrough]]; case OpCode::Id::LD_S: { - const auto GetMemory = [&](s32 offset) { + const auto GetAddress = [&](s32 offset) { ASSERT(offset % 4 == 0); const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset); - const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), - immediate_offset); - return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(address) - : GetLocalMemory(address); + return Operation(OperationCode::IAdd, GetRegister(instr.gpr8), immediate_offset); + }; + const auto GetMemory = [&](s32 offset) { + return opcode->get().GetId() == OpCode::Id::LD_S ? 
GetSharedMemory(GetAddress(offset)) + : GetLocalMemory(GetAddress(offset)); }; switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits32: - case Tegra::Shader::StoreType::Bits64: - case Tegra::Shader::StoreType::Bits128: { - const u32 count = [&]() { + case StoreType::Signed16: + SetRegister(bb, instr.gpr0, + Sign16Extend(ExtractUnaligned(GetMemory(0), GetAddress(0), 0b10, 16))); + break; + case StoreType::Bits32: + case StoreType::Bits64: + case StoreType::Bits128: { + const u32 count = [&] { switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits32: + case StoreType::Bits32: return 1; - case Tegra::Shader::StoreType::Bits64: + case StoreType::Bits64: return 2; - case Tegra::Shader::StoreType::Bits128: + case StoreType::Bits128: return 4; default: UNREACHABLE(); @@ -212,12 +241,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { // To handle unaligned loads get the bytes used to dereference global memory and extract // those bytes from the loaded u32. if (IsUnaligned(type)) { - Node mask = Immediate(GetUnalignedMask(type)); - Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask)); - offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3)); - - gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem), - std::move(offset), Immediate(size)); + gmem = ExtractUnaligned(gmem, real_address, GetUnalignedMask(type), size); } SetTemporary(bb, i, gmem); @@ -269,21 +293,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate); }; - const auto set_memory = opcode->get().GetId() == OpCode::Id::ST_L - ? &ShaderIR::SetLocalMemory - : &ShaderIR::SetSharedMemory; + const bool is_local = opcode->get().GetId() == OpCode::Id::ST_L; + const auto set_memory = is_local ? &ShaderIR::SetLocalMemory : &ShaderIR::SetSharedMemory; + const auto get_memory = is_local ? 
&ShaderIR::GetLocalMemory : &ShaderIR::GetSharedMemory; switch (instr.ldst_sl.type.Value()) { - case Tegra::Shader::StoreType::Bits128: + case StoreType::Bits128: (this->*set_memory)(bb, GetAddress(12), GetRegister(instr.gpr0.Value() + 3)); (this->*set_memory)(bb, GetAddress(8), GetRegister(instr.gpr0.Value() + 2)); [[fallthrough]]; - case Tegra::Shader::StoreType::Bits64: + case StoreType::Bits64: (this->*set_memory)(bb, GetAddress(4), GetRegister(instr.gpr0.Value() + 1)); [[fallthrough]]; - case Tegra::Shader::StoreType::Bits32: + case StoreType::Bits32: (this->*set_memory)(bb, GetAddress(0), GetRegister(instr.gpr0)); break; + case StoreType::Signed16: { + Node address = GetAddress(0); + Node memory = (this->*get_memory)(address); + (this->*set_memory)( + bb, address, InsertUnaligned(memory, GetRegister(instr.gpr0), address, 0b10, 16)); + break; + } default: UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(), static_cast<u32>(instr.ldst_sl.type.Value())); @@ -323,18 +354,32 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { Node value = GetRegister(instr.gpr0.Value() + i); if (IsUnaligned(type)) { - Node mask = Immediate(GetUnalignedMask(type)); - Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask)); - offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3)); - - value = Operation(OperationCode::UBitfieldInsert, gmem, std::move(value), offset, - Immediate(size)); + const u32 mask = GetUnalignedMask(type); + value = InsertUnaligned(gmem, std::move(value), real_address, mask, size); } bb.push_back(Operation(OperationCode::Assign, gmem, value)); } break; } + case OpCode::Id::ATOM: { + UNIMPLEMENTED_IF_MSG(instr.atom.operation != GlobalAtomicOp::Add, "operation={}", + static_cast<int>(instr.atom.operation.Value())); + UNIMPLEMENTED_IF_MSG(instr.atom.type != GlobalAtomicType::S32, "type={}", + static_cast<int>(instr.atom.type.Value())); + + const auto [real_address, base_address, descriptor] = + TrackGlobalMemory(bb, instr, true, true); + if (!real_address || !base_address) { + // Tracking failed, skip atomic. 
+ break; + } + + Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); + Node value = Operation(OperationCode::AtomicAdd, std::move(gmem), GetRegister(instr.gpr20)); + SetRegister(bb, instr.gpr0, std::move(value)); + break; + } case OpCode::Id::ATOMS: { UNIMPLEMENTED_IF_MSG(instr.atoms.operation != AtomicOp::Add, "operation={}", static_cast<int>(instr.atoms.operation.Value())); @@ -348,7 +393,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { Node memory = GetSharedMemory(std::move(address)); Node data = GetRegister(instr.gpr20); - Node value = Operation(OperationCode::UAtomicAdd, std::move(memory), std::move(data)); + Node value = Operation(OperationCode::AtomicAdd, std::move(memory), std::move(data)); SetRegister(bb, instr.gpr0, std::move(value)); break; } diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index 7321698b2..4944e9d69 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -69,13 +69,16 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { case OpCode::Id::MOV_SYS: { const Node value = [this, instr] { switch (instr.sys20) { + case SystemVariable::LaneId: + LOG_WARNING(HW_GPU, "MOV_SYS instruction with LaneId is incomplete"); + return Immediate(0U); case SystemVariable::InvocationId: return Operation(OperationCode::InvocationId); case SystemVariable::Ydirection: return Operation(OperationCode::YNegate); case SystemVariable::InvocationInfo: LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete"); - return Immediate(0u); + return Immediate(0U); case SystemVariable::Tid: { Node value = Immediate(0); value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9); @@ -188,7 +191,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}", static_cast<u32>(cc)); - if (disable_flow_stack) { + if (decompiled) { break; } @@ -200,7 +203,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}", static_cast<u32>(cc)); - if (disable_flow_stack) { + if (decompiled) { break; } diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp index d419e9c45..3b391d3e6 100644 --- a/src/video_core/shader/decode/shift.cpp +++ b/src/video_core/shader/decode/shift.cpp @@ -10,8 +10,80 @@ namespace VideoCommon::Shader { +using std::move; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +using Tegra::Shader::ShfType; +using Tegra::Shader::ShfXmode; + +namespace { + +Node IsFull(Node shift) { + return Operation(OperationCode::LogicalIEqual, move(shift), Immediate(32)); +} + +Node Shift(OperationCode opcode, Node value, Node shift) { + Node shifted = Operation(opcode, move(value), shift); + return Operation(OperationCode::Select, IsFull(move(shift)), Immediate(0), move(shifted)); +} + +Node ClampShift(Node shift, s32 size = 32) { + shift = Operation(OperationCode::IMax, move(shift), Immediate(0)); + return Operation(OperationCode::IMin, move(shift), Immediate(size)); +} + +Node WrapShift(Node shift, s32 size = 32) { + return Operation(OperationCode::UBitwiseAnd, move(shift), Immediate(size - 1)); +} + +Node ShiftRight(Node low, Node high, Node shift, Node low_shift, ShfType type) { + // These 
values are used when the shift value is less than 32 + Node less_low = Shift(OperationCode::ILogicalShiftRight, low, shift); + Node less_high = Shift(OperationCode::ILogicalShiftLeft, high, low_shift); + Node less = Operation(OperationCode::IBitwiseOr, move(less_high), move(less_low)); + + if (type == ShfType::Bits32) { + // On 32 bit shifts we are either full (shifting 32) or shifting less than 32 bits + return Operation(OperationCode::Select, IsFull(move(shift)), move(high), move(less)); + } + + // And these when the shift is 32 or larger + const bool is_signed = type == ShfType::S64; + const auto opcode = SignedToUnsignedCode(OperationCode::IArithmeticShiftRight, is_signed); + Node reduced = Operation(OperationCode::IAdd, shift, Immediate(-32)); + Node greater = Shift(opcode, high, move(reduced)); + + Node is_less = Operation(OperationCode::LogicalILessThan, shift, Immediate(32)); + Node is_zero = Operation(OperationCode::LogicalIEqual, move(shift), Immediate(0)); + + Node value = Operation(OperationCode::Select, move(is_less), move(less), move(greater)); + return Operation(OperationCode::Select, move(is_zero), move(high), move(value)); +} + +Node ShiftLeft(Node low, Node high, Node shift, Node low_shift, ShfType type) { + // These values are used when the shift value is less than 32 + Node less_low = Operation(OperationCode::ILogicalShiftRight, low, low_shift); + Node less_high = Operation(OperationCode::ILogicalShiftLeft, high, shift); + Node less = Operation(OperationCode::IBitwiseOr, move(less_low), move(less_high)); + + if (type == ShfType::Bits32) { + // On 32 bit shifts we are either full (shifting 32) or shifting less than 32 bits + return Operation(OperationCode::Select, IsFull(move(shift)), move(low), move(less)); + } + + // And these when the shift is 32 or larger + Node reduced = Operation(OperationCode::IAdd, shift, Immediate(-32)); + Node greater = Shift(OperationCode::ILogicalShiftLeft, move(low), move(reduced)); + + Node is_less = Operation(OperationCode::LogicalILessThan, shift, Immediate(32)); + Node is_zero = Operation(OperationCode::LogicalIEqual, move(shift), Immediate(0)); + + Node value = Operation(OperationCode::Select, move(is_less), move(less), move(greater)); + return Operation(OperationCode::Select, move(is_zero), move(high), move(value)); +} + +} // Anonymous namespace u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); @@ -28,29 +100,48 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) { } }(); - switch (opcode->get().GetId()) { + switch (const auto opid = opcode->get().GetId(); opid) { case OpCode::Id::SHR_C: case OpCode::Id::SHR_R: case OpCode::Id::SHR_IMM: { - if (instr.shr.wrap) { - op_b = Operation(OperationCode::UBitwiseAnd, std::move(op_b), Immediate(0x1f)); - } else { - op_b = Operation(OperationCode::IMax, std::move(op_b), Immediate(0)); - op_b = Operation(OperationCode::IMin, std::move(op_b), Immediate(31)); - } + op_b = instr.shr.wrap ? 
WrapShift(move(op_b)) : ClampShift(move(op_b)); Node value = SignedOperation(OperationCode::IArithmeticShiftRight, instr.shift.is_signed, - std::move(op_a), std::move(op_b)); + move(op_a), move(op_b)); SetInternalFlagsFromInteger(bb, value, instr.generates_cc); - SetRegister(bb, instr.gpr0, std::move(value)); + SetRegister(bb, instr.gpr0, move(value)); break; } case OpCode::Id::SHL_C: case OpCode::Id::SHL_R: case OpCode::Id::SHL_IMM: { - const Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b); + Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b); SetInternalFlagsFromInteger(bb, value, instr.generates_cc); - SetRegister(bb, instr.gpr0, value); + SetRegister(bb, instr.gpr0, move(value)); + break; + } + case OpCode::Id::SHF_RIGHT_R: + case OpCode::Id::SHF_RIGHT_IMM: + case OpCode::Id::SHF_LEFT_R: + case OpCode::Id::SHF_LEFT_IMM: { + UNIMPLEMENTED_IF(instr.generates_cc); + UNIMPLEMENTED_IF_MSG(instr.shf.xmode != ShfXmode::None, "xmode={}", + static_cast<int>(instr.shf.xmode.Value())); + + if (instr.is_b_imm) { + op_b = Immediate(static_cast<u32>(instr.shf.immediate)); + } + const s32 size = instr.shf.type == ShfType::Bits32 ? 32 : 64; + Node shift = instr.shf.wrap ? WrapShift(move(op_b), size) : ClampShift(move(op_b), size); + + Node negated_shift = Operation(OperationCode::INegate, shift); + Node low_shift = Operation(OperationCode::IAdd, move(negated_shift), Immediate(32)); + + const bool is_right = opid == OpCode::Id::SHF_RIGHT_R || opid == OpCode::Id::SHF_RIGHT_IMM; + Node value = (is_right ? ShiftRight : ShiftLeft)( + move(op_a), GetRegister(instr.gpr39), move(shift), move(low_shift), instr.shf.type); + + SetRegister(bb, instr.gpr0, move(value)); break; } default: diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index cd984f763..bee7d8cad 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -144,7 +144,8 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, {}, depth_compare, aoffi, {}, {}, {}, {}, component, element}; + MetaTexture meta{sampler, {}, depth_compare, aoffi, {}, {}, + {}, {}, component, element, {}}; values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); } @@ -161,16 +162,16 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { case OpCode::Id::TXD: { UNIMPLEMENTED_IF_MSG(instr.txd.UsesMiscMode(TextureMiscMode::AOFFI), "AOFFI is not implemented"); - UNIMPLEMENTED_IF_MSG(instr.txd.is_array != 0, "TXD Array is not implemented"); + const bool is_array = instr.txd.is_array != 0; u64 base_reg = instr.gpr8.Value(); const auto derivate_reg = instr.gpr20.Value(); const auto texture_type = instr.txd.texture_type.Value(); const auto coord_count = GetCoordCount(texture_type); - - const Sampler* sampler = is_bindless - ? GetBindlessSampler(base_reg, {{texture_type, false, false}}) - : GetSampler(instr.sampler, {{texture_type, false, false}}); + Node index_var{}; + const Sampler* sampler = + is_bindless ? 
GetBindlessSampler(base_reg, index_var, {{texture_type, is_array, false}}) : GetSampler(instr.sampler, {{texture_type, is_array, false}}); Node4 values; if (sampler == nullptr) { for (u32 element = 0; element < values.size(); ++element) { @@ -179,6 +180,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { WriteTexInstructionFloat(bb, instr, values); break; } + if (is_bindless) { base_reg++; } @@ -192,8 +194,15 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { derivates.push_back(GetRegister(derivate_reg + derivate + 1)); } + Node array_node = {}; + if (is_array) { + const Node info_reg = GetRegister(base_reg + coord_count); + array_node = BitfieldExtract(info_reg, 0, 16); + } + for (u32 element = 0; element < values.size(); ++element) { - MetaTexture meta{*sampler, {}, {}, {}, {}, derivates, {}, {}, {}, element}; + MetaTexture meta{*sampler, array_node, {}, {}, {}, derivates, + {}, {}, {}, element, index_var}; values[element] = Operation(OperationCode::TextureGradient, std::move(meta), coords); } @@ -208,8 +217,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { // TODO: The new commits on the texture refactor change the way samplers work. // Sadly, not all texture instructions specify the type of texture their sampler // uses. This must be fixed at a later stage. + Node index_var{}; const Sampler* sampler = - is_bindless ? GetBindlessSampler(instr.gpr8) : GetSampler(instr.sampler); + is_bindless ? GetBindlessSampler(instr.gpr8, index_var) : GetSampler(instr.sampler); if (sampler == nullptr) { u32 indexer = 0; @@ -233,7 +243,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { if (!instr.txq.IsComponentEnabled(element)) { continue; } - MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element}; + MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var}; const Node value = Operation(OperationCode::TextureQueryDimensions, meta, GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0))); @@ -259,8 +269,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { auto texture_type = instr.tmml.texture_type.Value(); const bool is_array = instr.tmml.array != 0; + Node index_var{}; const Sampler* sampler = - is_bindless ? GetBindlessSampler(instr.gpr20) : GetSampler(instr.sampler); + is_bindless ? 
GetBindlessSampler(instr.gpr20, index_var) : GetSampler(instr.sampler); if (sampler == nullptr) { u32 indexer = 0; @@ -302,7 +313,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { continue; } auto params = coords; - MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element}; + MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var}; const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); SetTemporary(bb, indexer++, value); } @@ -376,37 +387,65 @@ const Sampler* ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); return &used_samplers.emplace_back(next_index, offset, info.type, info.is_array, info.is_shadow, - info.is_buffer); + info.is_buffer, false); } -const Sampler* ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, +const Sampler* ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, Node& index_var, std::optional<SamplerInfo> sampler_info) { const Node sampler_register = GetRegister(reg); - const auto [base_sampler, buffer, offset] = - TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size())); - ASSERT(base_sampler != nullptr); - if (base_sampler == nullptr) { + const auto [base_node, tracked_sampler_info] = + TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); + ASSERT(base_node != nullptr); + if (base_node == nullptr) { return nullptr; } - const auto info = GetSamplerInfo(sampler_info, offset, buffer); + if (const auto bindless_sampler_info = + std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { + const u32 buffer = bindless_sampler_info->GetIndex(); + const u32 offset = bindless_sampler_info->GetOffset(); + const auto info = GetSamplerInfo(sampler_info, offset, buffer); + + // If this sampler has already been used, return the existing mapping. + const auto it = + std::find_if(used_samplers.begin(), used_samplers.end(), + [buffer = buffer, offset = offset](const Sampler& entry) { + return entry.GetBuffer() == buffer && entry.GetOffset() == offset; + }); + if (it != used_samplers.end()) { + ASSERT(it->IsBindless() && it->GetType() == info.type && + it->IsArray() == info.is_array && it->IsShadow() == info.is_shadow); + return &*it; + } - // If this sampler has already been used, return the existing mapping. - const auto it = - std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer = buffer, offset = offset](const Sampler& entry) { - return entry.GetBuffer() == buffer && entry.GetOffset() == offset; - }); - if (it != used_samplers.end()) { - ASSERT(it->IsBindless() && it->GetType() == info.type && it->IsArray() == info.is_array && - it->IsShadow() == info.is_shadow); - return &*it; - } + // Otherwise create a new mapping for this sampler + const auto next_index = static_cast<u32>(used_samplers.size()); + return &used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array, + info.is_shadow, info.is_buffer, false); + } else if (const auto array_sampler_info = + std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { + const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; + index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); + const auto info = GetSamplerInfo(sampler_info, base_offset); + + // If this sampler has already been used, return the existing mapping. 
+ const auto it = std::find_if( + used_samplers.begin(), used_samplers.end(), + [base_offset](const Sampler& entry) { return entry.GetOffset() == base_offset; }); + if (it != used_samplers.end()) { + ASSERT(!it->IsBindless() && it->GetType() == info.type && + it->IsArray() == info.is_array && it->IsShadow() == info.is_shadow && + it->IsBuffer() == info.is_buffer && it->IsIndexed()); + return &*it; + } - // Otherwise create a new mapping for this sampler - const auto next_index = static_cast<u32>(used_samplers.size()); - return &used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array, - info.is_shadow, info.is_buffer); + uses_indexed_samplers = true; + // Otherwise create a new mapping for this sampler + const auto next_index = static_cast<u32>(used_samplers.size()); + return &used_samplers.emplace_back(next_index, base_offset, info.type, info.is_array, + info.is_shadow, info.is_buffer, true); + } + return nullptr; } void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) { @@ -483,66 +522,53 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi, std::optional<Tegra::Shader::Register> bindless_reg) { - const auto is_array = static_cast<bool>(array); - const auto is_shadow = static_cast<bool>(depth_compare); + const bool is_array = array != nullptr; + const bool is_shadow = depth_compare != nullptr; const bool is_bindless = bindless_reg.has_value(); - UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) || - (texture_type == TextureType::TextureCube && is_array && is_shadow), - "This method is not supported."); + UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); + ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow, + "Illegal texture type"); const SamplerInfo info{texture_type, is_array, is_shadow, false}; - const Sampler* sampler = - is_bindless ? GetBindlessSampler(*bindless_reg, info) : GetSampler(instr.sampler, info); - Node4 values; - if (sampler == nullptr) { - for (u32 element = 0; element < values.size(); ++element) { - values[element] = Immediate(0); - } - return values; + Node index_var; + const Sampler* sampler = is_bindless ? GetBindlessSampler(*bindless_reg, index_var, info) + : GetSampler(instr.sampler, info); + if (!sampler) { + return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)}; } const bool lod_needed = process_mode == TextureProcessMode::LZ || process_mode == TextureProcessMode::LL || process_mode == TextureProcessMode::LLA; - - // LOD selection (either via bias or explicit textureLod) not supported in GL for - // sampler2DArrayShadow and samplerCubeArrayShadow. - const bool gl_lod_supported = - !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) || - (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow)); - - const OperationCode read_method = - (lod_needed && gl_lod_supported) ? OperationCode::TextureLod : OperationCode::Texture; - - UNIMPLEMENTED_IF(process_mode != TextureProcessMode::None && !gl_lod_supported); + const OperationCode opcode = lod_needed ? 
OperationCode::TextureLod : OperationCode::Texture; Node bias; Node lod; - if (process_mode != TextureProcessMode::None && gl_lod_supported) { - switch (process_mode) { - case TextureProcessMode::LZ: - lod = Immediate(0.0f); - break; - case TextureProcessMode::LB: - // If present, lod or bias are always stored in the register - // indexed by the gpr20 field with an offset depending on the - // usage of the other registers - bias = GetRegister(instr.gpr20.Value() + bias_offset); - break; - case TextureProcessMode::LL: - lod = GetRegister(instr.gpr20.Value() + bias_offset); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); - break; - } + switch (process_mode) { + case TextureProcessMode::None: + break; + case TextureProcessMode::LZ: + lod = Immediate(0.0f); + break; + case TextureProcessMode::LB: + // If present, lod or bias are always stored in the register indexed by the gpr20 field with + // an offset depending on the usage of the other registers. + bias = GetRegister(instr.gpr20.Value() + bias_offset); + break; + case TextureProcessMode::LL: + lod = GetRegister(instr.gpr20.Value() + bias_offset); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); + break; } + Node4 values; for (u32 element = 0; element < values.size(); ++element) { - auto copy_coords = coords; - MetaTexture meta{*sampler, array, depth_compare, aoffi, {}, {}, bias, lod, {}, element}; - values[element] = Operation(read_method, meta, std::move(copy_coords)); + MetaTexture meta{*sampler, array, depth_compare, aoffi, {}, {}, bias, + lod, {}, element, index_var}; + values[element] = Operation(opcode, meta, coords); } return values; @@ -589,7 +615,7 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type, aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, false); } - Node dc{}; + Node dc; if (depth_compare) { // Depth is always stored in the register signaled by gpr20 or in the next register if lod // or bias are used @@ -625,7 +651,7 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, const Node array = is_array ? GetRegister(array_register) : nullptr; - Node dc{}; + Node dc; if (depth_compare) { // Depth is always stored in the register signaled by gpr20 or in the next register if lod // or bias are used @@ -656,7 +682,8 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de u64 parameter_register = instr.gpr20.Value(); const SamplerInfo info{texture_type, is_array, depth_compare, false}; - const Sampler* sampler = is_bindless ? GetBindlessSampler(parameter_register++, info) + Node index_var{}; + const Sampler* sampler = is_bindless ? 
GetBindlessSampler(parameter_register++, index_var, info) : GetSampler(instr.sampler, info); Node4 values; if (sampler == nullptr) { @@ -685,7 +712,8 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; MetaTexture meta{ - *sampler, GetRegister(array_register), dc, aoffi, ptp, {}, {}, {}, component, element}; + *sampler, GetRegister(array_register), dc, aoffi, ptp, {}, {}, {}, component, element, + index_var}; values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); } @@ -718,7 +746,7 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element}; + MetaTexture meta{sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element, {}}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } @@ -768,7 +796,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array, {}, {}, {}, {}, {}, lod, {}, element}; + MetaTexture meta{sampler, array, {}, {}, {}, {}, {}, lod, {}, element, {}}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } return values; diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 075c7d07c..a0a7b9111 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -162,7 +162,7 @@ enum class OperationCode { AtomicImageXor, /// (MetaImage, int[N] coords) -> void AtomicImageExchange, /// (MetaImage, int[N] coords) -> void - UAtomicAdd, /// (smem, uint) -> uint + AtomicAdd, /// (memory, {u}int) -> {u}int Branch, /// (uint branch_target) -> void BranchIndirect, /// (uint branch_target) -> void @@ -212,6 +212,7 @@ enum class MetaStackClass { class OperationNode; class ConditionalNode; class GprNode; +class CustomVarNode; class ImmediateNode; class InternalFlagNode; class PredicateNode; @@ -223,26 +224,32 @@ class SmemNode; class GmemNode; class CommentNode; -using NodeData = std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, +using NodeData = std::variant<OperationNode, ConditionalNode, GprNode, CustomVarNode, ImmediateNode, InternalFlagNode, PredicateNode, AbufNode, PatchNode, CbufNode, LmemNode, SmemNode, GmemNode, CommentNode>; using Node = std::shared_ptr<NodeData>; using Node4 = std::array<Node, 4>; using NodeBlock = std::vector<Node>; +class BindlessSamplerNode; +class ArraySamplerNode; + +using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>; +using TrackSampler = std::shared_ptr<TrackSamplerData>; + class Sampler { public: /// This constructor is for bound samplers constexpr explicit Sampler(u32 index, u32 offset, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow, bool is_buffer) + bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, - is_buffer{is_buffer} {} + is_buffer{is_buffer}, is_indexed{is_indexed} {} /// This constructor is for bindless samplers constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow, bool is_buffer) + bool is_array, bool is_shadow, 
bool is_buffer, bool is_indexed) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, - is_shadow{is_shadow}, is_buffer{is_buffer} {} + is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} constexpr u32 GetIndex() const { return index; @@ -276,16 +283,72 @@ public: return is_bindless; } + constexpr bool IsIndexed() const { + return is_indexed; + } + + constexpr u32 Size() const { + return size; + } + + constexpr void SetSize(u32 new_size) { + size = new_size; + } + private: u32 index{}; ///< Emulated index given for this sampler. u32 offset{}; ///< Offset in the const buffer from where the sampler is being read. u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers). + u32 size{}; ///< Size of the sampler if indexed. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc.) bool is_array{}; ///< Whether the texture is being sampled as an array texture or not. bool is_shadow{}; ///< Whether the texture is being sampled as a depth texture or not. bool is_buffer{}; ///< Whether the texture is a texture buffer without sampler. bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not. + bool is_indexed{}; ///< Whether this sampler is an indexed array of textures. +}; + +/// Represents a tracked array sampler into a direct const buffer +class ArraySamplerNode final { +public: + explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) + : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} + + constexpr u32 GetIndex() const { + return index; + } + + constexpr u32 GetBaseOffset() const { + return base_offset; + } + + constexpr u32 GetIndexVar() const { + return bindless_var; + } + +private: + u32 index; + u32 base_offset; + u32 bindless_var; +}; + +/// Represents a tracked bindless sampler into a direct const buffer +class BindlessSamplerNode final { +public: + explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} + + constexpr u32 GetIndex() const { + return index; + } + + constexpr u32 GetOffset() const { + return offset; + } + +private: + u32 index; + u32 offset; }; class Image final { @@ -380,8 +443,9 @@ struct MetaTexture { std::vector<Node> derivates; Node bias; Node lod; - Node component{}; + Node component; u32 element{}; + Node index; }; struct MetaImage { @@ -488,6 +552,19 @@ private: Tegra::Shader::Register index{}; }; +/// A custom variable +class CustomVarNode final { +public: + explicit constexpr CustomVarNode(u32 index) : index{index} {} + + constexpr u32 GetIndex() const { + return index; + } + +private: + u32 index{}; +}; + /// A 32-bit value that represents an immediate value class ImmediateNode final { public: diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 0c2aa749b..11231bbea 100644 --- a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -45,6 +45,12 @@ Node MakeNode(Args&&... args) { return std::make_shared<NodeData>(T(std::forward<Args>(args)...)); } +template <typename T, typename... Args> +TrackSampler MakeTrackSampler(Args&&... args) { + static_assert(std::is_convertible_v<T, TrackSamplerData>); + return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...)); +} + template <typename... Args> Node Operation(OperationCode code, Args&&... 
args) { if constexpr (sizeof...(args) == 0) { diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 31eecb3f4..3a5d280a9 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -27,6 +27,7 @@ ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSet ConstBufferLocker& locker) : program_code{program_code}, main_offset{main_offset}, settings{settings}, locker{locker} { Decode(); + PostDecode(); } ShaderIR::~ShaderIR() = default; @@ -38,6 +39,10 @@ Node ShaderIR::GetRegister(Register reg) { return MakeNode<GprNode>(reg); } +Node ShaderIR::GetCustomVariable(u32 id) { + return MakeNode<CustomVarNode>(id); +} + Node ShaderIR::GetImmediate19(Instruction instr) { return Immediate(instr.alu.GetImm20_19()); } @@ -452,4 +457,8 @@ std::size_t ShaderIR::DeclareAmend(Node new_amend) { return id; } +u32 ShaderIR::NewCustomVariable() { + return num_custom_variables++; +} + } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index ba1db4c11..b0851c3be 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -180,6 +180,10 @@ public: return amend_code[index]; } + u32 GetNumCustomVariables() const { + return num_custom_variables; + } + private: friend class ASTDecoder; @@ -191,6 +195,7 @@ private: }; void Decode(); + void PostDecode(); NodeBlock DecodeRange(u32 begin, u32 end); void DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end); @@ -235,6 +240,8 @@ private: /// Generates a node for a passed register. Node GetRegister(Tegra::Shader::Register reg); + /// Generates a node for a custom variable + Node GetCustomVariable(u32 id); /// Generates a node representing a 19-bit immediate value Node GetImmediate19(Tegra::Shader::Instruction instr); /// Generates a node representing a 32-bit immediate value @@ -321,7 +328,7 @@ private: std::optional<SamplerInfo> sampler_info = std::nullopt); /// Accesses a texture sampler for a bindless texture. - const Sampler* GetBindlessSampler(Tegra::Shader::Register reg, + const Sampler* GetBindlessSampler(Tegra::Shader::Register reg, Node& index_var, std::optional<SamplerInfo> sampler_info = std::nullopt); /// Accesses an image. @@ -387,6 +394,9 @@ private: std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; + std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor); + std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, @@ -399,6 +409,8 @@ private: /// Register new amending code and obtain the reference id. 
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index 165c79330..face8c943 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -8,6 +8,7 @@
 #include "common/common_types.h"
 
 #include "video_core/shader/node.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -35,8 +36,113 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
     }
     return {};
 }
+
+std::optional<std::pair<Node, Node>> DecoupleIndirectRead(const OperationNode& operation) {
+    if (operation.GetCode() != OperationCode::UAdd) {
+        return std::nullopt;
+    }
+    Node gpr;
+    Node offset;
+    ASSERT(operation.GetOperandsCount() == 2);
+    for (std::size_t i = 0; i < operation.GetOperandsCount(); i++) {
+        Node operand = operation[i];
+        if (std::holds_alternative<ImmediateNode>(*operand)) {
+            offset = operation[i];
+        } else if (std::holds_alternative<GprNode>(*operand)) {
+            gpr = operation[i];
+        }
+    }
+    if (offset && gpr) {
+        return std::make_pair(gpr, offset);
+    }
+    return std::nullopt;
+}
+
+bool AmendNodeCv(std::size_t amend_index, Node node) {
+    if (const auto operation = std::get_if<OperationNode>(&*node)) {
+        operation->SetAmendIndex(amend_index);
+        return true;
+    } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+        conditional->SetAmendIndex(amend_index);
+        return true;
+    }
+    return false;
+}
+
 } // Anonymous namespace
 
+std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
+                                                              s64 cursor) {
+    if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
+        // Constant buffer found, test if it's an immediate
+        const auto offset = cbuf->GetOffset();
+        if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
+            auto track =
+                MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
+            return {tracked, track};
+        } else if (const auto operation = std::get_if<OperationNode>(&*offset)) {
+            auto bound_buffer = locker.ObtainBoundBuffer();
+            if (!bound_buffer) {
+                return {};
+            }
+            if (*bound_buffer != cbuf->GetIndex()) {
+                return {};
+            }
+            auto pair = DecoupleIndirectRead(*operation);
+            if (!pair) {
+                return {};
+            }
+            auto [gpr, base_offset] = *pair;
+            const auto offset_imm = std::get_if<ImmediateNode>(&*base_offset);
+            auto gpu_driver = locker.AccessGuestDriverProfile();
+            if (gpu_driver == nullptr) {
+                return {};
+            }
+            const u32 bindless_cv = NewCustomVariable();
+            const Node op = Operation(OperationCode::UDiv, NO_PRECISE, gpr,
+                                      Immediate(gpu_driver->GetTextureHandlerSize()));
+
+            const Node cv_node = GetCustomVariable(bindless_cv);
+            Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
+            const std::size_t amend_index = DeclareAmend(amend_op);
+            AmendNodeCv(amend_index, code[cursor]);
+            // TODO: Implement the bindless index custom variable
+            auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(),
+                                                            offset_imm->GetValue(), bindless_cv);
+            return {tracked, track};
+        }
+        return {};
+    }
+    if (const auto gpr = std::get_if<GprNode>(&*tracked)) {
+        if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
+            return {};
+        }
+        // Reduce the cursor by one to avoid infinite loops when the instruction sets the same
+        // register that it uses as an operand
+        const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
+        if (!source) {
+            return {};
+        }
+        return TrackBindlessSampler(source, code, new_cursor);
+    }
+    if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
+        for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
+            if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor);
+                std::get<0>(found)) {
+                // Cbuf found in operand.
+                return found;
+            }
+        }
+        return {};
+    }
+    if (const auto conditional = std::get_if<ConditionalNode>(&*tracked)) {
+        const auto& conditional_code = conditional->GetCode();
+        return TrackBindlessSampler(tracked, conditional_code,
+                                    static_cast<s64>(conditional_code.size()));
+    }
+    return {};
+}
+
 std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
                                                s64 cursor) const {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
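The indirect path above assumes the const buffer address is a UAdd of a register and an immediate base, with the register presumably computed by the guest as index * texture_handler_size; the injected UDiv amend divides that back down to recover the array index. A tiny standalone model of the recovery, with an assumed handler size (the real value comes from GuestDriverProfile::GetTextureHandlerSize):

#include <cassert>
#include <cstdint>
#include <optional>

// Model of the UDiv amend: the shader read its handle at
// cbuf_offset = base_offset + index * texture_handler_size,
// so the index falls out of dividing the dynamic part.
std::optional<std::uint32_t> RecoverSamplerIndex(std::uint32_t cbuf_offset,
                                                 std::uint32_t base_offset,
                                                 std::uint32_t texture_handler_size) {
    if (texture_handler_size == 0 || cbuf_offset < base_offset) {
        return std::nullopt;
    }
    return (cbuf_offset - base_offset) / texture_handler_size;
}

int main() {
    // Example values only: handles start at 0x20 and each handle is 4 bytes wide.
    assert(RecoverSamplerIndex(0x28, 0x20, 4) == 2u);
    return 0;
}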
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 829268b4c..84469b7ba 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -135,7 +135,7 @@ std::vector<CopyParams> SurfaceBaseImpl::BreakDownLayered(const SurfaceParams& i
         for (u32 level = 0; level < mipmaps; level++) {
             const u32 width = SurfaceParams::IntersectWidth(params, in_params, level, level);
             const u32 height = SurfaceParams::IntersectHeight(params, in_params, level, level);
-            result.emplace_back(width, height, layer, level);
+            result.emplace_back(0, 0, layer, 0, 0, layer, level, level, width, height, 1);
         }
     }
     return result;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index f4c015635..0d105d386 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -721,7 +721,6 @@ private:
     std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const CacheAddr cache_addr,
                                           const SurfaceParams& params, bool preserve_contents,
                                           bool is_render) {
-        // Step 1
         // Check Level 1 Cache for a fast structural match. If the candidate surface
         // matches at a certain level we are pretty much done.
@@ -733,14 +732,18 @@ private:
             return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                   topological_result);
         }
+
         const auto struct_result = current_surface->MatchesStructure(params);
-        if (struct_result != MatchStructureResult::None &&
-            (params.target != SurfaceTarget::Texture3D ||
-             current_surface->MatchTarget(params.target))) {
-            if (struct_result == MatchStructureResult::FullMatch) {
-                return ManageStructuralMatch(current_surface, params, is_render);
-            } else {
-                return RebuildSurface(current_surface, params, is_render);
+        if (struct_result != MatchStructureResult::None) {
+            const auto& old_params = current_surface->GetSurfaceParams();
+            const bool not_3d = params.target != SurfaceTarget::Texture3D &&
+                                old_params.target != SurfaceTarget::Texture3D;
+            if (not_3d || current_surface->MatchTarget(params.target)) {
+                if (struct_result == MatchStructureResult::FullMatch) {
+                    return ManageStructuralMatch(current_surface, params, is_render);
+                } else {
+                    return RebuildSurface(current_surface, params, is_render);
+                }
             }
         }
     }
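The reworked match path now only trusts a structural match when neither the incoming nor the cached surface is 3D, or when the targets agree. A condensed model of that guard, approximating MatchTarget as exact target equality (the real check is looser):

enum class SurfaceTarget { Texture2D, Texture2DArray, Texture3D };

// Mirrors the not_3d / MatchTarget guard in GetSurface above.
bool CanUseStructuralMatch(SurfaceTarget new_target, SurfaceTarget old_target) {
    const bool not_3d = new_target != SurfaceTarget::Texture3D &&
                        old_target != SurfaceTarget::Texture3D;
    return not_3d || new_target == old_target;
}

Checking both the new and the old target closes the case where a 2D request aliased a cached 3D surface and was previously accepted because only params.target was inspected.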
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index 8e947394c..a5f81a8a0 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -3,19 +3,32 @@
 // Refer to the license.txt file included.
 
 #include <memory>
+#include "common/logging/log.h"
 #include "core/core.h"
 #include "core/settings.h"
 #include "video_core/gpu_asynch.h"
 #include "video_core/gpu_synch.h"
 #include "video_core/renderer_base.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
+#ifdef HAS_VULKAN
+#include "video_core/renderer_vulkan/renderer_vulkan.h"
+#endif
 #include "video_core/video_core.h"
 
 namespace VideoCore {
 
 std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window,
                                              Core::System& system) {
-    return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system);
+    switch (Settings::values.renderer_backend) {
+    case Settings::RendererBackend::OpenGL:
+        return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system);
+#ifdef HAS_VULKAN
+    case Settings::RendererBackend::Vulkan:
+        return std::make_unique<Vulkan::RendererVulkan>(emu_window, system);
+#endif
+    default:
+        return nullptr;
+    }
 }
 
 std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system) {
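Since the switch now falls through to nullptr for an unknown backend, or for Vulkan in a build without HAS_VULKAN, callers presumably have to check the result before use. A hypothetical call site (the log class and message are illustrative, not taken from this diff):

auto renderer = VideoCore::CreateRenderer(emu_window, system);
if (renderer == nullptr) {
    LOG_CRITICAL(Render, "Selected renderer backend is not available in this build");
    return false;
}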
