diff options
Diffstat (limited to 'src/video_core')
82 files changed, 4462 insertions, 1440 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 2bf8d68ce..21c46a567 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -3,6 +3,8 @@ add_library(video_core STATIC buffer_cache/buffer_cache.h buffer_cache/map_interval.cpp buffer_cache/map_interval.h + compatible_formats.cpp + compatible_formats.h dirty_flags.cpp dirty_flags.h dma_pusher.cpp @@ -27,6 +29,8 @@ add_library(video_core STATIC engines/shader_type.h macro/macro.cpp macro/macro.h + macro/macro_hle.cpp + macro/macro_hle.h macro/macro_interpreter.cpp macro/macro_interpreter.h macro/macro_jit_x64.cpp @@ -49,11 +53,11 @@ add_library(video_core STATIC query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h - rasterizer_cache.cpp - rasterizer_cache.h rasterizer_interface.h renderer_base.cpp renderer_base.h + renderer_opengl/gl_arb_decompiler.cpp + renderer_opengl/gl_arb_decompiler.h renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_device.cpp @@ -93,6 +97,7 @@ add_library(video_core STATIC renderer_opengl/utils.h sampler_cache.cpp sampler_cache.h + shader_cache.h shader/decode/arithmetic.cpp shader/decode/arithmetic_immediate.cpp shader/decode/bfe.cpp diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h index e35ee0b67..e64170e66 100644 --- a/src/video_core/buffer_cache/buffer_block.h +++ b/src/video_core/buffer_cache/buffer_block.h @@ -15,48 +15,47 @@ namespace VideoCommon { class BufferBlock { public: - bool Overlaps(const VAddr start, const VAddr end) const { + bool Overlaps(VAddr start, VAddr end) const { return (cpu_addr < end) && (cpu_addr_end > start); } - bool IsInside(const VAddr other_start, const VAddr other_end) const { + bool IsInside(VAddr other_start, VAddr other_end) const { return cpu_addr <= other_start && other_end <= cpu_addr_end; } - std::size_t GetOffset(const VAddr in_addr) { + std::size_t Offset(VAddr in_addr) const { return static_cast<std::size_t>(in_addr - cpu_addr); } - VAddr GetCpuAddr() const { + VAddr CpuAddr() const { return cpu_addr; } - VAddr GetCpuAddrEnd() const { + VAddr CpuAddrEnd() const { return cpu_addr_end; } - void SetCpuAddr(const VAddr new_addr) { + void SetCpuAddr(VAddr new_addr) { cpu_addr = new_addr; cpu_addr_end = new_addr + size; } - std::size_t GetSize() const { + std::size_t Size() const { return size; } - void SetEpoch(u64 new_epoch) { - epoch = new_epoch; + u64 Epoch() const { + return epoch; } - u64 GetEpoch() { - return epoch; + void SetEpoch(u64 new_epoch) { + epoch = new_epoch; } protected: - explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} { - SetCpuAddr(cpu_addr); + explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} { + SetCpuAddr(cpu_addr_); } - ~BufferBlock() = default; private: VAddr cpu_addr{}; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b88fce2cd..cf8bdd021 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -30,23 +30,31 @@ namespace VideoCommon { -template <typename OwnerBuffer, typename BufferType, typename StreamBuffer> +template <typename Buffer, typename BufferType, typename StreamBuffer> class BufferCache { using IntervalSet = boost::icl::interval_set<VAddr>; using IntervalType = typename IntervalSet::interval_type; using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; + static constexpr u64 WRITE_PAGE_BIT = 11; + static constexpr u64 BLOCK_PAGE_BITS = 21; + static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; + public: - using BufferInfo = std::pair<BufferType, u64>; + struct BufferInfo { + BufferType handle; + u64 offset; + u64 address; + }; BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, bool is_written = false, bool use_fast_cbuf = false) { std::lock_guard lock{mutex}; - const auto& memory_manager = system.GPU().MemoryManager(); + auto& memory_manager = system.GPU().MemoryManager(); const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); if (!cpu_addr_opt) { - return {GetEmptyBuffer(size), 0}; + return GetEmptyBuffer(size); } const VAddr cpu_addr = *cpu_addr_opt; @@ -55,7 +63,6 @@ public: constexpr std::size_t max_stream_size = 0x800; if (use_fast_cbuf || size < max_stream_size) { if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { - auto& memory_manager = system.GPU().MemoryManager(); const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size); if (use_fast_cbuf) { u8* dest; @@ -82,10 +89,10 @@ public: } } - OwnerBuffer block = GetBlock(cpu_addr, size); + Buffer* const block = GetBlock(cpu_addr, size); MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size); if (!map) { - return {GetEmptyBuffer(size), 0}; + return GetEmptyBuffer(size); } if (is_written) { map->MarkAsModified(true, GetModifiedTicks()); @@ -98,7 +105,7 @@ public: } } - return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))}; + return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()}; } /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. @@ -110,31 +117,37 @@ public: }); } - void Map(std::size_t max_size) { + /// Prepares the buffer cache for data uploading + /// @param max_size Maximum number of bytes that will be uploaded + /// @return True when a stream buffer invalidation was required, false otherwise + bool Map(std::size_t max_size) { std::lock_guard lock{mutex}; + bool invalidated; std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); buffer_offset = buffer_offset_base; + + return invalidated; } - /// Finishes the upload stream, returns true on bindings invalidation. - bool Unmap() { + /// Finishes the upload stream + void Unmap() { std::lock_guard lock{mutex}; - stream_buffer->Unmap(buffer_offset - buffer_offset_base); - return std::exchange(invalidated, false); } + /// Function called at the end of each frame, inteded for deferred operations void TickFrame() { ++epoch; + while (!pending_destruction.empty()) { // Delay at least 4 frames before destruction. // This is due to triple buffering happening on some drivers. static constexpr u64 epochs_to_destroy = 5; - if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) { + if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { break; } - pending_destruction.pop_front(); + pending_destruction.pop(); } } @@ -245,28 +258,16 @@ public: committed_flushes.pop_front(); } - virtual BufferType GetEmptyBuffer(std::size_t size) = 0; + virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; protected: explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, std::unique_ptr<StreamBuffer> stream_buffer) - : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)}, - stream_buffer_handle{this->stream_buffer->GetHandle()} {} + : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {} ~BufferCache() = default; - virtual BufferType ToHandle(const OwnerBuffer& storage) = 0; - - virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0; - - virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, - const u8* data) = 0; - - virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, - u8* data) = 0; - - virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) = 0; + virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { return {}; @@ -321,7 +322,7 @@ protected: } private: - MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr, + MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); if (overlaps.empty()) { @@ -329,11 +330,11 @@ private: const VAddr cpu_addr_end = cpu_addr + size; if (memory_manager.IsGranularRange(gpu_addr, size)) { u8* host_ptr = memory_manager.GetPointer(gpu_addr); - UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr); + block->Upload(block->Offset(cpu_addr), size, host_ptr); } else { staging_buffer.resize(size); memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data()); + block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); } return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); } @@ -376,7 +377,7 @@ private: return map; } - void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end, + void UpdateBlock(const Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { const IntervalType base_interval{start, end}; IntervalSet interval_set{}; @@ -386,13 +387,13 @@ private: interval_set.subtract(subtract); } for (auto& interval : interval_set) { - std::size_t size = interval.upper() - interval.lower(); - if (size > 0) { - staging_buffer.resize(size); - system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); - UploadBlockData(block, block->GetOffset(interval.lower()), size, - staging_buffer.data()); + const std::size_t size = interval.upper() - interval.lower(); + if (size == 0) { + continue; } + staging_buffer.resize(size); + system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); + block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); } } @@ -422,10 +423,14 @@ private: } void FlushMap(MapInterval* map) { + const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); + ASSERT_OR_EXECUTE(it != blocks.end(), return;); + + std::shared_ptr<Buffer> block = it->second; + const std::size_t size = map->end - map->start; - OwnerBuffer block = blocks[map->start >> block_page_bits]; staging_buffer.resize(size); - DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data()); + block->Download(block->Offset(map->start), size, staging_buffer.data()); system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size); map->MarkAsModified(false, 0); } @@ -438,7 +443,7 @@ private: buffer_ptr += size; buffer_offset += size; - return {stream_buffer_handle, uploaded_offset}; + return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()}; } void AlignBuffer(std::size_t alignment) { @@ -448,97 +453,89 @@ private: buffer_offset = offset_aligned; } - OwnerBuffer EnlargeBlock(OwnerBuffer buffer) { - const std::size_t old_size = buffer->GetSize(); - const std::size_t new_size = old_size + block_page_size; - const VAddr cpu_addr = buffer->GetCpuAddr(); - OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size); - CopyBlock(buffer, new_buffer, 0, 0, old_size); - buffer->SetEpoch(epoch); - pending_destruction.push_back(buffer); + std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { + const std::size_t old_size = buffer->Size(); + const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; + const VAddr cpu_addr = buffer->CpuAddr(); + std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); + new_buffer->CopyFrom(*buffer, 0, 0, old_size); + QueueDestruction(std::move(buffer)); + const VAddr cpu_addr_end = cpu_addr + new_size - 1; - u64 page_start = cpu_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { - blocks[page_start] = new_buffer; - ++page_start; + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { + blocks.insert_or_assign(page_start, new_buffer); } + return new_buffer; } - OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) { - const std::size_t size_1 = first->GetSize(); - const std::size_t size_2 = second->GetSize(); - const VAddr first_addr = first->GetCpuAddr(); - const VAddr second_addr = second->GetCpuAddr(); + std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, + std::shared_ptr<Buffer> second) { + const std::size_t size_1 = first->Size(); + const std::size_t size_2 = second->Size(); + const VAddr first_addr = first->CpuAddr(); + const VAddr second_addr = second->CpuAddr(); const VAddr new_addr = std::min(first_addr, second_addr); const std::size_t new_size = size_1 + size_2; - OwnerBuffer new_buffer = CreateBlock(new_addr, new_size); - CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1); - CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2); - first->SetEpoch(epoch); - second->SetEpoch(epoch); - pending_destruction.push_back(first); - pending_destruction.push_back(second); + + std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size); + new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); + new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); + QueueDestruction(std::move(first)); + QueueDestruction(std::move(second)); + const VAddr cpu_addr_end = new_addr + new_size - 1; - u64 page_start = new_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { - blocks[page_start] = new_buffer; - ++page_start; + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { + blocks.insert_or_assign(page_start, new_buffer); } return new_buffer; } - OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) { - OwnerBuffer found; + Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { + std::shared_ptr<Buffer> found; + const VAddr cpu_addr_end = cpu_addr + size - 1; - u64 page_start = cpu_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { auto it = blocks.find(page_start); if (it == blocks.end()) { if (found) { found = EnlargeBlock(found); - } else { - const VAddr start_addr = (page_start << block_page_bits); - found = CreateBlock(start_addr, block_page_size); - blocks[page_start] = found; - } - } else { - if (found) { - if (found == it->second) { - ++page_start; - continue; - } - found = MergeBlocks(found, it->second); - } else { - found = it->second; + continue; } + const VAddr start_addr = page_start << BLOCK_PAGE_BITS; + found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); + blocks.insert_or_assign(page_start, found); + continue; + } + if (!found) { + found = it->second; + continue; + } + if (found != it->second) { + found = MergeBlocks(std::move(found), it->second); } - ++page_start; } - return found; + return found.get(); } - void MarkRegionAsWritten(const VAddr start, const VAddr end) { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + void MarkRegionAsWritten(VAddr start, VAddr end) { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { auto it = written_pages.find(page_start); if (it != written_pages.end()) { it->second = it->second + 1; } else { - written_pages[page_start] = 1; + written_pages.insert_or_assign(page_start, 1); } - ++page_start; } } - void UnmarkRegionAsWritten(const VAddr start, const VAddr end) { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + void UnmarkRegionAsWritten(VAddr start, VAddr end) { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { auto it = written_pages.find(page_start); if (it != written_pages.end()) { if (it->second > 1) { @@ -547,22 +544,24 @@ private: written_pages.erase(it); } } - ++page_start; } } - bool IsRegionWritten(const VAddr start, const VAddr end) const { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + bool IsRegionWritten(VAddr start, VAddr end) const { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { if (written_pages.count(page_start) > 0) { return true; } - ++page_start; } return false; } + void QueueDestruction(std::shared_ptr<Buffer> buffer) { + buffer->SetEpoch(epoch); + pending_destruction.push(std::move(buffer)); + } + void MarkForAsyncFlush(MapInterval* map) { if (!uncommitted_flushes) { uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>(); @@ -574,9 +573,7 @@ private: Core::System& system; std::unique_ptr<StreamBuffer> stream_buffer; - BufferType stream_buffer_handle{}; - - bool invalidated = false; + BufferType stream_buffer_handle; u8* buffer_ptr = nullptr; u64 buffer_offset = 0; @@ -586,18 +583,15 @@ private: boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>> mapped_addresses; - static constexpr u64 write_page_bit = 11; std::unordered_map<u64, u32> written_pages; + std::unordered_map<u64, std::shared_ptr<Buffer>> blocks; - static constexpr u64 block_page_bits = 21; - static constexpr u64 block_page_size = 1ULL << block_page_bits; - std::unordered_map<u64, OwnerBuffer> blocks; - - std::list<OwnerBuffer> pending_destruction; + std::queue<std::shared_ptr<Buffer>> pending_destruction; u64 epoch = 0; u64 modified_ticks = 0; std::vector<u8> staging_buffer; + std::list<MapInterval*> marked_for_unregister; std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes; diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp new file mode 100644 index 000000000..6c426b035 --- /dev/null +++ b/src/video_core/compatible_formats.cpp @@ -0,0 +1,162 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <bitset> +#include <cstddef> + +#include "video_core/compatible_formats.h" +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +namespace { + +// Compatibility table taken from Table 3.X.2 in: +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt + +constexpr std::array VIEW_CLASS_128_BITS = { + PixelFormat::RGBA32F, + PixelFormat::RGBA32UI, +}; +// Missing formats: +// PixelFormat::RGBA32I + +constexpr std::array VIEW_CLASS_96_BITS = { + PixelFormat::RGB32F, +}; +// Missing formats: +// PixelFormat::RGB32UI, +// PixelFormat::RGB32I, + +constexpr std::array VIEW_CLASS_64_BITS = { + PixelFormat::RGBA16F, PixelFormat::RG32F, PixelFormat::RGBA16UI, PixelFormat::RG32UI, + PixelFormat::RGBA16U, PixelFormat::RGBA16F, PixelFormat::RGBA16S, +}; +// Missing formats: +// PixelFormat::RGBA16I +// PixelFormat::RG32I + +// TODO: How should we handle 48 bits? + +constexpr std::array VIEW_CLASS_32_BITS = { + PixelFormat::RG16F, PixelFormat::R11FG11FB10F, PixelFormat::R32F, + PixelFormat::A2B10G10R10U, PixelFormat::RG16UI, PixelFormat::R32UI, + PixelFormat::RG16I, PixelFormat::R32I, PixelFormat::ABGR8U, + PixelFormat::RG16, PixelFormat::ABGR8S, PixelFormat::RG16S, + PixelFormat::RGBA8_SRGB, PixelFormat::E5B9G9R9F, PixelFormat::BGRA8, + PixelFormat::BGRA8_SRGB, +}; +// Missing formats: +// PixelFormat::RGBA8UI +// PixelFormat::RGBA8I +// PixelFormat::RGB10_A2_UI + +// TODO: How should we handle 24 bits? + +constexpr std::array VIEW_CLASS_16_BITS = { + PixelFormat::R16F, PixelFormat::RG8UI, PixelFormat::R16UI, PixelFormat::R16I, + PixelFormat::RG8U, PixelFormat::R16U, PixelFormat::RG8S, PixelFormat::R16S, +}; +// Missing formats: +// PixelFormat::RG8I + +constexpr std::array VIEW_CLASS_8_BITS = { + PixelFormat::R8UI, + PixelFormat::R8U, +}; +// Missing formats: +// PixelFormat::R8I +// PixelFormat::R8S + +constexpr std::array VIEW_CLASS_RGTC1_RED = { + PixelFormat::DXN1, +}; +// Missing formats: +// COMPRESSED_SIGNED_RED_RGTC1 + +constexpr std::array VIEW_CLASS_RGTC2_RG = { + PixelFormat::DXN2UNORM, + PixelFormat::DXN2SNORM, +}; + +constexpr std::array VIEW_CLASS_BPTC_UNORM = { + PixelFormat::BC7U, + PixelFormat::BC7U_SRGB, +}; + +constexpr std::array VIEW_CLASS_BPTC_FLOAT = { + PixelFormat::BC6H_SF16, + PixelFormat::BC6H_UF16, +}; + +// Compatibility table taken from Table 4.X.1 in: +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt + +constexpr std::array COPY_CLASS_128_BITS = { + PixelFormat::RGBA32UI, PixelFormat::RGBA32F, PixelFormat::DXT23, + PixelFormat::DXT23_SRGB, PixelFormat::DXT45, PixelFormat::DXT45_SRGB, + PixelFormat::DXN2SNORM, PixelFormat::BC7U, PixelFormat::BC7U_SRGB, + PixelFormat::BC6H_SF16, PixelFormat::BC6H_UF16, +}; +// Missing formats: +// PixelFormat::RGBA32I +// COMPRESSED_RG_RGTC2 + +constexpr std::array COPY_CLASS_64_BITS = { + PixelFormat::RGBA16F, PixelFormat::RG32F, PixelFormat::RGBA16UI, PixelFormat::RG32UI, + PixelFormat::RGBA16U, PixelFormat::RGBA16S, PixelFormat::DXT1_SRGB, PixelFormat::DXT1, + +}; +// Missing formats: +// PixelFormat::RGBA16I +// PixelFormat::RG32I, +// COMPRESSED_RGB_S3TC_DXT1_EXT +// COMPRESSED_SRGB_S3TC_DXT1_EXT +// COMPRESSED_RGBA_S3TC_DXT1_EXT +// COMPRESSED_SIGNED_RED_RGTC1 + +void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) { + compatiblity[format_a][format_b] = true; + compatiblity[format_b][format_a] = true; +} + +void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) { + Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b)); +} + +template <typename Range> +void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) { + for (auto it_a = range.begin(); it_a != range.end(); ++it_a) { + for (auto it_b = it_a; it_b != range.end(); ++it_b) { + Enable(compatibility, *it_a, *it_b); + } + } +} + +} // Anonymous namespace + +FormatCompatibility::FormatCompatibility() { + for (size_t i = 0; i < MaxPixelFormat; ++i) { + // Identity is allowed + Enable(view, i, i); + } + + EnableRange(view, VIEW_CLASS_128_BITS); + EnableRange(view, VIEW_CLASS_96_BITS); + EnableRange(view, VIEW_CLASS_64_BITS); + EnableRange(view, VIEW_CLASS_32_BITS); + EnableRange(view, VIEW_CLASS_16_BITS); + EnableRange(view, VIEW_CLASS_8_BITS); + EnableRange(view, VIEW_CLASS_RGTC1_RED); + EnableRange(view, VIEW_CLASS_RGTC2_RG); + EnableRange(view, VIEW_CLASS_BPTC_UNORM); + EnableRange(view, VIEW_CLASS_BPTC_FLOAT); + + copy = view; + EnableRange(copy, COPY_CLASS_128_BITS); + EnableRange(copy, COPY_CLASS_64_BITS); +} + +} // namespace VideoCore::Surface diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h new file mode 100644 index 000000000..d1082566d --- /dev/null +++ b/src/video_core/compatible_formats.h @@ -0,0 +1,32 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <bitset> +#include <cstddef> + +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +class FormatCompatibility { +public: + using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>; + + explicit FormatCompatibility(); + + bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept { + return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; + } + + bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept { + return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; + } + +private: + Table view; + Table copy; +}; + +} // namespace VideoCore::Surface diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index ebe139504..f46e81bb7 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -93,6 +93,7 @@ public: virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0; virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const = 0; + virtual SamplerDescriptor AccessSampler(u32 handle) const = 0; virtual u32 GetBoundBuffer() const = 0; virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index f6237fc6a..a82b06a38 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con ASSERT(stage == ShaderType::Compute); const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; + return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 18ceedfaf..b7f668d88 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -219,6 +219,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index e46b153f9..c01436295 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -128,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size()); // Execute the current macro. - macro_engine->Execute(macro_positions[entry], parameters); + macro_engine->Execute(*this, macro_positions[entry], parameters); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } @@ -740,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; const auto& tex_info_buffer = shader.const_buffers[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.address + offset; + return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index b827b112f..ef1618990 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -598,6 +598,7 @@ public: BitField<4, 3, u32> block_height; BitField<8, 3, u32> block_depth; BitField<12, 1, InvMemoryLayout> type; + BitField<16, 1, u32> is_3d; } memory_layout; union { BitField<0, 16, u32> layers; @@ -1403,6 +1404,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } @@ -1415,6 +1418,14 @@ public: return execute_on; } + VideoCore::RasterizerInterface& GetRasterizer() { + return rasterizer; + } + + const VideoCore::RasterizerInterface& GetRasterizer() const { + return rasterizer; + } + /// Notify a memory write has happened. void OnMemoryWrite() { dirty.flags |= dirty.on_write_stores; diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index e7cb87589..d374b73cf 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -661,6 +661,10 @@ union Instruction { constexpr Instruction(u64 value) : value{value} {} constexpr Instruction(const Instruction& instr) : value(instr.value) {} + constexpr bool Bit(u64 offset) const { + return ((value >> offset) & 1) != 0; + } + BitField<0, 8, Register> gpr0; BitField<8, 8, Register> gpr8; union { @@ -1874,7 +1878,9 @@ public: HSETP2_C, HSETP2_R, HSETP2_IMM, + HSET2_C, HSET2_R, + HSET2_IMM, POPC_C, POPC_R, POPC_IMM, @@ -2194,7 +2200,9 @@ private: INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"), INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"), INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"), + INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"), INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), + INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"), INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"), INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 8eb017f65..482e49711 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <chrono> + #include "common/assert.h" #include "common/microprofile.h" #include "core/core.h" @@ -154,8 +156,7 @@ u64 GPU::GetTicks() const { constexpr u64 gpu_ticks_num = 384; constexpr u64 gpu_ticks_den = 625; - const u64 cpu_ticks = system.CoreTiming().GetTicks(); - u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count(); + u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); if (Settings::values.use_fast_gpu_time) { nanoseconds /= 256; } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index a1b4c305c..2c42483bd 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -284,6 +284,12 @@ public: /// core timing events. virtual void Start() = 0; + /// Obtain the CPU Context + virtual void ObtainContext() = 0; + + /// Release the CPU Context + virtual void ReleaseContext() = 0; + /// Push GPU command entries to be processed virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index 53305ab43..7b855f63e 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -19,10 +19,17 @@ GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBa GPUAsynch::~GPUAsynch() = default; void GPUAsynch::Start() { - cpu_context->MakeCurrent(); gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher); } +void GPUAsynch::ObtainContext() { + cpu_context->MakeCurrent(); +} + +void GPUAsynch::ReleaseContext() { + cpu_context->DoneCurrent(); +} + void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { gpu_thread.SubmitList(std::move(entries)); } diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index 517658612..15e9f1d38 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -25,6 +25,8 @@ public: ~GPUAsynch() override; void Start() override; + void ObtainContext() override; + void ReleaseContext() override; void PushGPUEntries(Tegra::CommandList&& entries) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void FlushRegion(VAddr addr, u64 size) override; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 6f38a672a..aaeb9811d 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -13,10 +13,16 @@ GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase GPUSynch::~GPUSynch() = default; -void GPUSynch::Start() { +void GPUSynch::Start() {} + +void GPUSynch::ObtainContext() { context->MakeCurrent(); } +void GPUSynch::ReleaseContext() { + context->DoneCurrent(); +} + void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { dma_pusher->Push(std::move(entries)); dma_pusher->DispatchCalls(); diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 4a6e9a01d..762c20aa5 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -24,6 +24,8 @@ public: ~GPUSynch() override; void Start() override; + void ObtainContext() override; + void ReleaseContext() override; void PushGPUEntries(Tegra::CommandList&& entries) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void FlushRegion(VAddr addr, u64 size) override; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index c3bb4fe06..738c6f0c1 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -4,6 +4,7 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "common/thread.h" #include "core/core.h" #include "core/frontend/emu_window.h" #include "core/settings.h" @@ -18,7 +19,11 @@ namespace VideoCommon::GPUThread { static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, SynchState& state) { - MicroProfileOnThreadCreate("GpuThread"); + std::string name = "yuzu:GPU"; + MicroProfileOnThreadCreate(name.c_str()); + Common::SetCurrentThreadName(name.c_str()); + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + system.RegisterHostThread(); // Wait for first GPU command before acquiring the window context while (state.queue.Empty()) diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp index 89077a2d8..a50e7b4e0 100644 --- a/src/video_core/macro/macro.cpp +++ b/src/video_core/macro/macro.cpp @@ -2,32 +2,78 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <optional> +#include <boost/container_hash/hash.hpp> #include "common/assert.h" #include "common/logging/log.h" #include "core/settings.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/macro/macro.h" +#include "video_core/macro/macro_hle.h" #include "video_core/macro/macro_interpreter.h" #include "video_core/macro/macro_jit_x64.h" namespace Tegra { +MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d) + : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {} + +MacroEngine::~MacroEngine() = default; + void MacroEngine::AddCode(u32 method, u32 data) { uploaded_macro_code[method].push_back(data); } -void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) { +void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method, + const std::vector<u32>& parameters) { auto compiled_macro = macro_cache.find(method); if (compiled_macro != macro_cache.end()) { - compiled_macro->second->Execute(parameters, method); + const auto& cache_info = compiled_macro->second; + if (cache_info.has_hle_program) { + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } } else { // Macro not compiled, check if it's uploaded and if so, compile it - auto macro_code = uploaded_macro_code.find(method); + std::optional<u32> mid_method = std::nullopt; + const auto macro_code = uploaded_macro_code.find(method); if (macro_code == uploaded_macro_code.end()) { - UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); - return; + for (const auto& [method_base, code] : uploaded_macro_code) { + if (method >= method_base && (method - method_base) < code.size()) { + mid_method = method_base; + break; + } + } + if (!mid_method.has_value()) { + UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); + return; + } + } + auto& cache_info = macro_cache[method]; + + if (!mid_method.has_value()) { + cache_info.lle_program = Compile(macro_code->second); + cache_info.hash = boost::hash_value(macro_code->second); + } else { + const auto& macro_cached = uploaded_macro_code[mid_method.value()]; + const auto rebased_method = method - mid_method.value(); + auto& code = uploaded_macro_code[method]; + code.resize(macro_cached.size() - rebased_method); + std::memcpy(code.data(), macro_cached.data() + rebased_method, + code.size() * sizeof(u32)); + cache_info.hash = boost::hash_value(code); + cache_info.lle_program = Compile(code); + } + + auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); + if (hle_program.has_value()) { + cache_info.has_hle_program = true; + cache_info.hle_program = std::move(hle_program.value()); + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); } - macro_cache[method] = Compile(macro_code->second); - macro_cache[method]->Execute(parameters, method); } } diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h index b76ed891f..4d00b84b0 100644 --- a/src/video_core/macro/macro.h +++ b/src/video_core/macro/macro.h @@ -11,9 +11,11 @@ #include "common/common_types.h" namespace Tegra { + namespace Engines { class Maxwell3D; } + namespace Macro { constexpr std::size_t NUM_MACRO_REGISTERS = 8; enum class Operation : u32 { @@ -94,6 +96,8 @@ union MethodAddress { } // namespace Macro +class HLEMacro; + class CachedMacro { public: virtual ~CachedMacro() = default; @@ -107,20 +111,29 @@ public: class MacroEngine { public: - virtual ~MacroEngine() = default; + explicit MacroEngine(Engines::Maxwell3D& maxwell3d); + virtual ~MacroEngine(); // Store the uploaded macro code to compile them when they're called. void AddCode(u32 method, u32 data); // Compiles the macro if its not in the cache, and executes the compiled macro - void Execute(u32 method, const std::vector<u32>& parameters); + void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters); protected: virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0; private: - std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache; + struct CacheInfo { + std::unique_ptr<CachedMacro> lle_program{}; + std::unique_ptr<CachedMacro> hle_program{}; + u64 hash{}; + bool has_hle_program{}; + }; + + std::unordered_map<u32, CacheInfo> macro_cache; std::unordered_map<u32, std::vector<u32>> uploaded_macro_code; + std::unique_ptr<HLEMacro> hle_macros; }; std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d); diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp new file mode 100644 index 000000000..410f99018 --- /dev/null +++ b/src/video_core/macro/macro_hle.cpp @@ -0,0 +1,113 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <vector> +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_hle.h" +#include "video_core/rasterizer_interface.h" + +namespace Tegra { + +namespace { +// HLE'd functions +static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, + const std::vector<u32>& parameters) { + const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B); + + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & + ~(0x3ffffff << 26))); + maxwell3d.regs.vb_base_instance = parameters[5]; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.regs.vb_element_base = parameters[3]; + maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.index_array.first = parameters[4]; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(true, true); + } + maxwell3d.regs.index_array.count = 0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} + +static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, + const std::vector<u32>& parameters) { + const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + + maxwell3d.regs.vertex_buffer.first = parameters[3]; + maxwell3d.regs.vertex_buffer.count = parameters[1]; + maxwell3d.regs.vb_base_instance = parameters[4]; + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0])); + maxwell3d.mme_draw.instance_count = count; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(false, true); + } + maxwell3d.regs.vertex_buffer.count = 0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} + +static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, + const std::vector<u32>& parameters) { + const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + const u32 element_base = parameters[4]; + const u32 base_instance = parameters[5]; + maxwell3d.regs.index_array.first = parameters[3]; + maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base? + maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.vb_element_base = element_base; + maxwell3d.regs.vb_base_instance = base_instance; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, element_base); + maxwell3d.CallMethodFromMME(0x8e5, base_instance); + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0])); + if (maxwell3d.ShouldExecute()) { + maxwell3d.GetRasterizer().Draw(true, true); + } + maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base? + maxwell3d.regs.index_array.count = 0; + maxwell3d.regs.vb_element_base = 0x0; + maxwell3d.regs.vb_base_instance = 0x0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, 0x0); + maxwell3d.CallMethodFromMME(0x8e5, 0x0); + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} +} // namespace + +constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{ + std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0), + std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD), + std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7), +}}; + +HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +HLEMacro::~HLEMacro() = default; + +std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const { + const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(), + [hash](const auto& pair) { return pair.first == hash; }); + if (it == hle_funcs.end()) { + return std::nullopt; + } + return std::make_unique<HLEMacroImpl>(maxwell3d, it->second); +} + +HLEMacroImpl::~HLEMacroImpl() = default; + +HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func) + : maxwell3d(maxwell3d), func(func) {} + +void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) { + func(maxwell3d, parameters); +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h new file mode 100644 index 000000000..37af875a0 --- /dev/null +++ b/src/video_core/macro/macro_hle.h @@ -0,0 +1,44 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <optional> +#include <vector> +#include "common/common_types.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters); + +class HLEMacro { +public: + explicit HLEMacro(Engines::Maxwell3D& maxwell3d); + ~HLEMacro(); + + std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class HLEMacroImpl : public CachedMacro { +public: + explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func); + ~HLEMacroImpl(); + + void Execute(const std::vector<u32>& parameters, u32 method) override; + +private: + Engines::Maxwell3D& maxwell3d; + HLEFunction func; +}; + +} // namespace Tegra diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index 5edff27aa..aa5256419 100644 --- a/src/video_core/macro/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -11,7 +11,8 @@ MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) { return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 11c1cc3be..07292702f 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -14,27 +14,22 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255 MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); namespace Tegra { -static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; -static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; -static const Xbyak::Reg64 STATE = Xbyak::util::r11; -static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; -static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; -static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13; +static const Xbyak::Reg64 STATE = Xbyak::util::rbx; +static const Xbyak::Reg32 RESULT = Xbyak::util::ebp; +static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; -static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14; static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ - PARAMETERS, - REGISTERS, STATE, - NEXT_PARAMETER, RESULT, + PARAMETERS, METHOD_ADDRESS, BRANCH_HOLDER, }); -MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) { return std::make_unique<MacroJITx64Impl>(maxwell3d, code); @@ -53,32 +48,32 @@ void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) { JITState state{}; state.maxwell3d = &maxwell3d; state.registers = {}; - state.parameters = parameters.data(); - program(&state); + program(&state, parameters.data()); } void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { const bool is_a_zero = opcode.src_a == 0; const bool is_b_zero = opcode.src_b == 0; const bool valid_operation = !is_a_zero && !is_b_zero; - const bool is_move_operation = !is_a_zero && is_b_zero; + [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero; const bool has_zero_register = is_a_zero || is_b_zero; + const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry || + opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow; - Xbyak::Reg64 src_a; + Xbyak::Reg32 src_a; Xbyak::Reg32 src_b; - if (!optimizer.zero_reg_skip) { - src_a = Compile_GetRegister(opcode.src_a, RESULT_64); - src_b = Compile_GetRegister(opcode.src_b, ebx); + if (!optimizer.zero_reg_skip || no_zero_reg_skip) { + src_a = Compile_GetRegister(opcode.src_a, RESULT); + src_b = Compile_GetRegister(opcode.src_b, eax); } else { if (!is_a_zero) { - src_a = Compile_GetRegister(opcode.src_a, RESULT_64); + src_a = Compile_GetRegister(opcode.src_a, RESULT); } if (!is_b_zero) { - src_b = Compile_GetRegister(opcode.src_b, ebx); + src_b = Compile_GetRegister(opcode.src_b, eax); } } - Xbyak::Label skip_carry{}; bool has_emitted = false; @@ -190,7 +185,8 @@ void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { if (next_opcode.has_value()) { const auto next = *next_opcode; - if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod && + opcode.dst == next.dst) { return; } } @@ -244,10 +240,10 @@ void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { } void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { - auto dst = Compile_GetRegister(opcode.src_a, eax); - auto src = Compile_GetRegister(opcode.src_b, RESULT); + const auto dst = Compile_GetRegister(opcode.src_a, ecx); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); - shr(src, al); + shr(src, dst.cvt8()); if (opcode.bf_size != 0 && opcode.bf_size != 31) { and_(src, opcode.GetBitfieldMask()); } else if (opcode.bf_size == 0) { @@ -263,8 +259,8 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { } void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { - auto dst = Compile_GetRegister(opcode.src_a, eax); - auto src = Compile_GetRegister(opcode.src_b, RESULT); + const auto dst = Compile_GetRegister(opcode.src_a, ecx); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); if (opcode.bf_src_bit != 0) { shr(src, opcode.bf_src_bit); @@ -273,16 +269,9 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { if (opcode.bf_size != 31) { and_(src, opcode.GetBitfieldMask()); } - shl(src, al); - Compile_ProcessResult(opcode.result_operation, opcode.dst); -} + shl(src, dst.cvt8()); -static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) { - return maxwell3d->GetRegisterValue(method); -} - -static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { - maxwell3d->CallMethodFromMME(method_address.address, value); + Compile_ProcessResult(opcode.result_operation, opcode.dst); } void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { @@ -302,22 +291,34 @@ void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { sub(result, opcode.immediate * -1); } } - Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); - mov(Common::X64::ABI_PARAM1, qword[STATE]); - mov(Common::X64::ABI_PARAM2, RESULT); - Common::X64::CallFarFunction(*this, &Read); - Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); - mov(RESULT, Common::X64::ABI_RETURN.cvt32()); + + // Equivalent to Engines::Maxwell3D::GetRegisterValue: + if (optimizer.enable_asserts) { + Xbyak::Label pass_range_check; + cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS)); + jb(pass_range_check); + int3(); + L(pass_range_check); + } + mov(rax, qword[STATE]); + mov(RESULT, + dword[rax + offsetof(Engines::Maxwell3D, regs) + + offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]); + Compile_ProcessResult(opcode.result_operation, opcode.dst); } +static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { + maxwell3d->CallMethodFromMME(method_address.address, value); +} + void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { - Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); mov(Common::X64::ABI_PARAM1, qword[STATE]); mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); mov(Common::X64::ABI_PARAM3, value); Common::X64::CallFarFunction(*this, &Send); - Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); Xbyak::Label dont_process{}; // Get increment @@ -329,7 +330,7 @@ void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { and_(METHOD_ADDRESS, 0xfff); shr(ecx, 12); and_(ecx, 0x3f); - lea(eax, ptr[rcx + METHOD_ADDRESS_64]); + lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); sal(ecx, 12); or_(eax, ecx); @@ -421,19 +422,15 @@ void MacroJITx64Impl::Compile() { bool keep_executing = true; labels.fill(Xbyak::Label()); - Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); // JIT state mov(STATE, Common::X64::ABI_PARAM1); - mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 + - static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]); - mov(REGISTERS, Common::X64::ABI_PARAM1); - add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers))); + mov(PARAMETERS, Common::X64::ABI_PARAM2); xor_(RESULT, RESULT); xor_(METHOD_ADDRESS, METHOD_ADDRESS); - xor_(NEXT_PARAMETER, NEXT_PARAMETER); xor_(BRANCH_HOLDER, BRANCH_HOLDER); - mov(dword[REGISTERS + 4], Compile_FetchParameter()); + mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); // Track get register for zero registers and mark it as no-op optimizer.zero_reg_skip = true; @@ -446,6 +443,9 @@ void MacroJITx64Impl::Compile() { // one if our register isn't "dirty" optimizer.optimize_for_method_move = true; + // Enable run-time assertions in JITted code + optimizer.enable_asserts = false; + // Check to see if we can skip emitting certain instructions Optimizer_ScanFlags(); @@ -463,7 +463,7 @@ void MacroJITx64Impl::Compile() { L(end_of_code); - Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); ret(); ready(); program = getCode<ProgramType>(); @@ -537,8 +537,8 @@ bool MacroJITx64Impl::Compile_NextInstruction() { } Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { - mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]); - inc(NEXT_PARAMETER); + mov(eax, dword[PARAMETERS]); + add(PARAMETERS, sizeof(u32)); return eax; } @@ -547,41 +547,22 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { // Register 0 is always zero xor_(dst, dst); } else { - mov(dst, dword[REGISTERS + index * sizeof(u32)]); - } - - return dst; -} - -Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) { - if (index == 0) { - // Register 0 is always zero - xor_(dst, dst); - } else { - mov(dst, dword[REGISTERS + index * sizeof(u32)]); + mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]); } return dst; } -void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) { - Xbyak::Label zero{}, end{}; - xor_(ecx, ecx); - shr(dst, 32); - setne(cl); - mov(dword[STATE + offsetof(JITState, carry_flag)], ecx); -} - void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { - auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { + const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) { // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero // register. if (reg == 0) { return; } - mov(dword[REGISTERS + reg * sizeof(u32)], result); + mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result); }; - auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; + const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); }; switch (operation) { case Macro::ResultOperation::IgnoreAndFetch: diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 21ee157cf..a180e7428 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -55,8 +55,6 @@ private: Xbyak::Reg32 Compile_FetchParameter(); Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); - Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst); - void Compile_WriteCarry(Xbyak::Reg64 dst); void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); void Compile_Send(Xbyak::Reg32 value); @@ -67,11 +65,10 @@ private: struct JITState { Engines::Maxwell3D* maxwell3d{}; std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{}; - const u32* parameters{}; u32 carry_flag{}; }; static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); - using ProgramType = void (*)(JITState*); + using ProgramType = void (*)(JITState*, const u32*); struct OptimizerState { bool can_skip_carry{}; @@ -79,14 +76,15 @@ private: bool zero_reg_skip{}; bool skip_dummy_addimmediate{}; bool optimize_for_method_move{}; + bool enable_asserts{}; }; OptimizerState optimizer{}; std::optional<Macro::Opcode> next_opcode{}; ProgramType program{nullptr}; - std::array<Xbyak::Label, MAX_CODE_SIZE> labels{}; - std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{}; + std::array<Xbyak::Label, MAX_CODE_SIZE> labels; + std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip; Xbyak::Label end_of_code{}; bool is_delay_slot{}; diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index dbee9f634..ff5505d12 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -210,10 +210,11 @@ bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t si return range == inner_size; } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const { +void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, + const std::size_t size) const { std::size_t remaining_size{size}; - std::size_t page_index{src_addr >> page_bits}; - std::size_t page_offset{src_addr & page_mask}; + std::size_t page_index{gpu_src_addr >> page_bits}; + std::size_t page_offset{gpu_src_addr & page_mask}; auto& memory = system.Memory(); @@ -234,11 +235,11 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s } } -void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, +void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, const std::size_t size) const { std::size_t remaining_size{size}; - std::size_t page_index{src_addr >> page_bits}; - std::size_t page_offset{src_addr & page_mask}; + std::size_t page_index{gpu_src_addr >> page_bits}; + std::size_t page_offset{gpu_src_addr & page_mask}; auto& memory = system.Memory(); @@ -259,10 +260,11 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, } } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) { +void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, + const std::size_t size) { std::size_t remaining_size{size}; - std::size_t page_index{dest_addr >> page_bits}; - std::size_t page_offset{dest_addr & page_mask}; + std::size_t page_index{gpu_dest_addr >> page_bits}; + std::size_t page_offset{gpu_dest_addr & page_mask}; auto& memory = system.Memory(); @@ -283,11 +285,11 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const } } -void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, +void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, const std::size_t size) { std::size_t remaining_size{size}; - std::size_t page_index{dest_addr >> page_bits}; - std::size_t page_offset{dest_addr & page_mask}; + std::size_t page_index{gpu_dest_addr >> page_bits}; + std::size_t page_offset{gpu_dest_addr & page_mask}; auto& memory = system.Memory(); @@ -306,16 +308,18 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, } } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, + const std::size_t size) { std::vector<u8> tmp_buffer(size); - ReadBlock(src_addr, tmp_buffer.data(), size); - WriteBlock(dest_addr, tmp_buffer.data(), size); + ReadBlock(gpu_src_addr, tmp_buffer.data(), size); + WriteBlock(gpu_dest_addr, tmp_buffer.data(), size); } -void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, + const std::size_t size) { std::vector<u8> tmp_buffer(size); - ReadBlockUnsafe(src_addr, tmp_buffer.data(), size); - WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size); + ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size); + WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size); } bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) { diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 0ddd52d5a..87658e87a 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -79,9 +79,9 @@ public: * in the Host Memory counterpart. Note: This functions cause Host GPU Memory * Flushes and Invalidations, respectively to each operation. */ - void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and @@ -93,9 +93,9 @@ public: * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture * being flushed. */ - void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** * IsGranularRange checks if a gpu region can be simply read with a pointer diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 2f75f8801..e12dab899 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -220,8 +220,8 @@ private: return cache_begin < addr_end && addr_begin < cache_end; }; - const u64 page_end = addr_end >> PAGE_SHIFT; - for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { + const u64 page_end = addr_end >> PAGE_BITS; + for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) { const auto& it = cached_queries.find(page); if (it == std::end(cached_queries)) { continue; @@ -242,14 +242,14 @@ private: /// Registers the passed parameters as cached and returns a pointer to the stored cached query. CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) { rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1); - const u64 page = static_cast<u64>(cpu_addr) >> PAGE_SHIFT; + const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS; return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr, host_ptr); } /// Tries to a get a cached query. Returns nullptr on failure. CachedQuery* TryGet(VAddr addr) { - const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT; + const u64 page = static_cast<u64>(addr) >> PAGE_BITS; const auto it = cached_queries.find(page); if (it == std::end(cached_queries)) { return nullptr; @@ -268,7 +268,7 @@ private: } static constexpr std::uintptr_t PAGE_SIZE = 4096; - static constexpr unsigned PAGE_SHIFT = 12; + static constexpr unsigned PAGE_BITS = 12; Core::System& system; VideoCore::RasterizerInterface& rasterizer; diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp deleted file mode 100644 index 093b2cdf4..000000000 --- a/src/video_core/rasterizer_cache.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/rasterizer_cache.h" - -RasterizerCacheObject::~RasterizerCacheObject() = default; diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h deleted file mode 100644 index 096ee337c..000000000 --- a/src/video_core/rasterizer_cache.h +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <mutex> -#include <set> -#include <unordered_map> - -#include <boost/icl/interval_map.hpp> -#include <boost/range/iterator_range_core.hpp> - -#include "common/common_types.h" -#include "core/settings.h" -#include "video_core/gpu.h" -#include "video_core/rasterizer_interface.h" - -class RasterizerCacheObject { -public: - explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {} - - virtual ~RasterizerCacheObject(); - - VAddr GetCpuAddr() const { - return cpu_addr; - } - - /// Gets the size of the shader in guest memory, required for cache management - virtual std::size_t GetSizeInBytes() const = 0; - - /// Sets whether the cached object should be considered registered - void SetIsRegistered(bool registered) { - is_registered = registered; - } - - /// Returns true if the cached object is registered - bool IsRegistered() const { - return is_registered; - } - - /// Returns true if the cached object is dirty - bool IsDirty() const { - return is_dirty; - } - - /// Returns ticks from when this cached object was last modified - u64 GetLastModifiedTicks() const { - return last_modified_ticks; - } - - /// Marks an object as recently modified, used to specify whether it is clean or dirty - template <class T> - void MarkAsModified(bool dirty, T& cache) { - is_dirty = dirty; - last_modified_ticks = cache.GetModifiedTicks(); - } - - void SetMemoryMarked(bool is_memory_marked_) { - is_memory_marked = is_memory_marked_; - } - - bool IsMemoryMarked() const { - return is_memory_marked; - } - - void SetSyncPending(bool is_sync_pending_) { - is_sync_pending = is_sync_pending_; - } - - bool IsSyncPending() const { - return is_sync_pending; - } - -private: - bool is_registered{}; ///< Whether the object is currently registered with the cache - bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory) - bool is_memory_marked{}; ///< Whether the object is marking rasterizer memory. - bool is_sync_pending{}; ///< Whether the object is pending deletion. - u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing - VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space -}; - -template <class T> -class RasterizerCache : NonCopyable { - friend class RasterizerCacheObject; - -public: - explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} - - /// Write any cached resources overlapping the specified region back to memory - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - FlushObject(object); - } - } - - /// Mark the specified region as being invalidated - void InvalidateRegion(VAddr addr, u64 size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - if (!object->IsRegistered()) { - // Skip duplicates - continue; - } - Unregister(object); - } - } - - void OnCPUWrite(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - for (const auto& object : GetSortedObjectsFromRegion(addr, size)) { - if (object->IsRegistered()) { - UnmarkMemory(object); - object->SetSyncPending(true); - marked_for_unregister.emplace_back(object); - } - } - } - - void SyncGuestHost() { - std::lock_guard lock{mutex}; - - for (const auto& object : marked_for_unregister) { - if (object->IsRegistered()) { - object->SetSyncPending(false); - Unregister(object); - } - } - marked_for_unregister.clear(); - } - - /// Invalidates everything in the cache - void InvalidateAll() { - std::lock_guard lock{mutex}; - - while (interval_cache.begin() != interval_cache.end()) { - Unregister(*interval_cache.begin()->second.begin()); - } - } - -protected: - /// Tries to get an object from the cache with the specified cache address - T TryGet(VAddr addr) const { - const auto iter = map_cache.find(addr); - if (iter != map_cache.end()) - return iter->second; - return nullptr; - } - - /// Register an object into the cache - virtual void Register(const T& object) { - std::lock_guard lock{mutex}; - - object->SetIsRegistered(true); - interval_cache.add({GetInterval(object), ObjectSet{object}}); - map_cache.insert({object->GetCpuAddr(), object}); - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1); - object->SetMemoryMarked(true); - } - - /// Unregisters an object from the cache - virtual void Unregister(const T& object) { - std::lock_guard lock{mutex}; - - UnmarkMemory(object); - object->SetIsRegistered(false); - if (object->IsSyncPending()) { - marked_for_unregister.remove(object); - object->SetSyncPending(false); - } - const VAddr addr = object->GetCpuAddr(); - interval_cache.subtract({GetInterval(object), ObjectSet{object}}); - map_cache.erase(addr); - } - - void UnmarkMemory(const T& object) { - if (!object->IsMemoryMarked()) { - return; - } - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); - object->SetMemoryMarked(false); - } - - /// Returns a ticks counter used for tracking when cached objects were last modified - u64 GetModifiedTicks() { - std::lock_guard lock{mutex}; - - return ++modified_ticks; - } - - virtual void FlushObjectInner(const T& object) = 0; - - /// Flushes the specified object, updating appropriate cache state as needed - void FlushObject(const T& object) { - std::lock_guard lock{mutex}; - - if (!object->IsDirty()) { - return; - } - FlushObjectInner(object); - object->MarkAsModified(false, *this); - } - - std::recursive_mutex mutex; - -private: - /// Returns a list of cached objects from the specified memory region, ordered by access time - std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) { - if (size == 0) { - return {}; - } - - std::vector<T> objects; - const ObjectInterval interval{addr, addr + size}; - for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) { - for (auto& cached_object : pair.second) { - if (!cached_object) { - continue; - } - objects.push_back(cached_object); - } - } - - std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool { - return a->GetLastModifiedTicks() < b->GetLastModifiedTicks(); - }); - - return objects; - } - - using ObjectSet = std::set<T>; - using ObjectCache = std::unordered_map<VAddr, T>; - using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>; - using ObjectInterval = typename IntervalCache::interval_type; - - static auto GetInterval(const T& object) { - return ObjectInterval::right_open(object->GetCpuAddr(), - object->GetCpuAddr() + object->GetSizeInBytes()); - } - - ObjectCache map_cache; - IntervalCache interval_cache; ///< Cache of objects - u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing - VideoCore::RasterizerInterface& rasterizer; - std::list<T> marked_for_unregister; -}; diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp new file mode 100644 index 000000000..eb5158407 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -0,0 +1,2073 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cstddef> +#include <string> +#include <string_view> +#include <utility> +#include <variant> + +#include <fmt/format.h> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/shader/registry.h" +#include "video_core/shader/shader_ir.h" + +// Predicates in the decompiled code follow the convention that -1 means true and 0 means false. +// GLASM lacks booleans, so they have to be implemented as integers. +// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to +// select between two values, because -1 will be evaluated as true and 0 as false. + +namespace OpenGL { + +namespace { + +using Tegra::Engines::ShaderType; +using Tegra::Shader::Attribute; +using Tegra::Shader::PixelImap; +using Tegra::Shader::Register; +using namespace VideoCommon::Shader; +using Operation = const OperationNode&; + +constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; + +char Swizzle(std::size_t component) { + ASSERT(component < 4); + return component["xyzw"]; +} + +constexpr bool IsGenericAttribute(Attribute::Index index) { + return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31; +} + +u32 GetGenericAttributeIndex(Attribute::Index index) { + ASSERT(IsGenericAttribute(index)); + return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0); +} + +std::string_view Modifiers(Operation operation) { + const auto meta = std::get_if<MetaArithmetic>(&operation.GetMeta()); + if (meta && meta->precise) { + return ".PREC"; + } + return ""; +} + +std::string_view GetInputFlags(PixelImap attribute) { + switch (attribute) { + case PixelImap::Perspective: + return ""; + case PixelImap::Constant: + return "FLAT "; + case PixelImap::ScreenLinear: + return "NOPERSPECTIVE "; + case PixelImap::Unused: + break; + } + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute)); + return {}; +} + +std::string_view ImageType(Tegra::Shader::ImageType image_type) { + switch (image_type) { + case Tegra::Shader::ImageType::Texture1D: + return "1D"; + case Tegra::Shader::ImageType::TextureBuffer: + return "BUFFER"; + case Tegra::Shader::ImageType::Texture1DArray: + return "ARRAY1D"; + case Tegra::Shader::ImageType::Texture2D: + return "2D"; + case Tegra::Shader::ImageType::Texture2DArray: + return "ARRAY2D"; + case Tegra::Shader::ImageType::Texture3D: + return "3D"; + } + UNREACHABLE(); + return {}; +} + +std::string_view StackName(MetaStackClass stack) { + switch (stack) { + case MetaStackClass::Ssy: + return "SSY"; + case MetaStackClass::Pbk: + return "PBK"; + } + UNREACHABLE(); + return ""; +}; + +std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) { + switch (topology) { + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points: + return "POINTS"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip: + return "LINES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency: + return "LINES_ADJACENCY"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + return "TRIANGLES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + return "TRIANGLES_ADJACENCY"; + default: + UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + return "POINTS"; + } +} + +std::string_view TopologyName(Tegra::Shader::OutputTopology topology) { + switch (topology) { + case Tegra::Shader::OutputTopology::PointList: + return "POINTS"; + case Tegra::Shader::OutputTopology::LineStrip: + return "LINE_STRIP"; + case Tegra::Shader::OutputTopology::TriangleStrip: + return "TRIANGLE_STRIP"; + default: + UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology)); + return "points"; + } +} + +std::string_view StageInputName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + case ShaderType::Geometry: + return "vertex"; + case ShaderType::Fragment: + return "fragment"; + case ShaderType::Compute: + return "invocation"; + default: + UNREACHABLE(); + return ""; + } +} + +std::string TextureType(const MetaTexture& meta) { + if (meta.sampler.is_buffer) { + return "BUFFER"; + } + std::string type; + if (meta.sampler.is_shadow) { + type += "SHADOW"; + } + if (meta.sampler.is_array) { + type += "ARRAY"; + } + type += [&meta] { + switch (meta.sampler.type) { + case Tegra::Shader::TextureType::Texture1D: + return "1D"; + case Tegra::Shader::TextureType::Texture2D: + return "2D"; + case Tegra::Shader::TextureType::Texture3D: + return "3D"; + case Tegra::Shader::TextureType::TextureCube: + return "CUBE"; + } + UNREACHABLE(); + return "2D"; + }(); + return type; +} + +std::string GlobalMemoryName(const GlobalMemoryBase& base) { + return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset); +} + +class ARBDecompiler final { +public: + explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier); + + std::string Code() const { + return shader_source; + } + +private: + void DeclareHeader(); + void DeclareVertex(); + void DeclareGeometry(); + void DeclareFragment(); + void DeclareCompute(); + void DeclareInputAttributes(); + void DeclareOutputAttributes(); + void DeclareLocalMemory(); + void DeclareGlobalMemory(); + void DeclareConstantBuffers(); + void DeclareRegisters(); + void DeclareTemporaries(); + void DeclarePredicates(); + void DeclareInternalFlags(); + + void InitializeVariables(); + + void DecompileAST(); + void DecompileBranchMode(); + + void VisitAST(const ASTNode& node); + std::string VisitExpression(const Expr& node); + + void VisitBlock(const NodeBlock& bb); + + std::string Visit(const Node& node); + + std::pair<std::string, std::size_t> BuildCoords(Operation); + std::string BuildAoffi(Operation); + void Exit(); + + std::string Assign(Operation); + std::string Select(Operation); + std::string FClamp(Operation); + std::string FCastHalf0(Operation); + std::string FCastHalf1(Operation); + std::string FSqrt(Operation); + std::string FSwizzleAdd(Operation); + std::string HAdd2(Operation); + std::string HMul2(Operation); + std::string HFma2(Operation); + std::string HAbsolute(Operation); + std::string HNegate(Operation); + std::string HClamp(Operation); + std::string HCastFloat(Operation); + std::string HUnpack(Operation); + std::string HMergeF32(Operation); + std::string HMergeH0(Operation); + std::string HMergeH1(Operation); + std::string HPack2(Operation); + std::string LogicalAssign(Operation); + std::string LogicalPick2(Operation); + std::string LogicalAnd2(Operation); + std::string FloatOrdered(Operation); + std::string FloatUnordered(Operation); + std::string LogicalAddCarry(Operation); + std::string Texture(Operation); + std::string TextureGather(Operation); + std::string TextureQueryDimensions(Operation); + std::string TextureQueryLod(Operation); + std::string TexelFetch(Operation); + std::string TextureGradient(Operation); + std::string ImageLoad(Operation); + std::string ImageStore(Operation); + std::string Branch(Operation); + std::string BranchIndirect(Operation); + std::string PushFlowStack(Operation); + std::string PopFlowStack(Operation); + std::string Exit(Operation); + std::string Discard(Operation); + std::string EmitVertex(Operation); + std::string EndPrimitive(Operation); + std::string InvocationId(Operation); + std::string YNegate(Operation); + std::string ThreadId(Operation); + std::string ShuffleIndexed(Operation); + std::string Barrier(Operation); + std::string MemoryBarrierGroup(Operation); + std::string MemoryBarrierGlobal(Operation); + + template <const std::string_view& op> + std::string Unary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0])); + return temporary; + } + + template <const std::string_view& op> + std::string Binary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1])); + return temporary; + } + + template <const std::string_view& op> + std::string Trinary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1]), Visit(operation[2])); + return temporary; + } + + template <const std::string_view& op, bool unordered> + std::string FloatComparison(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation)); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NE.x), -1;", temporary); + + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + if constexpr (unordered) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + } else if (op == SNE_F) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + } + return temporary; + } + + template <const std::string_view& op, bool is_nan> + std::string HalfComparison(Operation operation) { + std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + AddLine("UP2H.F {}, {};", tmp1, op_a); + AddLine("UP2H.F {}, {};", tmp2, op_b); + AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2); + AddLine("TRUNC.U.CC RC.xy, {};", tmp1); + AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1); + AddLine("MOV.S {}.x (NE.x), -1;", tmp1); + AddLine("MOV.S {}.y (NE.y), -1;", tmp1); + if constexpr (is_nan) { + AddLine("MOVC.F RC.x, {};", op_a); + AddLine("MOV.S {}.x (NAN.x), -1;", tmp1); + AddLine("MOVC.F RC.x, {};", op_b); + AddLine("MOV.S {}.y (NAN.x), -1;", tmp1); + } + return tmp1; + } + + template <const std::string_view& op, const std::string_view& type> + std::string AtomicImage(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + + AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord, + image_id, ImageType(meta.image.type)); + return fmt::format("{}.x", coord); + } + + template <const std::string_view& op, const std::string_view& type> + std::string Atomic(Operation operation) { + std::string temporary = AllocTemporary(); + std::string address; + std::string_view opname; + if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), + Visit(gmem->GetBaseAddress())); + address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary); + opname = "ATOMB"; + } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { + address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); + opname = "ATOMS"; + } else { + UNREACHABLE(); + return "{0, 0, 0, 0}"; + } + AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address); + return temporary; + } + + template <char type> + std::string Negate(Operation operation) { + std::string temporary = AllocTemporary(); + if constexpr (type == 'F') { + AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0])); + } else { + AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0])); + } + return temporary; + } + + template <char type> + std::string Absolute(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0])); + return temporary; + } + + template <char type> + std::string BitfieldInsert(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2])); + AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]), + Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template <char type> + std::string BitfieldExtract(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1])); + AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template <char swizzle> + std::string LocalInvocationId(Operation) { + return fmt::format("invocation.localid.{}", swizzle); + } + + template <char swizzle> + std::string WorkGroupId(Operation) { + return fmt::format("invocation.groupid.{}", swizzle); + } + + template <char c1, char c2> + std::string ThreadMask(Operation) { + return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2); + } + + template <typename... Args> + void AddExpression(std::string_view text, Args&&... args) { + shader_source += fmt::format(text, std::forward<Args>(args)...); + } + + template <typename... Args> + void AddLine(std::string_view text, Args&&... args) { + AddExpression(text, std::forward<Args>(args)...); + shader_source += '\n'; + } + + std::string AllocTemporary() { + max_temporaries = std::max(max_temporaries, num_temporaries + 1); + return fmt::format("T{}.x", num_temporaries++); + } + + std::string AllocVectorTemporary() { + max_temporaries = std::max(max_temporaries, num_temporaries + 1); + return fmt::format("T{}", num_temporaries++); + } + + void ResetTemporaries() noexcept { + num_temporaries = 0; + } + + const Device& device; + const ShaderIR& ir; + const Registry& registry; + const ShaderType stage; + + std::size_t num_temporaries = 0; + std::size_t max_temporaries = 0; + + std::string shader_source; + + static constexpr std::string_view ADD_F32 = "ADD.F32"; + static constexpr std::string_view ADD_S = "ADD.S"; + static constexpr std::string_view ADD_U = "ADD.U"; + static constexpr std::string_view MUL_F32 = "MUL.F32"; + static constexpr std::string_view MUL_S = "MUL.S"; + static constexpr std::string_view MUL_U = "MUL.U"; + static constexpr std::string_view DIV_F32 = "DIV.F32"; + static constexpr std::string_view DIV_S = "DIV.S"; + static constexpr std::string_view DIV_U = "DIV.U"; + static constexpr std::string_view MAD_F32 = "MAD.F32"; + static constexpr std::string_view RSQ_F32 = "RSQ.F32"; + static constexpr std::string_view COS_F32 = "COS.F32"; + static constexpr std::string_view SIN_F32 = "SIN.F32"; + static constexpr std::string_view EX2_F32 = "EX2.F32"; + static constexpr std::string_view LG2_F32 = "LG2.F32"; + static constexpr std::string_view SLT_F = "SLT.F32"; + static constexpr std::string_view SLT_S = "SLT.S"; + static constexpr std::string_view SLT_U = "SLT.U"; + static constexpr std::string_view SEQ_F = "SEQ.F32"; + static constexpr std::string_view SEQ_S = "SEQ.S"; + static constexpr std::string_view SEQ_U = "SEQ.U"; + static constexpr std::string_view SLE_F = "SLE.F32"; + static constexpr std::string_view SLE_S = "SLE.S"; + static constexpr std::string_view SLE_U = "SLE.U"; + static constexpr std::string_view SGT_F = "SGT.F32"; + static constexpr std::string_view SGT_S = "SGT.S"; + static constexpr std::string_view SGT_U = "SGT.U"; + static constexpr std::string_view SNE_F = "SNE.F32"; + static constexpr std::string_view SNE_S = "SNE.S"; + static constexpr std::string_view SNE_U = "SNE.U"; + static constexpr std::string_view SGE_F = "SGE.F32"; + static constexpr std::string_view SGE_S = "SGE.S"; + static constexpr std::string_view SGE_U = "SGE.U"; + static constexpr std::string_view AND_S = "AND.S"; + static constexpr std::string_view AND_U = "AND.U"; + static constexpr std::string_view TRUNC_F = "TRUNC.F"; + static constexpr std::string_view TRUNC_S = "TRUNC.S"; + static constexpr std::string_view TRUNC_U = "TRUNC.U"; + static constexpr std::string_view SHL_S = "SHL.S"; + static constexpr std::string_view SHL_U = "SHL.U"; + static constexpr std::string_view SHR_S = "SHR.S"; + static constexpr std::string_view SHR_U = "SHR.U"; + static constexpr std::string_view OR_S = "OR.S"; + static constexpr std::string_view OR_U = "OR.U"; + static constexpr std::string_view XOR_S = "XOR.S"; + static constexpr std::string_view XOR_U = "XOR.U"; + static constexpr std::string_view NOT_S = "NOT.S"; + static constexpr std::string_view NOT_U = "NOT.U"; + static constexpr std::string_view BTC_S = "BTC.S"; + static constexpr std::string_view BTC_U = "BTC.U"; + static constexpr std::string_view BTFM_S = "BTFM.S"; + static constexpr std::string_view BTFM_U = "BTFM.U"; + static constexpr std::string_view ROUND_F = "ROUND.F"; + static constexpr std::string_view CEIL_F = "CEIL.F"; + static constexpr std::string_view FLR_F = "FLR.F"; + static constexpr std::string_view I2F_S = "I2F.S"; + static constexpr std::string_view I2F_U = "I2F.U"; + static constexpr std::string_view MIN_F = "MIN.F"; + static constexpr std::string_view MIN_S = "MIN.S"; + static constexpr std::string_view MIN_U = "MIN.U"; + static constexpr std::string_view MAX_F = "MAX.F"; + static constexpr std::string_view MAX_S = "MAX.S"; + static constexpr std::string_view MAX_U = "MAX.U"; + static constexpr std::string_view MOV_U = "MOV.U"; + static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U"; + static constexpr std::string_view TGALL_U = "TGALL.U"; + static constexpr std::string_view TGANY_U = "TGANY.U"; + static constexpr std::string_view TGEQ_U = "TGEQ.U"; + static constexpr std::string_view EXCH = "EXCH"; + static constexpr std::string_view ADD = "ADD"; + static constexpr std::string_view MIN = "MIN"; + static constexpr std::string_view MAX = "MAX"; + static constexpr std::string_view AND = "AND"; + static constexpr std::string_view OR = "OR"; + static constexpr std::string_view XOR = "XOR"; + static constexpr std::string_view U32 = "U32"; + static constexpr std::string_view S32 = "S32"; + + static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount); + using DecompilerType = std::string (ARBDecompiler::*)(Operation); + static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = { + &ARBDecompiler::Assign, + + &ARBDecompiler::Select, + + &ARBDecompiler::Binary<ADD_F32>, + &ARBDecompiler::Binary<MUL_F32>, + &ARBDecompiler::Binary<DIV_F32>, + &ARBDecompiler::Trinary<MAD_F32>, + &ARBDecompiler::Negate<'F'>, + &ARBDecompiler::Absolute<'F'>, + &ARBDecompiler::FClamp, + &ARBDecompiler::FCastHalf0, + &ARBDecompiler::FCastHalf1, + &ARBDecompiler::Binary<MIN_F>, + &ARBDecompiler::Binary<MAX_F>, + &ARBDecompiler::Unary<COS_F32>, + &ARBDecompiler::Unary<SIN_F32>, + &ARBDecompiler::Unary<EX2_F32>, + &ARBDecompiler::Unary<LG2_F32>, + &ARBDecompiler::Unary<RSQ_F32>, + &ARBDecompiler::FSqrt, + &ARBDecompiler::Unary<ROUND_F>, + &ARBDecompiler::Unary<FLR_F>, + &ARBDecompiler::Unary<CEIL_F>, + &ARBDecompiler::Unary<TRUNC_F>, + &ARBDecompiler::Unary<I2F_S>, + &ARBDecompiler::Unary<I2F_U>, + &ARBDecompiler::FSwizzleAdd, + + &ARBDecompiler::Binary<ADD_S>, + &ARBDecompiler::Binary<MUL_S>, + &ARBDecompiler::Binary<DIV_S>, + &ARBDecompiler::Negate<'S'>, + &ARBDecompiler::Absolute<'S'>, + &ARBDecompiler::Binary<MIN_S>, + &ARBDecompiler::Binary<MAX_S>, + + &ARBDecompiler::Unary<TRUNC_S>, + &ARBDecompiler::Unary<MOV_U>, + &ARBDecompiler::Binary<SHL_S>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<SHR_S>, + &ARBDecompiler::Binary<AND_S>, + &ARBDecompiler::Binary<OR_S>, + &ARBDecompiler::Binary<XOR_S>, + &ARBDecompiler::Unary<NOT_S>, + &ARBDecompiler::BitfieldInsert<'S'>, + &ARBDecompiler::BitfieldExtract<'S'>, + &ARBDecompiler::Unary<BTC_S>, + &ARBDecompiler::Unary<BTFM_S>, + + &ARBDecompiler::Binary<ADD_U>, + &ARBDecompiler::Binary<MUL_U>, + &ARBDecompiler::Binary<DIV_U>, + &ARBDecompiler::Binary<MIN_U>, + &ARBDecompiler::Binary<MAX_U>, + &ARBDecompiler::Unary<TRUNC_U>, + &ARBDecompiler::Unary<MOV_U>, + &ARBDecompiler::Binary<SHL_U>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<AND_U>, + &ARBDecompiler::Binary<OR_U>, + &ARBDecompiler::Binary<XOR_U>, + &ARBDecompiler::Unary<NOT_U>, + &ARBDecompiler::BitfieldInsert<'U'>, + &ARBDecompiler::BitfieldExtract<'U'>, + &ARBDecompiler::Unary<BTC_U>, + &ARBDecompiler::Unary<BTFM_U>, + + &ARBDecompiler::HAdd2, + &ARBDecompiler::HMul2, + &ARBDecompiler::HFma2, + &ARBDecompiler::HAbsolute, + &ARBDecompiler::HNegate, + &ARBDecompiler::HClamp, + &ARBDecompiler::HCastFloat, + &ARBDecompiler::HUnpack, + &ARBDecompiler::HMergeF32, + &ARBDecompiler::HMergeH0, + &ARBDecompiler::HMergeH1, + &ARBDecompiler::HPack2, + + &ARBDecompiler::LogicalAssign, + &ARBDecompiler::Binary<AND_U>, + &ARBDecompiler::Binary<OR_U>, + &ARBDecompiler::Binary<XOR_U>, + &ARBDecompiler::Unary<NOT_U>, + &ARBDecompiler::LogicalPick2, + &ARBDecompiler::LogicalAnd2, + + &ARBDecompiler::FloatComparison<SLT_F, false>, + &ARBDecompiler::FloatComparison<SEQ_F, false>, + &ARBDecompiler::FloatComparison<SLE_F, false>, + &ARBDecompiler::FloatComparison<SGT_F, false>, + &ARBDecompiler::FloatComparison<SNE_F, false>, + &ARBDecompiler::FloatComparison<SGE_F, false>, + &ARBDecompiler::FloatOrdered, + &ARBDecompiler::FloatUnordered, + &ARBDecompiler::FloatComparison<SLT_F, true>, + &ARBDecompiler::FloatComparison<SEQ_F, true>, + &ARBDecompiler::FloatComparison<SLE_F, true>, + &ARBDecompiler::FloatComparison<SGT_F, true>, + &ARBDecompiler::FloatComparison<SNE_F, true>, + &ARBDecompiler::FloatComparison<SGE_F, true>, + + &ARBDecompiler::Binary<SLT_S>, + &ARBDecompiler::Binary<SEQ_S>, + &ARBDecompiler::Binary<SLE_S>, + &ARBDecompiler::Binary<SGT_S>, + &ARBDecompiler::Binary<SNE_S>, + &ARBDecompiler::Binary<SGE_S>, + + &ARBDecompiler::Binary<SLT_U>, + &ARBDecompiler::Binary<SEQ_U>, + &ARBDecompiler::Binary<SLE_U>, + &ARBDecompiler::Binary<SGT_U>, + &ARBDecompiler::Binary<SNE_U>, + &ARBDecompiler::Binary<SGE_U>, + + &ARBDecompiler::LogicalAddCarry, + + &ARBDecompiler::HalfComparison<SLT_F, false>, + &ARBDecompiler::HalfComparison<SEQ_F, false>, + &ARBDecompiler::HalfComparison<SLE_F, false>, + &ARBDecompiler::HalfComparison<SGT_F, false>, + &ARBDecompiler::HalfComparison<SNE_F, false>, + &ARBDecompiler::HalfComparison<SGE_F, false>, + &ARBDecompiler::HalfComparison<SLT_F, true>, + &ARBDecompiler::HalfComparison<SEQ_F, true>, + &ARBDecompiler::HalfComparison<SLE_F, true>, + &ARBDecompiler::HalfComparison<SGT_F, true>, + &ARBDecompiler::HalfComparison<SNE_F, true>, + &ARBDecompiler::HalfComparison<SGE_F, true>, + + &ARBDecompiler::Texture, + &ARBDecompiler::Texture, + &ARBDecompiler::TextureGather, + &ARBDecompiler::TextureQueryDimensions, + &ARBDecompiler::TextureQueryLod, + &ARBDecompiler::TexelFetch, + &ARBDecompiler::TextureGradient, + + &ARBDecompiler::ImageLoad, + &ARBDecompiler::ImageStore, + + &ARBDecompiler::AtomicImage<ADD, U32>, + &ARBDecompiler::AtomicImage<AND, U32>, + &ARBDecompiler::AtomicImage<OR, U32>, + &ARBDecompiler::AtomicImage<XOR, U32>, + &ARBDecompiler::AtomicImage<EXCH, U32>, + + &ARBDecompiler::Atomic<EXCH, U32>, + &ARBDecompiler::Atomic<ADD, U32>, + &ARBDecompiler::Atomic<MIN, U32>, + &ARBDecompiler::Atomic<MAX, U32>, + &ARBDecompiler::Atomic<AND, U32>, + &ARBDecompiler::Atomic<OR, U32>, + &ARBDecompiler::Atomic<XOR, U32>, + + &ARBDecompiler::Atomic<EXCH, S32>, + &ARBDecompiler::Atomic<ADD, S32>, + &ARBDecompiler::Atomic<MIN, S32>, + &ARBDecompiler::Atomic<MAX, S32>, + &ARBDecompiler::Atomic<AND, S32>, + &ARBDecompiler::Atomic<OR, S32>, + &ARBDecompiler::Atomic<XOR, S32>, + + &ARBDecompiler::Atomic<ADD, U32>, + &ARBDecompiler::Atomic<MIN, U32>, + &ARBDecompiler::Atomic<MAX, U32>, + &ARBDecompiler::Atomic<AND, U32>, + &ARBDecompiler::Atomic<OR, U32>, + &ARBDecompiler::Atomic<XOR, U32>, + + &ARBDecompiler::Atomic<ADD, S32>, + &ARBDecompiler::Atomic<MIN, S32>, + &ARBDecompiler::Atomic<MAX, S32>, + &ARBDecompiler::Atomic<AND, S32>, + &ARBDecompiler::Atomic<OR, S32>, + &ARBDecompiler::Atomic<XOR, S32>, + + &ARBDecompiler::Branch, + &ARBDecompiler::BranchIndirect, + &ARBDecompiler::PushFlowStack, + &ARBDecompiler::PopFlowStack, + &ARBDecompiler::Exit, + &ARBDecompiler::Discard, + + &ARBDecompiler::EmitVertex, + &ARBDecompiler::EndPrimitive, + + &ARBDecompiler::InvocationId, + &ARBDecompiler::YNegate, + &ARBDecompiler::LocalInvocationId<'x'>, + &ARBDecompiler::LocalInvocationId<'y'>, + &ARBDecompiler::LocalInvocationId<'z'>, + &ARBDecompiler::WorkGroupId<'x'>, + &ARBDecompiler::WorkGroupId<'y'>, + &ARBDecompiler::WorkGroupId<'z'>, + + &ARBDecompiler::Unary<TGBALLOT_U>, + &ARBDecompiler::Unary<TGALL_U>, + &ARBDecompiler::Unary<TGANY_U>, + &ARBDecompiler::Unary<TGEQ_U>, + + &ARBDecompiler::ThreadId, + &ARBDecompiler::ThreadMask<'e', 'q'>, + &ARBDecompiler::ThreadMask<'g', 'e'>, + &ARBDecompiler::ThreadMask<'g', 't'>, + &ARBDecompiler::ThreadMask<'l', 'e'>, + &ARBDecompiler::ThreadMask<'l', 't'>, + &ARBDecompiler::ShuffleIndexed, + + &ARBDecompiler::Barrier, + &ARBDecompiler::MemoryBarrierGroup, + &ARBDecompiler::MemoryBarrierGlobal, + }; +}; + +ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier) + : device{device}, ir{ir}, registry{registry}, stage{stage} { + AddLine("TEMP RC;"); + AddLine("TEMP FSWZA[4];"); + AddLine("TEMP FSWZB[4];"); + if (ir.IsDecompiled()) { + DecompileAST(); + } else { + DecompileBranchMode(); + } + AddLine("END"); + + const std::string code = std::move(shader_source); + DeclareHeader(); + DeclareVertex(); + DeclareGeometry(); + DeclareFragment(); + DeclareCompute(); + DeclareInputAttributes(); + DeclareOutputAttributes(); + DeclareLocalMemory(); + DeclareGlobalMemory(); + DeclareConstantBuffers(); + DeclareRegisters(); + DeclareTemporaries(); + DeclarePredicates(); + DeclareInternalFlags(); + + shader_source += code; +} + +std::string_view HeaderStageName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + return "vp"; + case ShaderType::Geometry: + return "gp"; + case ShaderType::Fragment: + return "fp"; + case ShaderType::Compute: + return "cp"; + default: + UNREACHABLE(); + return ""; + } +} + +void ARBDecompiler::DeclareHeader() { + AddLine("!!NV{}5.0", HeaderStageName(stage)); + // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D + AddLine("OPTION NV_internal;"); + AddLine("OPTION NV_gpu_program_fp64;"); + AddLine("OPTION NV_shader_storage_buffer;"); + AddLine("OPTION NV_shader_thread_group;"); + if (ir.UsesWarps() && device.HasWarpIntrinsics()) { + AddLine("OPTION NV_shader_thread_shuffle;"); + } + if (stage == ShaderType::Vertex) { + if (device.HasNvViewportArray2()) { + AddLine("OPTION NV_viewport_array2;"); + } + } + if (stage == ShaderType::Fragment) { + AddLine("OPTION ARB_draw_buffers;"); + } + if (device.HasImageLoadFormatted()) { + AddLine("OPTION EXT_shader_image_load_formatted;"); + } +} + +void ARBDecompiler::DeclareVertex() { + if (stage != ShaderType::Vertex) { + return; + } + AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};"); +} + +void ARBDecompiler::DeclareGeometry() { + if (stage != ShaderType::Geometry) { + return; + } + const auto& info = registry.GetGraphicsInfo(); + const auto& header = ir.GetHeader(); + AddLine("PRIMITIVE_IN {};", PrimitiveDescription(info.primitive_topology)); + AddLine("PRIMITIVE_OUT {};", TopologyName(header.common3.output_topology)); + AddLine("VERTICES_OUT {};", header.common4.max_output_vertices.Value()); + AddLine("ATTRIB vertex_position = vertex.position;"); +} + +void ARBDecompiler::DeclareFragment() { + if (stage != ShaderType::Fragment) { + return; + } + AddLine("OUTPUT result_color7 = result.color[7];"); + AddLine("OUTPUT result_color6 = result.color[6];"); + AddLine("OUTPUT result_color5 = result.color[5];"); + AddLine("OUTPUT result_color4 = result.color[4];"); + AddLine("OUTPUT result_color3 = result.color[3];"); + AddLine("OUTPUT result_color2 = result.color[2];"); + AddLine("OUTPUT result_color1 = result.color[1];"); + AddLine("OUTPUT result_color0 = result.color;"); +} + +void ARBDecompiler::DeclareCompute() { + if (stage != ShaderType::Compute) { + return; + } + const ComputeInfo& info = registry.GetComputeInfo(); + AddLine("GROUP_SIZE {} {} {};", info.workgroup_size[0], info.workgroup_size[1], + info.workgroup_size[2]); + if (info.shared_memory_size_in_words > 0) { + const u32 size_in_bytes = info.shared_memory_size_in_words * 4; + AddLine("SHARED_MEMORY {};", size_in_bytes); + AddLine("SHARED shared_mem[] = {{program.sharedmem}};"); + } +} + +void ARBDecompiler::DeclareInputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + const std::string_view stage_name = StageInputName(stage); + for (const auto attribute : ir.GetInputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + + std::string_view suffix; + if (stage == ShaderType::Fragment) { + const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)}; + if (input_mode == PixelImap::Unused) { + return; + } + suffix = GetInputFlags(input_mode); + } + AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index, + index); + } +} + +void ARBDecompiler::DeclareOutputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", index, index, index); + } +} + +void ARBDecompiler::DeclareLocalMemory() { + u64 size = 0; + if (stage == ShaderType::Compute) { + size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL; + } else { + size = ir.GetHeader().GetLocalMemorySize(); + } + if (size == 0) { + return; + } + const u64 element_count = Common::AlignUp(size, 4) / 4; + AddLine("TEMP lmem[{}];", element_count); +} + +void ARBDecompiler::DeclareGlobalMemory() { + u32 binding = 0; // device.GetBaseBindings(stage).shader_storage_buffer; + for (const auto& pair : ir.GetGlobalMemory()) { + const auto& base = pair.first; + AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(base), binding); + ++binding; + } +} + +void ARBDecompiler::DeclareConstantBuffers() { + u32 binding = 0; + for (const auto& cbuf : ir.GetConstantBuffers()) { + AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", cbuf.first, binding); + ++binding; + } +} + +void ARBDecompiler::DeclareRegisters() { + for (const u32 gpr : ir.GetRegisters()) { + AddLine("TEMP R{};", gpr); + } +} + +void ARBDecompiler::DeclareTemporaries() { + for (std::size_t i = 0; i < max_temporaries; ++i) { + AddLine("TEMP T{};", i); + } +} + +void ARBDecompiler::DeclarePredicates() { + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("TEMP P{};", static_cast<u64>(pred)); + } +} + +void ARBDecompiler::DeclareInternalFlags() { + for (const char* name : INTERNAL_FLAG_NAMES) { + AddLine("TEMP {};", name); + } +} + +void ARBDecompiler::InitializeVariables() { + AddLine("MOV.F32 FSWZA[0], -1;"); + AddLine("MOV.F32 FSWZA[1], 1;"); + AddLine("MOV.F32 FSWZA[2], -1;"); + AddLine("MOV.F32 FSWZA[3], 0;"); + AddLine("MOV.F32 FSWZB[0], -1;"); + AddLine("MOV.F32 FSWZB[1], -1;"); + AddLine("MOV.F32 FSWZB[2], 1;"); + AddLine("MOV.F32 FSWZB[3], -1;"); + + if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) { + AddLine("MOV.F result.position, {{0, 0, 0, 1}};"); + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", index); + } + for (const u32 gpr : ir.GetRegisters()) { + AddLine("MOV.F R{}, {{0, 0, 0, 0}};", gpr); + } + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(pred)); + } +} + +void ARBDecompiler::DecompileAST() { + const u32 num_flow_variables = ir.GetASTNumVariables(); + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("TEMP F{};", i); + } + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("MOV.U F{}, {{0, 0, 0, 0}};", i); + } + + InitializeVariables(); + + VisitAST(ir.GetASTProgram()); +} + +void ARBDecompiler::DecompileBranchMode() { + static constexpr u32 FLOW_STACK_SIZE = 20; + if (!ir.IsFlowStackDisabled()) { + AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE); + AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE); + AddLine("TEMP SSY_TOP;"); + AddLine("TEMP PBK_TOP;"); + } + + AddLine("TEMP PC;"); + + if (!ir.IsFlowStackDisabled()) { + AddLine("MOV.U SSY_TOP.x, 0;"); + AddLine("MOV.U PBK_TOP.x, 0;"); + } + + InitializeVariables(); + + const auto basic_block_end = ir.GetBasicBlocks().end(); + auto basic_block_it = ir.GetBasicBlocks().begin(); + const u32 first_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", first_address); + + AddLine("REP;"); + + std::size_t num_blocks = 0; + while (basic_block_it != basic_block_end) { + const auto& [address, bb] = *basic_block_it; + ++num_blocks; + + AddLine("SEQ.S.CC RC.x, PC.x, {};", address); + AddLine("IF NE.x;"); + + VisitBlock(bb); + + ++basic_block_it; + + if (basic_block_it != basic_block_end) { + const auto op = std::get_if<OperationNode>(&*bb[bb.size() - 1]); + if (!op || op->GetCode() != OperationCode::Branch) { + const u32 next_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", next_address); + AddLine("CONT;"); + } + } + + AddLine("ELSE;"); + } + AddLine("RET;"); + while (num_blocks--) { + AddLine("ENDIF;"); + } + + AddLine("ENDREP;"); +} + +void ARBDecompiler::VisitAST(const ASTNode& node) { + if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) { + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + + AddLine("MOVC.U RC.x, {};", condition); + AddLine("IF NE.x;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("ENDIF;"); + } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) { + AddLine("ELSE;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) { + VisitBlock(ast->nodes); + } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) { + AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition)); + ResetTemporaries(); + } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + AddLine("REP;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("MOVC.U RC.x, {};", condition); + AddLine("BRK (NE.x);"); + AddLine("ENDREP;"); + } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) { + const bool is_true = ExprIsTrue(ast->condition); + if (!is_true) { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("IF NE.x;"); + ResetTemporaries(); + } + if (ast->kills) { + AddLine("KIL TR;"); + } else { + Exit(); + } + if (!is_true) { + AddLine("ENDIF;"); + } + } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) { + if (ExprIsTrue(ast->condition)) { + AddLine("BRK;"); + } else { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("BRK (NE.x);"); + ResetTemporaries(); + } + } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) { + // Nothing to do + } else { + UNREACHABLE(); + } +} + +std::string ARBDecompiler::VisitExpression(const Expr& node) { + if (const auto expr = std::get_if<ExprAnd>(&*node)) { + std::string result = AllocTemporary(); + AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if<ExprOr>(&*node)) { + std::string result = AllocTemporary(); + AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if<ExprNot>(&*node)) { + std::string result = AllocTemporary(); + AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1)); + return result; + } + if (const auto expr = std::get_if<ExprPredicate>(&*node)) { + return fmt::format("P{}.x", static_cast<u64>(expr->predicate)); + } + if (const auto expr = std::get_if<ExprCondCode>(&*node)) { + return Visit(ir.GetConditionCode(expr->cc)); + } + if (const auto expr = std::get_if<ExprVar>(&*node)) { + return fmt::format("F{}.x", expr->var_index); + } + if (const auto expr = std::get_if<ExprBoolean>(&*node)) { + return expr->value ? "0xffffffff" : "0"; + } + if (const auto expr = std::get_if<ExprGprEqual>(&*node)) { + std::string result = AllocTemporary(); + AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value); + return result; + } + UNREACHABLE(); + return "0"; +} + +void ARBDecompiler::VisitBlock(const NodeBlock& bb) { + for (const auto& node : bb) { + Visit(node); + } +} + +std::string ARBDecompiler::Visit(const Node& node) { + if (const auto operation = std::get_if<OperationNode>(&*node)) { + if (const auto amend_index = operation->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + const std::size_t index = static_cast<std::size_t>(operation->GetCode()); + if (index >= OPERATION_DECOMPILERS.size()) { + UNREACHABLE_MSG("Out of bounds operation: {}", index); + return {}; + } + const auto decompiler = OPERATION_DECOMPILERS[index]; + if (decompiler == nullptr) { + UNREACHABLE_MSG("Undefined operation: {}", index); + return {}; + } + return (this->*decompiler)(*operation); + } + + if (const auto gpr = std::get_if<GprNode>(&*node)) { + const u32 index = gpr->GetIndex(); + if (index == Register::ZeroIndex) { + return "{0, 0, 0, 0}.x"; + } + return fmt::format("R{}.x", index); + } + + if (const auto cv = std::get_if<CustomVarNode>(&*node)) { + return fmt::format("CV{}.x", cv->GetIndex()); + } + + if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { + std::string temporary = AllocTemporary(); + AddLine("MOV.U {}, {};", temporary, immediate->GetValue()); + return temporary; + } + + if (const auto predicate = std::get_if<PredicateNode>(&*node)) { + std::string temporary = AllocTemporary(); + switch (const auto index = predicate->GetIndex(); index) { + case Tegra::Shader::Pred::UnusedIndex: + AddLine("MOV.S {}, -1;", temporary); + break; + case Tegra::Shader::Pred::NeverExecute: + AddLine("MOV.S {}, 0;", temporary); + break; + default: + AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index)); + break; + } + if (predicate->IsNegated()) { + AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary); + } + return temporary; + } + + if (const auto abuf = std::get_if<AbufNode>(&*node)) { + if (abuf->IsPhysicalBuffer()) { + UNIMPLEMENTED_MSG("Physical buffers are not implemented"); + return "{0, 0, 0, 0}.x"; + } + + const auto buffer_index = [this, &abuf]() -> std::string { + if (stage != ShaderType::Geometry) { + return ""; + } + return fmt::format("[{}]", Visit(abuf->GetBuffer())); + }; + + const Attribute::Index index = abuf->GetIndex(); + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (index) { + case Attribute::Index::Position: { + if (stage == ShaderType::Geometry) { + return fmt::format("{}_position[{}].{}", StageInputName(stage), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.position.{}", StageInputName(stage), swizzle); + } + } + case Attribute::Index::TessCoordInstanceIDVertexID: + ASSERT(stage == ShaderType::Vertex); + switch (element) { + case 2: + return "vertex.instance"; + case 3: + return "vertex.id"; + } + UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); + break; + case Attribute::Index::PointCoord: + switch (element) { + case 0: + return "fragment.pointcoord.x"; + case 1: + return "fragment.pointcoord.y"; + } + UNIMPLEMENTED(); + break; + case Attribute::Index::FrontFacing: { + ASSERT(stage == ShaderType::Fragment); + ASSERT(element == 3); + const std::string temporary = AllocVectorTemporary(); + AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};"); + AddLine("MOV.U.CC RC.x, -RC;"); + AddLine("MOV.S {}.x, 0;", temporary); + AddLine("MOV.S {}.x (NE.x), -1;", temporary); + return fmt::format("{}.x", temporary); + } + default: + if (IsGenericAttribute(index)) { + if (stage == ShaderType::Geometry) { + return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.attrib[{}].{}", StageInputName(stage), + GetGenericAttributeIndex(index), swizzle); + } + } + UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index)); + break; + } + return "{0, 0, 0, 0}.x"; + } + + if (const auto cbuf = std::get_if<CbufNode>(&*node)) { + std::string offset_string; + const auto& offset = cbuf->GetOffset(); + if (const auto imm = std::get_if<ImmediateNode>(&*offset)) { + offset_string = std::to_string(imm->GetValue()); + } else { + offset_string = Visit(offset); + } + std::string temporary = AllocTemporary(); + AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string); + return temporary; + } + + if (const auto gmem = std::get_if<GmemNode>(&*node)) { + std::string temporary = AllocTemporary(); + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), + Visit(gmem->GetBaseAddress())); + AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()), + temporary); + return temporary; + } + + if (const auto lmem = std::get_if<LmemNode>(&*node)) { + std::string temporary = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", temporary, temporary); + AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary); + return temporary; + } + + if (const auto smem = std::get_if<SmemNode>(&*node)) { + std::string temporary = Visit(smem->GetAddress()); + AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary); + return temporary; + } + + if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { + const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag()); + return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } + + if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + if (const auto amend_index = conditional->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition())); + AddLine("IF NE.x;"); + VisitBlock(conditional->GetCode()); + AddLine("ENDIF;"); + return {}; + } + + if (const auto cmt = std::get_if<CommentNode>(&*node)) { + // Uncommenting this will generate invalid code. GLASM lacks comments. + // AddLine("// {}", cmt->GetText()); + return {}; + } + + UNIMPLEMENTED(); + return {}; +} + +std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + UNIMPLEMENTED_IF(meta.sampler.is_indexed); + UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array && + meta.sampler.type == Tegra::Shader::TextureType::TextureCube); + + const std::size_t count = operation.GetOperandsCount(); + std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + if (meta.sampler.is_array) { + AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array)); + } + if (meta.sampler.is_shadow) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare)); + } + return {std::move(temporary), i}; +} + +std::string ARBDecompiler::BuildAoffi(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + if (meta.aoffi.empty()) { + return {}; + } + const std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (auto& node : meta.aoffi) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i++), Visit(node)); + } + return fmt::format(", offset({})", temporary); +} + +void ARBDecompiler::Exit() { + if (stage != ShaderType::Fragment) { + AddLine("RET;"); + return; + } + + const auto safe_get_register = [this](u32 reg) -> std::string { + // TODO(Rodrigo): Replace with contains once C++20 releases + const auto& used_registers = ir.GetRegisters(); + if (used_registers.find(reg) != used_registers.end()) { + return fmt::format("R{}.x", reg); + } + return "{0, 0, 0, 0}.x"; + }; + + const auto& header = ir.GetHeader(); + u32 current_reg = 0; + for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) { + for (u32 component = 0; component < 4; ++component) { + if (!header.ps.IsColorComponentOutputEnabled(rt, component)) { + continue; + } + AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component), + safe_get_register(current_reg)); + ++current_reg; + } + } + if (header.ps.omap.depth) { + AddLine("MOV.F result.depth.z, {};", safe_get_register(current_reg + 1)); + } + + AddLine("RET;"); +} + +std::string ARBDecompiler::Assign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string dest_name; + if (const auto gpr = std::get_if<GprNode>(&*dest)) { + if (gpr->GetIndex() == Register::ZeroIndex) { + // Writing to Register::ZeroIndex is a no op + return {}; + } + dest_name = fmt::format("R{}.x", gpr->GetIndex()); + } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (const Attribute::Index index = abuf->GetIndex()) { + case Attribute::Index::Position: + dest_name = fmt::format("result.position.{}", swizzle); + break; + case Attribute::Index::LayerViewportPointSize: + switch (element) { + case 0: + UNIMPLEMENTED(); + return {}; + case 1: + case 2: + if (!device.HasNvViewportArray2()) { + LOG_ERROR( + Render_OpenGL, + "NV_viewport_array2 is missing. Maxwell gen 2 or better is required."); + return {}; + } + dest_name = element == 1 ? "result.layer.x" : "result.viewport.x"; + break; + case 3: + dest_name = "result.pointsize.x"; + break; + } + break; + case Attribute::Index::ClipDistances0123: + dest_name = fmt::format("result.clip[{}].x", element); + break; + case Attribute::Index::ClipDistances4567: + dest_name = fmt::format("result.clip[{}].x", element + 4); + break; + default: + if (!IsGenericAttribute(index)) { + UNREACHABLE(); + return {}; + } + dest_name = + fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle); + break; + } + } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { + const std::string address = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", address, address); + dest_name = fmt::format("lmem[{}].x", address); + } else if (const auto smem = std::get_if<SmemNode>(&*dest)) { + AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress())); + ResetTemporaries(); + return {}; + } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { + const std::string temporary = AllocTemporary(); + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), + Visit(gmem->GetBaseAddress())); + AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()), + temporary); + ResetTemporaries(); + return {}; + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", dest_name, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::Select(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]), + Visit(operation[2])); + return temporary; +} + +std::string ARBDecompiler::FClamp(Operation operation) { + // 1.0f in hex, replace with std::bit_cast on C++20 + static constexpr u32 POSITIVE_ONE = 0x3f800000; + + std::string temporary = AllocTemporary(); + const Node& value = operation[0]; + const Node& low = operation[1]; + const Node& high = operation[2]; + const auto* const imm_low = std::get_if<ImmediateNode>(&*low); + const auto* const imm_high = std::get_if<ImmediateNode>(&*high); + if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) { + AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value)); + } else { + AddLine("MIN.F {}, {}, {};", temporary, Visit(value), Visit(high)); + AddLine("MAX.F {}, {}, {};", temporary, temporary, Visit(low)); + } + return temporary; +} + +std::string ARBDecompiler::FCastHalf0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.x, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FCastHalf1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.y, {};", temporary, Visit(operation[0])); + AddLine("MOV {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FSqrt(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0])); + AddLine("RCP.F32 {}, {};", temporary, temporary); + return temporary; +} + +std::string ARBDecompiler::FSwizzleAdd(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. Kepler or better is required."); + AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1])); + return fmt::format("{}.x", temporary); + } + + AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage)); + AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary); + AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary); + AddLine("AND.U {}.z, {}.z, 3;", temporary, temporary); + AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", temporary, Visit(operation[0]), temporary); + AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", temporary, Visit(operation[1]), temporary); + AddLine("ADD.F32 {}.x, {}.x, {}.y;", temporary, temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HAdd2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("ADD.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HMul2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("MUL.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HFma2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string tmp3 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("UP2H.F {}.xy, {};", tmp3, Visit(operation[2])); + AddLine("MAD.F16 {}, {}, {}, {};", tmp1, tmp1, tmp2, tmp3); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HAbsolute(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, |{}|;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HNegate(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("MOVC.S RC.x, {};", Visit(operation[1])); + AddLine("MOV.F {}.x (NE.x), -{}.x;", temporary, temporary); + AddLine("MOVC.S RC.x, {};", Visit(operation[2])); + AddLine("MOV.F {}.y (NE.x), -{}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HClamp(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MAX.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[2])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MIN.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HCastFloat(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", temporary); + AddLine("MOV.F {}.x, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HUnpack(Operation operation) { + const std::string operand = Visit(operation[0]); + switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H0_H0: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H1_H1: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + } + UNREACHABLE(); + return "{0, 0, 0, 0}.x"; +} + +std::string ARBDecompiler::HMergeF32(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.x, {}.z;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.w;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HPack2(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, Visit(operation[0])); + AddLine("MOV.U {}.y, {};", temporary, Visit(operation[1])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::LogicalAssign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string target; + + if (const auto pred = std::get_if<PredicateNode>(&*dest)) { + ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment"); + + const Tegra::Shader::Pred index = pred->GetIndex(); + switch (index) { + case Tegra::Shader::Pred::NeverExecute: + case Tegra::Shader::Pred::UnusedIndex: + // Writing to these predicates is a no-op + return {}; + } + target = fmt::format("P{}.x", static_cast<u64>(index)); + } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) { + const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag()); + target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", target, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::LogicalPick2(Operation operation) { + std::string temporary = AllocTemporary(); + const u32 index = std::get<ImmediateNode>(*operation[1]).GetValue(); + AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index)); + return temporary; +} + +std::string ARBDecompiler::LogicalAnd2(Operation operation) { + std::string temporary = AllocTemporary(); + const std::string op = Visit(operation[0]); + AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op); + return temporary; +} + +std::string ARBDecompiler::FloatOrdered(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, -1;", temporary); + AddLine("MOV.S {} (NAN.x), 0;", temporary); + AddLine("MOV.S {} (NAN.y), 0;", temporary); + return temporary; +} + +std::string ARBDecompiler::FloatUnordered(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NAN.x), -1;", temporary); + AddLine("MOV.S {} (NAN.y), -1;", temporary); + return temporary; +} + +std::string ARBDecompiler::LogicalAddCarry(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("IF CF.x;"); + AddLine("MOV.S {}, -1;", temporary); + AddLine("ENDIF;"); + return temporary; +} + +std::string ARBDecompiler::Texture(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [temporary, swizzle] = BuildCoords(operation); + + std::string_view opcode = "TEX"; + std::string extra; + if (meta.bias) { + ASSERT(!meta.lod); + opcode = "TXB"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.bias)); + } else { + const std::string bias = AllocTemporary(); + AddLine("MOV.F {}, {};", bias, Visit(meta.bias)); + extra = fmt::format(" {},", bias); + } + } + if (meta.lod) { + ASSERT(!meta.bias); + opcode = "TXL"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } else { + const std::string lod = AllocTemporary(); + AddLine("MOV.F {}, {};", lod, Visit(meta.lod)); + extra = fmt::format(" {},", lod); + } + } + + AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureGather(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [temporary, swizzle] = BuildCoords(operation); + + std::string comp; + if (!meta.sampler.is_shadow) { + const auto& immediate = std::get<ImmediateNode>(*meta.component); + comp = fmt::format(".{}", Swizzle(immediate.GetValue())); + } + + AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryDimensions(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::string lod = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0"; + AddLine("TXQ {}, {}, texture[{}], {};", temporary, lod, sampler_id, TextureType(meta)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryLod(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::size_t count = operation.GetOperandsCount(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOD.F {}, {}, texture[{}], {};", temporary, temporary, sampler_id, TextureType(meta)); + AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", temporary, temporary); + AddLine("TRUNC.S {}, {};", temporary, temporary); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TexelFetch(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [temporary, swizzle] = BuildCoords(operation); + + if (!meta.sampler.is_buffer) { + ASSERT(swizzle < 4); + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } + AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta), + BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureGradient(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const std::string ddx = AllocVectorTemporary(); + const std::string ddy = AllocVectorTemporary(); + const std::string coord = BuildCoords(operation).first; + + const std::size_t num_components = meta.derivates.size() / 2; + for (std::size_t index = 0; index < num_components; ++index) { + const char swizzle = Swizzle(index); + AddLine("MOV.F {}.{}, {};", ddx, swizzle, Visit(meta.derivates[index * 2])); + AddLine("MOV.F {}.{}, {};", ddy, swizzle, Visit(meta.derivates[index * 2 + 1])); + } + + const std::string_view result = coord; + AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", result, coord, ddx, ddy, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.F {}.x, {}.{};", result, result, Swizzle(meta.element)); + return fmt::format("{}.x", result); +} + +std::string ARBDecompiler::ImageLoad(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t count = operation.GetOperandsCount(); + const std::string_view type = ImageType(meta.image.type); + + const std::string temporary = AllocVectorTemporary(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOADIM.F {}, {}, image[{}], {};", temporary, temporary, image_id, type); + AddLine("MOV.F {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::ImageStore(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + const std::string_view type = ImageType(meta.image.type); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value, coord, type); + return {}; +} + +std::string ARBDecompiler::Branch(Operation operation) { + const auto target = std::get<ImmediateNode>(*operation[0]); + AddLine("MOV.U PC.x, {};", target.GetValue()); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::BranchIndirect(Operation operation) { + AddLine("MOV.U PC.x, {};", Visit(operation[0])); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::PushFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + const u32 target = std::get<ImmediateNode>(*operation[0]).GetValue(); + const std::string_view stack_name = StackName(stack); + AddLine("MOV.U {}[{}_TOP.x].x, {};", stack_name, stack_name, target); + AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + return {}; +} + +std::string ARBDecompiler::PopFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + const std::string_view stack_name = StackName(stack); + AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", stack_name, stack_name); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::Exit(Operation) { + Exit(); + return {}; +} + +std::string ARBDecompiler::Discard(Operation) { + AddLine("KIL TR;"); + return {}; +} + +std::string ARBDecompiler::EmitVertex(Operation) { + AddLine("EMIT;"); + return {}; +} + +std::string ARBDecompiler::EndPrimitive(Operation) { + AddLine("ENDPRIM;"); + return {}; +} + +std::string ARBDecompiler::InvocationId(Operation) { + return "primitive.invocation"; +} + +std::string ARBDecompiler::YNegate(Operation) { + LOG_WARNING(Render_OpenGL, "(STUBBED)"); + const std::string temporary = AllocTemporary(); + AddLine("MOV.F {}, 1;", temporary); + return temporary; +} + +std::string ARBDecompiler::ThreadId(Operation) { + return fmt::format("{}.threadid", StageInputName(stage)); +} + +std::string ARBDecompiler::ShuffleIndexed(Operation operation) { + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. Kepler or better is required."); + return Visit(operation[0]); + } + const std::string temporary = AllocVectorTemporary(); + AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]), + Visit(operation[1])); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::Barrier(Operation) { + if (!ir.IsDecompiled()) { + LOG_ERROR(Render_OpenGL, "BAR used but shader is not decompiled"); + return {}; + } + AddLine("BAR;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGroup(Operation) { + AddLine("MEMBAR.CTA;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGlobal(Operation) { + AddLine("MEMBAR;"); + return {}; +} + +} // Anonymous namespace + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier) { + return ARBDecompiler(device, ir, registry, stage, identifier).Code(); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h new file mode 100644 index 000000000..6afc87220 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <string> +#include <string_view> + +#include "common/common_types.h" + +namespace Tegra::Engines { +enum class ShaderType : u32; +} + +namespace VideoCommon::Shader { +class ShaderIR; +class Registry; +} // namespace VideoCommon::Shader + +namespace OpenGL { + +class Device; + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier); + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 9964ea894..d9f7b4cc6 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -22,22 +22,46 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size) +Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { gl_buffer.Create(); glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); + if (device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); + } } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default; + +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const { + glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), + data); +} + +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const { + MICROPROFILE_SCOPE(OpenGL_Buffer_Download); + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + glGetNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), + data); +} + +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) const { + glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), + static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +} OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - const Device& device, std::size_t stream_size) - : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} { + const Device& device_, std::size_t stream_size) + : GenericBufferCache{rasterizer, system, + std::make_unique<OGLStreamBuffer>(device_, stream_size, true)}, + device{device_} { if (!device.HasFastBufferSubData()) { return; } - static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); + static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); for (const GLuint cbuf : cbufs) { glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); @@ -48,44 +72,21 @@ OGLBufferCache::~OGLBufferCache() { glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); } -Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<CachedBufferBlock>(cpu_addr, size); -} - -GLuint OGLBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); -} - -GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) { - return 0; +std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared<Buffer>(device, cpu_addr, size); } -void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { - glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { - MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); - glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { - glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset), - static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { + return {0, 0, 0}; } OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, std::size_t size) { DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); - const GLuint& cbuf = cbufs[cbuf_cursor++]; + const GLuint cbuf = cbufs[cbuf_cursor++]; + glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); - return {cbuf, 0}; + return {cbuf, 0, 0}; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a9e86cfc7..59d95adbc 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" @@ -24,57 +23,57 @@ class Device; class OGLStreamBuffer; class RasterizerOpenGL; -class CachedBufferBlock; +class Buffer : public VideoCommon::BufferBlock { +public: + explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size); + ~Buffer(); -using Buffer = std::shared_ptr<CachedBufferBlock>; -using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; + void Upload(std::size_t offset, std::size_t size, const u8* data) const; -class CachedBufferBlock : public VideoCommon::BufferBlock { -public: - explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size); - ~CachedBufferBlock(); + void Download(std::size_t offset, std::size_t size, u8* data) const; + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) const; - GLuint GetHandle() const { + GLuint Handle() const noexcept { return gl_buffer.handle; } + u64 Address() const noexcept { + return gpu_address; + } + private: OGLBuffer gl_buffer; + u64 gpu_address = 0; }; +using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; class OGLBufferCache final : public GenericBufferCache { public: explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, const Device& device, std::size_t stream_size); ~OGLBufferCache(); - GLuint GetEmptyBuffer(std::size_t) override; + BufferInfo GetEmptyBuffer(std::size_t) override; void Acquire() noexcept { cbuf_cursor = 0; } protected: - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - - GLuint ToHandle(const Buffer& buffer) override; - - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; + std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; private: + static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + + const Device& device; + std::size_t cbuf_cursor = 0; - std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram> - cbufs; + std::array<GLuint, NUM_CBUFS> cbufs{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index a14641b97..208fc6167 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -123,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS); u32 base_images = 0; - // Reserve more image bindings on fragment and vertex stages. + // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8. + // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the + // fragment stage, and at least 1 for the rest of the stages. + // So far games are observed to use 1 image binding on vertex and 4 on fragment stages. + + // Reserve at least 4 image bindings on the fragment stage. bindings[4].image = - Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]); - bindings[0].image = - Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]); + Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]); + + // This is guaranteed to be at least 1. + const u32 total_extracted_images = num_images / (NumStages - 1); // Reserve the other image bindings. - const u32 total_extracted_images = num_images / (NumStages - 2); - for (std::size_t i = 2; i < NumStages; ++i) { + for (std::size_t i = 0; i < NumStages; ++i) { const std::size_t stage = stage_swizzle[i]; + if (stage == 4) { + continue; + } bindings[stage].image = Extract(base_images, num_images, total_extracted_images, LimitImages[stage]); } @@ -170,7 +178,7 @@ bool IsASTCSupported() { for (const GLenum format : formats) { for (const GLenum support : required_support) { GLint value; - glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value); + glGetInternalformativ(target, format, support, 1, &value); if (value != GL_FULL_SUPPORT) { return false; } @@ -185,6 +193,7 @@ bool IsASTCSupported() { Device::Device() : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); + const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); @@ -208,13 +217,21 @@ Device::Device() has_shader_ballot = GLAD_GL_ARB_shader_ballot; has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted"); + has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod"); has_astc = IsASTCSupported(); has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); + has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; + has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory; + + // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive + // uniform buffers as "push constants" has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; + use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && - GLAD_GL_NV_compute_program5; + GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback && + GLAD_GL_NV_transform_feedback2; LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); @@ -235,6 +252,7 @@ Device::Device(std::nullptr_t) { has_shader_ballot = true; has_vertex_viewport_layer = true; has_image_load_formatted = true; + has_texture_shadow_lod = true; has_variable_aoffi = true; } diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 98cca0254..e1d811966 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -68,6 +68,14 @@ public: return has_image_load_formatted; } + bool HasTextureShadowLod() const { + return has_texture_shadow_lod; + } + + bool HasVertexBufferUnifiedMemory() const { + return has_vertex_buffer_unified_memory; + } + bool HasASTC() const { return has_astc; } @@ -88,6 +96,10 @@ public: return has_fast_buffer_sub_data; } + bool HasNvViewportArray2() const { + return has_nv_viewport_array2; + } + bool UseAssemblyShaders() const { return use_assembly_shaders; } @@ -106,11 +118,14 @@ private: bool has_shader_ballot{}; bool has_vertex_viewport_layer{}; bool has_image_load_formatted{}; + bool has_texture_shadow_lod{}; + bool has_vertex_buffer_unified_memory{}; bool has_astc{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; bool has_fast_buffer_sub_data{}; + bool has_nv_viewport_array2{}; bool use_assembly_shaders{}; }; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 55e79aaf6..e960a0ef1 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -30,6 +30,7 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -60,15 +61,28 @@ constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; -constexpr std::size_t NumSupportedVertexAttributes = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, ShaderType shader_type, std::size_t index = 0) { + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } if (entry.is_bindless) { - const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); - return engine.GetTextureInfo(tex_handle); + const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return engine.GetTextureInfo(handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { @@ -93,6 +107,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, return buffer.size; } +/// Translates hardware transform feedback indices +/// @param location Hardware location +/// @return Pair of ARB_transform_feedback3 token stream first and third arguments +/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt +std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) { + const u8 index = location / 4; + if (index >= 8 && index <= 39) { + return {GL_GENERIC_ATTRIB_NV, index - 8}; + } + if (index >= 48 && index <= 55) { + return {GL_TEXTURE_COORD_NV, index - 48}; + } + switch (index) { + case 7: + return {GL_POSITION, 0}; + case 40: + return {GL_PRIMARY_COLOR_NV, 0}; + case 41: + return {GL_SECONDARY_COLOR_NV, 0}; + case 42: + return {GL_BACK_PRIMARY_COLOR_NV, 0}; + case 43: + return {GL_BACK_SECONDARY_COLOR_NV, 0}; + } + UNIMPLEMENTED_MSG("index={}", static_cast<int>(index)); + return {GL_POSITION, 0}; +} + void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } @@ -152,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() { // avoid OpenGL errors. // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't // assume every shader uses them all. - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexFormat0 + index]) { continue; } @@ -171,9 +213,10 @@ void RasterizerOpenGL::SetupVertexFormat() { if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt || attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) { glVertexAttribIFormat(gl_index, attrib.ComponentCount(), - MaxwellToGL::VertexType(attrib), attrib.offset); + MaxwellToGL::VertexFormat(attrib), attrib.offset); } else { - glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), + glVertexAttribFormat(gl_index, attrib.ComponentCount(), + MaxwellToGL::VertexFormat(attrib), attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); } glVertexAttribBinding(gl_index, attrib.buffer); @@ -190,9 +233,11 @@ void RasterizerOpenGL::SetupVertexBuffer() { MICROPROFILE_SCOPE(OpenGL_VB); + const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); + // Upload all guest vertex arrays sequentially to our buffer const auto& regs = gpu.regs; - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { if (!flags[Dirty::VertexBuffer0 + index]) { continue; } @@ -205,16 +250,25 @@ void RasterizerOpenGL::SetupVertexBuffer() { const GPUVAddr start = vertex_array.StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - ASSERT(end >= start); + + const GLuint gl_index = static_cast<GLuint>(index); const u64 size = end - start; if (size == 0) { - glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride); + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + if (use_unified_memory) { + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); + } continue; } - const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); - glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset, - vertex_array.stride); + const auto info = buffer_cache.UploadMemory(start, size); + if (use_unified_memory) { + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, + info.address + info.offset, size); + } else { + glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); + } } } @@ -227,7 +281,7 @@ void RasterizerOpenGL::SetupVertexInstances() { flags[Dirty::VertexInstances] = false; const auto& regs = gpu.regs; - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexInstance0 + index]) { continue; } @@ -244,9 +298,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() { MICROPROFILE_SCOPE(OpenGL_Index); const auto& regs = system.GPU().Maxwell3D().regs; const std::size_t size = CalculateIndexBufferSize(); - const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer); - return offset; + const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); + return info.offset; } void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { @@ -282,7 +336,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { continue; } - Shader shader{shader_cache.GetStageProgram(program)}; + Shader* const shader = shader_cache.GetStageProgram(program); if (device.UseAssemblyShaders()) { // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this @@ -576,7 +630,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. - buffer_cache.Map(buffer_size); + const bool invalidated = buffer_cache.Map(buffer_size); + + if (invalidated) { + // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty + auto& dirty = gpu.dirty.flags; + dirty[Dirty::VertexBuffers] = true; + for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { + dirty[index] = true; + } + } // Prepare vertex array format. SetupVertexFormat(); @@ -593,9 +656,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { if (!device.UseAssemblyShaders()) { MaxwellUniformData ubo; ubo.SetFromRegs(gpu); - const auto [buffer, offset] = + const auto info = buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, + glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, static_cast<GLsizeiptr>(sizeof(ubo))); } @@ -842,7 +905,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { static constexpr std::array PARAMETER_LUT = { GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, @@ -872,7 +935,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad } } -void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); const auto& launch_desc = system.GPU().KeplerCompute().launch_description; const auto& entries = kernel->GetEntries(); @@ -906,8 +969,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, if (device.UseAssemblyShaders()) { glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, - buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float)); + glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float)); } return; } @@ -920,28 +982,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); const GPUVAddr gpu_addr = buffer.address; - auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); + auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); if (device.UseAssemblyShaders()) { UNIMPLEMENTED_IF(use_unified); - if (offset != 0) { + if (info.offset != 0) { const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; - glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); - cbuf = staging_cbuf; - offset = 0; + glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); + info.handle = staging_cbuf; + info.offset = 0; } - glBindBufferRangeNV(stage, binding, cbuf, offset, size); + glBindBufferRangeNV(stage, binding, info.handle, info.offset, size); return; } if (use_unified) { - glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size); + glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, + unified_offset, size); } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); + glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size); } } -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; @@ -956,7 +1019,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad } } -void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; @@ -973,13 +1036,12 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size) { const auto alignment{device.GetShaderStorageBufferAlignment()}; - const auto [ssbo, buffer_offset] = - buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset, + const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, static_cast<GLsizeiptr>(size)); } -void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; @@ -992,7 +1054,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& } } -void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; @@ -1021,7 +1083,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu } } -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).image; for (const auto& entry : shader->GetEntries().images) { @@ -1031,7 +1093,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh } } -void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { +void RasterizerOpenGL::SetupComputeImages(Shader* shader) { const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : shader->GetEntries().images) { @@ -1547,12 +1609,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() { oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); } +void RasterizerOpenGL::SyncTransformFeedback() { + // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal + // when this is required. + const auto& regs = system.GPU().Maxwell3D().regs; + + static constexpr std::size_t STRIDE = 3; + std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs; + std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams; + + GLint* cursor = attribs.data(); + GLint* current_stream = streams.data(); + + for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) { + const auto& layout = regs.tfb_layouts[feedback]; + UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding"); + if (layout.varying_count == 0) { + continue; + } + + *current_stream = static_cast<GLint>(feedback); + if (current_stream != streams.data()) { + // When stepping one stream, push the expected token + cursor[0] = GL_NEXT_BUFFER_NV; + cursor[1] = 0; + cursor[2] = 0; + cursor += STRIDE; + } + ++current_stream; + + const auto& locations = regs.tfb_varying_locs[feedback]; + std::optional<u8> current_index; + for (u32 offset = 0; offset < layout.varying_count; ++offset) { + const u8 location = locations[offset]; + const u8 index = location / 4; + + if (current_index == index) { + // Increase number of components of the previous attachment + ++cursor[-2]; + continue; + } + current_index = index; + + std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location); + cursor[1] = 1; + cursor += STRIDE; + } + } + + const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE); + const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data()); + glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(), + GL_INTERLEAVED_ATTRIBS); +} + void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { const auto& regs = system.GPU().Maxwell3D().regs; if (regs.tfb_enabled == 0) { return; } + if (device.UseAssemblyShaders()) { + SyncTransformFeedback(); + } + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); @@ -1579,6 +1699,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { static_cast<GLsizeiptr>(size)); } + // We may have to call BeginTransformFeedbackNV here since they seem to call different + // implementations on Nvidia's driver (the pointer is different) but we are using + // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB + // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works. glBeginTransformFeedback(GL_POINTS); } @@ -1600,8 +1724,9 @@ void RasterizerOpenGL::EndTransformFeedback() { const GLuint handle = transform_feedback_buffers[index].handle; const GPUVAddr gpu_addr = binding.Address(); const std::size_t size = binding.buffer_size; - const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size)); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, + static_cast<GLsizeiptr>(size)); } } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index f5dc56a0e..4f082592f 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -19,7 +19,6 @@ #include "video_core/engines/const_buffer_info.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_accelerated.h" -#include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" @@ -100,10 +99,10 @@ private: void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil); /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); + void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(const Shader& kernel); + void SetupComputeConstBuffers(Shader* kernel); /// Configures a constant buffer. void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, @@ -111,30 +110,30 @@ private: std::size_t unified_offset); /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); + void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(const Shader& kernel); + void SetupComputeGlobalMemory(Shader* kernel); /// Configures a constant buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size); /// Configures the current textures to use for the draw command. - void SetupDrawTextures(std::size_t stage_index, const Shader& shader); + void SetupDrawTextures(std::size_t stage_index, Shader* shader); /// Configures the textures used in a compute shader. - void SetupComputeTextures(const Shader& kernel); + void SetupComputeTextures(Shader* kernel); /// Configures a texture. void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); /// Configures images in a graphics shader. - void SetupDrawImages(std::size_t stage_index, const Shader& shader); + void SetupDrawImages(std::size_t stage_index, Shader* shader); /// Configures images in a compute shader. - void SetupComputeImages(const Shader& shader); + void SetupComputeImages(Shader* shader); /// Configures an image. void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); @@ -202,6 +201,10 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Syncs transform feedback state to match guest state + /// @note Only valid on assembly shaders + void SyncTransformFeedback(); + /// Begin a transform feedback void BeginTransformFeedback(GLenum primitive_mode); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index a991ca64a..c6a3bf3a1 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -20,6 +20,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" @@ -29,6 +30,7 @@ #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -147,7 +149,8 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u auto program = std::make_shared<ProgramHandle>(); if (device.UseAssemblyShaders()) { - const std::string arb = "Not implemented"; + const std::string arb = + DecompileAssemblyShader(device, ir, registry, shader_type, shader_id); GLuint& arb_prog = program->assembly_program.handle; @@ -194,12 +197,9 @@ std::unordered_set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, ProgramSharedPtr program_) - : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, - size_in_bytes{size_in_bytes}, program{std::move(program_)} { - // Assign either the assembly program or source program. We can't have both. +Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_, + ProgramSharedPtr program_) + : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} { handle = program->assembly_program.handle; if (handle == 0) { handle = program->source_program.handle; @@ -207,16 +207,16 @@ CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, ASSERT(handle != 0); } -CachedShader::~CachedShader() = default; +Shader::~Shader() = default; -GLuint CachedShader::GetHandle() const { +GLuint Shader::GetHandle() const { DEBUG_ASSERT(registry->IsConsistent()); return handle; } -Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, ProgramCode code, - ProgramCode code_b) { +std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode code, ProgramCode code_b) { const auto shader_type = GetShaderType(program_type); const std::size_t size_in_bytes = code.size() * sizeof(u64); @@ -241,12 +241,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), - MakeEntries(params.device, ir, shader_type), std::move(program))); + return std::unique_ptr<Shader>(new Shader( + std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program))); } -Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { +std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code) { const std::size_t size_in_bytes = code.size() * sizeof(u64); auto& engine = params.system.GPU().KeplerCompute(); @@ -266,23 +266,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), - MakeEntries(params.device, ir, ShaderType::Compute), std::move(program))); + return std::unique_ptr<Shader>(new Shader(std::move(registry), + MakeEntries(params.device, ir, ShaderType::Compute), + std::move(program))); } -Shader CachedShader::CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes) { - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry, - precompiled_shader.entries, precompiled_shader.program)); +std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader) { + return std::unique_ptr<Shader>(new Shader( + precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, Core::Frontend::EmuWindow& emu_window, const Device& device) - : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device}, - disk_cache{system} {} + : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, + emu_window{emu_window}, device{device}, disk_cache{system} {} + +ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { @@ -436,7 +436,7 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( return program; } -Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { +Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { return last_shaders[static_cast<std::size_t>(program)]; } @@ -446,8 +446,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { // Look up shader in the cache based on address const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; - Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader}; - if (shader) { + if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { return last_shaders[static_cast<std::size_t>(program)] = shader; } @@ -461,62 +460,64 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { const u8* host_ptr_b = memory_manager.GetPointer(address_b); code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false); } + const std::size_t code_size = code.size() * sizeof(u64); - const auto unique_identifier = GetUniqueIdentifier( + const u64 unique_identifier = GetUniqueIdentifier( GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b); const ShaderParameters params{system, disk_cache, device, *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> shader; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), - std::move(code_b)); + shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + shader = Shader::CreateFromCache(params, found->second); } + Shader* const result = shader.get(); if (cpu_addr) { - Register(shader); + Register(std::move(shader), *cpu_addr, code_size); } else { - null_shader = shader; + null_shader = std::move(shader); } - return last_shaders[static_cast<std::size_t>(program)] = shader; + return last_shaders[static_cast<std::size_t>(program)] = result; } -Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { +Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { auto& memory_manager{system.GPU().MemoryManager()}; const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; - auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel; - if (kernel) { + if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) { return kernel; } const auto host_ptr{memory_manager.GetPointer(code_addr)}; // No kernel found, create a new one - auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)}; - const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; + ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)}; + const std::size_t code_size{code.size() * sizeof(u64)}; + const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; const ShaderParameters params{system, disk_cache, device, *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> kernel; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + kernel = Shader::CreateKernelFromMemory(params, std::move(code)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + kernel = Shader::CreateFromCache(params, found->second); } + Shader* const result = kernel.get(); if (cpu_addr) { - Register(kernel); + Register(std::move(kernel), *cpu_addr, code_size); } else { - null_kernel = kernel; + null_kernel = std::move(kernel); } - return kernel; + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index b2ae8d7f9..994aaeaf2 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -18,12 +18,12 @@ #include "common/common_types.h" #include "video_core/engines/shader_type.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace Core { class System; @@ -35,12 +35,9 @@ class EmuWindow; namespace OpenGL { -class CachedShader; class Device; class RasterizerOpenGL; -struct UnspecializedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; struct ProgramHandle { @@ -64,62 +61,53 @@ struct ShaderParameters { u64 unique_identifier; }; -class CachedShader final : public RasterizerCacheObject { +class Shader final { public: - ~CachedShader(); + ~Shader(); /// Gets the GL program handle for the shader GLuint GetHandle() const; - /// Returns the size in bytes of the shader - std::size_t GetSizeInBytes() const override { - return size_in_bytes; - } - /// Gets the shader entries for the shader const ShaderEntries& GetEntries() const { return entries; } - static Shader CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, ProgramCode program_code_b); - static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); + static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode program_code, + ProgramCode program_code_b); + static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code); - static Shader CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes); + static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader); private: - explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, ProgramSharedPtr program); + explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries, + ProgramSharedPtr program); std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; - std::size_t size_in_bytes = 0; ProgramSharedPtr program; GLuint handle = 0; }; -class ShaderCacheOpenGL final : public RasterizerCache<Shader> { +class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> { public: explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, Core::Frontend::EmuWindow& emu_window, const Device& device); + ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game void LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program - Shader GetStageProgram(Maxwell::ShaderProgram program); + Shader* GetStageProgram(Maxwell::ShaderProgram program); /// Gets a compute kernel in the passed address - Shader GetComputeKernel(GPUVAddr code_addr); - -protected: - // We do not have to flush this cache as things in it are never modified by us. - void FlushObjectInner(const Shader& object) override {} + Shader* GetComputeKernel(GPUVAddr code_addr); private: ProgramSharedPtr GeneratePrecompiledProgram( @@ -132,10 +120,10 @@ private: ShaderDiskCacheOpenGL disk_cache; std::unordered_map<u64, PrecompiledShader> runtime_cache; - Shader null_shader{}; - Shader null_kernel{}; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index d6e30b321..2c49aeaac 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode; using Tegra::Shader::IpaSampleMode; using Tegra::Shader::PixelImap; using Tegra::Shader::Register; +using Tegra::Shader::TextureType; using VideoCommon::Shader::BuildTransformFeedback; using VideoCommon::Shader::Registry; @@ -526,6 +527,9 @@ private: if (device.HasImageLoadFormatted()) { code.AddLine("#extension GL_EXT_shader_image_load_formatted : require"); } + if (device.HasTextureShadowLod()) { + code.AddLine("#extension GL_EXT_texture_shadow_lod : require"); + } if (device.HasWarpIntrinsics()) { code.AddLine("#extension GL_NV_gpu_shader5 : require"); code.AddLine("#extension GL_NV_shader_thread_group : require"); @@ -909,13 +913,13 @@ private: return "samplerBuffer"; } switch (sampler.type) { - case Tegra::Shader::TextureType::Texture1D: + case TextureType::Texture1D: return "sampler1D"; - case Tegra::Shader::TextureType::Texture2D: + case TextureType::Texture2D: return "sampler2D"; - case Tegra::Shader::TextureType::Texture3D: + case TextureType::Texture3D: return "sampler3D"; - case Tegra::Shader::TextureType::TextureCube: + case TextureType::TextureCube: return "samplerCube"; default: UNREACHABLE(); @@ -1380,8 +1384,19 @@ private: const std::size_t count = operation.GetOperandsCount(); const bool has_array = meta->sampler.is_array; const bool has_shadow = meta->sampler.is_shadow; + const bool workaround_lod_array_shadow_as_grad = + !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube); + + std::string expr = "texture"; + + if (workaround_lod_array_shadow_as_grad) { + expr += "Grad"; + } else { + expr += function_suffix; + } - std::string expr = "texture" + function_suffix; if (!meta->aoffi.empty()) { expr += "Offset"; } else if (!meta->ptp.empty()) { @@ -1415,6 +1430,16 @@ private: expr += ')'; } + if (workaround_lod_array_shadow_as_grad) { + switch (meta->sampler.type) { + case TextureType::Texture2D: + return expr + ", vec2(0.0), vec2(0.0))"; + case TextureType::TextureCube: + return expr + ", vec3(0.0), vec3(0.0))"; + } + UNREACHABLE(); + } + for (const auto& variant : extras) { if (const auto argument = std::get_if<TextureArgument>(&variant)) { expr += GenerateTextureArgument(*argument); @@ -2041,8 +2066,19 @@ private: const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - std::string expr = GenerateTexture( - operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); + std::string expr{}; + + if (!device.HasTextureShadowLod() && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube)) { + LOG_ERROR(Render_OpenGL, + "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround"); + expr = GenerateTexture(operation, "Lod", {}); + } else { + expr = GenerateTexture(operation, "Lod", + {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); + } + if (meta->sampler.is_shadow) { expr = "vec4(" + expr + ')'; } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 9e95a122b..653c3f2f9 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap; namespace { +using VideoCommon::Shader::SeparateSamplerKey; + using ShaderCacheVersionHash = std::array<u8, 64>; struct ConstBufferKey { @@ -37,18 +39,26 @@ struct ConstBufferKey { u32 value = 0; }; -struct BoundSamplerKey { +struct BoundSamplerEntry { u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -struct BindlessSamplerKey { +struct SeparateSamplerEntry { + u32 cbuf1 = 0; + u32 cbuf2 = 0; + u32 offset1 = 0; + u32 offset2 = 0; + Tegra::Engines::SamplerDescriptor sampler; +}; + +struct BindlessSamplerEntry { u32 cbuf = 0; u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 20; +constexpr u32 NativeVersion = 21; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { u32 texture_handler_size_value; u32 num_keys; u32 num_bound_samplers; + u32 num_separate_samplers; u32 num_bindless_samplers; if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || file.ReadArray(&is_texture_handler_size_known, 1) != 1 || file.ReadArray(&texture_handler_size_value, 1) != 1 || file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_separate_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { return false; } @@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { } std::vector<ConstBufferKey> flat_keys(num_keys); - std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers); + std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers); + std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers); + std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers); if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != flat_bound_samplers.size() || + file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) != + flat_separate_samplers.size() || file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != flat_bindless_samplers.size()) { return false; } - for (const auto& key : flat_keys) { - keys.insert({{key.cbuf, key.offset}, key.value}); + for (const auto& entry : flat_keys) { + keys.insert({{entry.cbuf, entry.offset}, entry.value}); } - for (const auto& key : flat_bound_samplers) { - bound_samplers.emplace(key.offset, key.sampler); + for (const auto& entry : flat_bound_samplers) { + bound_samplers.emplace(entry.offset, entry.sampler); } - for (const auto& key : flat_bindless_samplers) { - bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + for (const auto& entry : flat_separate_samplers) { + SeparateSamplerKey key; + key.buffers = {entry.cbuf1, entry.cbuf2}; + key.offsets = {entry.offset1, entry.offset2}; + separate_samplers.emplace(key, entry.sampler); + } + for (const auto& entry : flat_bindless_samplers) { + bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler}); } return true; @@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || file.WriteObject(static_cast<u32>(keys.size())) != 1 || file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 || + file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 || file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) { return false; } @@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); } - std::vector<BoundSamplerKey> flat_bound_samplers; + std::vector<BoundSamplerEntry> flat_bound_samplers; flat_bound_samplers.reserve(bound_samplers.size()); for (const auto& [address, sampler] : bound_samplers) { - flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler}); + } + + std::vector<SeparateSamplerEntry> flat_separate_samplers; + flat_separate_samplers.reserve(separate_samplers.size()); + for (const auto& [key, sampler] : separate_samplers) { + SeparateSamplerEntry entry; + std::tie(entry.cbuf1, entry.cbuf2) = key.buffers; + std::tie(entry.offset1, entry.offset2) = key.offsets; + entry.sampler = sampler; + flat_separate_samplers.push_back(entry); } - std::vector<BindlessSamplerKey> flat_bindless_samplers; + std::vector<BindlessSamplerEntry> flat_bindless_samplers; flat_bindless_samplers.reserve(bindless_samplers.size()); for (const auto& [address, sampler] : bindless_samplers) { flat_bindless_samplers.push_back( - BindlessSamplerKey{address.first, address.second, sampler}); + BindlessSamplerEntry{address.first, address.second, sampler}); } return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == flat_bound_samplers.size() && + file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) == + flat_separate_samplers.size() && file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == flat_bindless_samplers.size(); } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index d5be52e40..a79cef0e9 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry { VideoCommon::Shader::ComputeInfo compute_info; VideoCommon::Shader::KeyMap keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::SeparateSamplerMap separate_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 6ec328c53..3655ff629 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -2,11 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <deque> +#include <tuple> #include <vector> + #include "common/alignment.h" #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", @@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent, - bool use_persistent) +OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage) : buffer_size(size) { gl_buffer.Create(); @@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p allocate_size *= 2; } - if (use_persistent) { - persistent = true; - coherent = prefer_coherent; - const GLbitfield flags = - GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); - glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); - mapped_ptr = static_cast<u8*>(glMapNamedBufferRange( - gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT))); - } else { - glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW); + static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; + glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); + mapped_ptr = static_cast<u8*>( + glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); + + if (device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); } } OGLStreamBuffer::~OGLStreamBuffer() { - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } + glUnmapNamedBuffer(gl_buffer.handle); gl_buffer.Release(); } -GLuint OGLStreamBuffer::GetHandle() const { - return gl_buffer.handle; -} - -GLsizeiptr OGLStreamBuffer::GetSize() const { - return buffer_size; -} - std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { ASSERT(size <= buffer_size); ASSERT(alignment <= buffer_size); @@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a bool invalidate = false; if (buffer_pos + size > buffer_size) { + MICROPROFILE_SCOPE(OpenGL_StreamBuffer); + glInvalidateBufferData(gl_buffer.handle); + buffer_pos = 0; invalidate = true; - - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } } - if (invalidate || !persistent) { - MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) | - (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) | - (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT); - mapped_ptr = static_cast<u8*>( - glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags)); - mapped_offset = buffer_pos; - } - - return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate); + return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate); } void OGLStreamBuffer::Unmap(GLsizeiptr size) { ASSERT(size <= mapped_size); - if (!coherent && size > 0) { - glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size); - } - - if (!persistent) { - glUnmapNamedBuffer(gl_buffer.handle); + if (size > 0) { + glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size); } buffer_pos += size; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index f8383cbd4..307a67113 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -11,15 +11,13 @@ namespace OpenGL { +class Device; + class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false, - bool use_persistent = true); + explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage); ~OGLStreamBuffer(); - GLuint GetHandle() const; - GLsizeiptr GetSize() const; - /* * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes * and the optional alignment requirement. @@ -32,15 +30,24 @@ public: void Unmap(GLsizeiptr size); + GLuint Handle() const { + return gl_buffer.handle; + } + + u64 Address() const { + return gpu_address; + } + + GLsizeiptr Size() const noexcept { + return buffer_size; + } + private: OGLBuffer gl_buffer; - bool coherent = false; - bool persistent = false; - + GLuint64EXT gpu_address = 0; GLintptr buffer_pos = 0; GLsizeiptr buffer_size = 0; - GLintptr mapped_offset = 0; GLsizeiptr mapped_size = 0; u8* mapped_ptr = nullptr; }; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 57db5a08b..61505879b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -263,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param target = GetTextureTarget(params.target); texture = CreateTexture(params, target, internal_format, texture_buffer); DecorateSurfaceName(); - main_view = CreateViewInner( - ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels), - true); + + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + + main_view = + CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true); } CachedSurface::~CachedSurface() = default; @@ -413,20 +418,23 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p CachedSurfaceView::~CachedSurfaceView() = default; -void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { +void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const { ASSERT(params.num_levels == 1); + if (params.target == SurfaceTarget::Texture3D) { + if (params.num_layers > 1) { + ASSERT(params.base_layer == 0); + glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); + } else { + glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, + params.base_level, params.base_layer); + } + return; + } + if (params.num_layers > 1) { - // Layered framebuffer attachments UNIMPLEMENTED_IF(params.base_layer != 0); - - switch (params.target) { - case SurfaceTarget::Texture2DArray: - glFramebufferTexture(target, attachment, GetTexture(), 0); - break; - default: - UNIMPLEMENTED(); - } + glFramebufferTexture(fb_target, attachment, GetTexture(), 0); return; } @@ -434,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { const GLuint texture = surface.GetTexture(); switch (surface.GetSurfaceParams().target) { case SurfaceTarget::Texture1D: - glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture2D: - glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture1DArray: case SurfaceTarget::Texture2DArray: case SurfaceTarget::TextureCubemap: case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(target, attachment, texture, params.base_level, + glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level, params.base_layer); break; default: @@ -500,8 +508,13 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const { OGLTextureView texture_view; texture_view.Create(); - glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, - params.num_levels, params.base_layer, params.num_layers); + if (target == GL_TEXTURE_3D) { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, 0, 1); + } else { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, params.base_layer, params.num_layers); + } ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); return texture_view; @@ -544,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Tegra::Engines::Fermi2D::Config& copy_config) { const auto& src_params{src_view->GetSurfaceParams()}; const auto& dst_params{dst_view->GetSurfaceParams()}; - UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); - UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); + UNIMPLEMENTED_IF(src_params.depth != 1); + UNIMPLEMENTED_IF(dst_params.depth != 1); state_tracker.NotifyScissor0(); state_tracker.NotifyFramebuffer(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 8a2ac8603..bfc4ddf5d 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -80,8 +80,10 @@ public: explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy); ~CachedSurfaceView(); - /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER - void Attach(GLenum attachment, GLenum target) const; + /// @brief Attaches this texture view to the currently bound fb_target framebuffer + /// @param attachment Attachment to bind textures to + /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER) + void Attach(GLenum attachment, GLenum fb_target) const; GLuint GetTexture(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 994ae98eb..774e70a5b 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -24,10 +24,11 @@ namespace MaxwellToGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -inline GLenum VertexType(Maxwell::VertexAttribute attrib) { +inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { switch (attrib.type) { - case Maxwell::VertexAttribute::Type::UnsignedInt: case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::UnsignedInt: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -46,12 +47,11 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { return GL_UNSIGNED_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; } - case Maxwell::VertexAttribute::Type::SignedInt: + break; case Maxwell::VertexAttribute::Type::SignedNorm: + case Maxwell::VertexAttribute::Type::SignedScaled: + case Maxwell::VertexAttribute::Type::SignedInt: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -70,10 +70,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { return GL_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; } + break; case Maxwell::VertexAttribute::Type::Float: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_16: @@ -86,46 +84,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32: case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; - } - case Maxwell::VertexAttribute::Type::UnsignedScaled: - switch (attrib.size) { - case Maxwell::VertexAttribute::Size::Size_8: - case Maxwell::VertexAttribute::Size::Size_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return GL_UNSIGNED_BYTE; - case Maxwell::VertexAttribute::Size::Size_16: - case Maxwell::VertexAttribute::Size::Size_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return GL_UNSIGNED_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; } - case Maxwell::VertexAttribute::Type::SignedScaled: - switch (attrib.size) { - case Maxwell::VertexAttribute::Size::Size_8: - case Maxwell::VertexAttribute::Size::Size_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return GL_BYTE; - case Maxwell::VertexAttribute::Size::Size_16: - case Maxwell::VertexAttribute::Size::Size_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return GL_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; - } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - return {}; + break; } + UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(), + attrib.SizeString()); + return {}; } inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { @@ -137,8 +101,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { case Maxwell::IndexFormat::UnsignedInt: return GL_UNSIGNED_INT; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format)); - UNREACHABLE(); + UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format)); return {}; } @@ -180,33 +143,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { } inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, - Tegra::Texture::TextureMipmapFilter mip_filter_mode) { + Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) { switch (filter_mode) { - case Tegra::Texture::TextureFilter::Linear: { - switch (mip_filter_mode) { + case Tegra::Texture::TextureFilter::Nearest: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: - return GL_LINEAR; + return GL_NEAREST; case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_LINEAR_MIPMAP_NEAREST; + return GL_NEAREST_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_LINEAR_MIPMAP_LINEAR; + return GL_NEAREST_MIPMAP_LINEAR; } break; - } - case Tegra::Texture::TextureFilter::Nearest: { - switch (mip_filter_mode) { + case Tegra::Texture::TextureFilter::Linear: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: - return GL_NEAREST; + return GL_LINEAR; case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_NEAREST_MIPMAP_NEAREST; + return GL_LINEAR_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_NEAREST_MIPMAP_LINEAR; + return GL_LINEAR_MIPMAP_LINEAR; } break; } - } - LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode)); - return GL_LINEAR; + UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}", + static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode)); + return GL_NEAREST; } inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { @@ -229,10 +191,9 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { } else { return GL_MIRROR_CLAMP_TO_EDGE; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); - return GL_REPEAT; } + UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); + return GL_REPEAT; } inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { @@ -254,8 +215,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { case Tegra::Texture::DepthCompareFunc::Always: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}", - static_cast<u32>(func)); + UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func)); return GL_GREATER; } @@ -277,7 +237,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) { case Maxwell::Blend::Equation::MaxGL: return GL_MAX; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation)); + UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation)); return GL_FUNC_ADD; } @@ -341,7 +301,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return GL_ONE_MINUS_CONSTANT_ALPHA; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor)); + UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor)); return GL_ZERO; } @@ -361,7 +321,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) { case Tegra::Texture::SwizzleSource::OneFloat: return GL_ONE; } - LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source)); + UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source)); return GL_ZERO; } @@ -392,7 +352,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison)); + UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison)); return GL_ALWAYS; } @@ -423,7 +383,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) { case Maxwell::StencilOp::DecrWrapOGL: return GL_DECR_WRAP; } - LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil)); + UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil)); return GL_KEEP; } @@ -434,7 +394,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) { case Maxwell::FrontFace::CounterClockWise: return GL_CCW; } - LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face)); + UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face)); return GL_CCW; } @@ -447,7 +407,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) { case Maxwell::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } - LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face)); + UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); return GL_BACK; } @@ -486,7 +446,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { case Maxwell::LogicOperation::Set: return GL_SET; } - LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation)); return GL_COPY; } diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 6214fcbc3..c40adb6e7 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() { // Clear screen to black LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); + + // Enable unified vertex attributes and query vertex buffer address when the driver supports it + if (device.HasVertexBufferUnifiedMemory()) { + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + + glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, + &vertex_buffer_address); + } } void RendererOpenGL::AddTelemetryFields() { @@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { offsetof(ScreenRectVertex, tex_coord)); glVertexAttribBinding(PositionLocation, 0); glVertexAttribBinding(TexCoordLocation, 0); - glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + if (device.HasVertexBufferUnifiedMemory()) { + glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address, + sizeof(vertices)); + } else { + glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + } glBindTextureUnit(0, screen_info.display_texture); glBindSampler(0, 0); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 61bf507f4..8b18d32e6 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -107,6 +107,9 @@ private: OGLPipeline pipeline; OGLFramebuffer screenshot_framebuffer; + // GPU address of the vertex buffer + GLuint64EXT vertex_buffer_address = 0; + /// Display information for Switch screen ScreenInfo screen_info; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 62e950d31..d7f1ae89f 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -21,29 +21,29 @@ namespace Sampler { VkFilter Filter(Tegra::Texture::TextureFilter filter) { switch (filter) { - case Tegra::Texture::TextureFilter::Linear: - return VK_FILTER_LINEAR; case Tegra::Texture::TextureFilter::Nearest: return VK_FILTER_NEAREST; + case Tegra::Texture::TextureFilter::Linear: + return VK_FILTER_LINEAR; } - UNIMPLEMENTED_MSG("Unimplemented sampler filter={}", static_cast<u32>(filter)); + UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter)); return {}; } VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter) { switch (mipmap_filter) { case Tegra::Texture::TextureMipmapFilter::None: - // TODO(Rodrigo): None seems to be mapped to OpenGL's mag and min filters without mipmapping - // (e.g. GL_NEAREST and GL_LINEAR). Vulkan doesn't have such a thing, find out if we have to - // use an image view with a single mipmap level to emulate this. - return VK_SAMPLER_MIPMAP_MODE_LINEAR; - ; - case Tegra::Texture::TextureMipmapFilter::Linear: - return VK_SAMPLER_MIPMAP_MODE_LINEAR; + // There are no Vulkan filter modes that directly correspond to OpenGL minification filters + // of GL_LINEAR or GL_NEAREST, but they can be emulated using + // VK_SAMPLER_MIPMAP_MODE_NEAREST, minLod = 0, and maxLod = 0.25, and using minFilter = + // VK_FILTER_LINEAR or minFilter = VK_FILTER_NEAREST, respectively. + return VK_SAMPLER_MIPMAP_MODE_NEAREST; case Tegra::Texture::TextureMipmapFilter::Nearest: return VK_SAMPLER_MIPMAP_MODE_NEAREST; + case Tegra::Texture::TextureMipmapFilter::Linear: + return VK_SAMPLER_MIPMAP_MODE_LINEAR; } - UNIMPLEMENTED_MSG("Unimplemented sampler mipmap mode={}", static_cast<u32>(mipmap_filter)); + UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter)); return {}; } @@ -78,10 +78,9 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w case Tegra::Texture::WrapMode::MirrorOnceBorder: UNIMPLEMENTED(); return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE; - default: - UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); - return {}; } + UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); + return {}; } VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) { @@ -288,14 +287,35 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; case Maxwell::PrimitiveTopology::Patches: return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; - default: - UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); - return {}; } + UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); + return {}; } VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) { switch (type) { + case Maxwell::VertexAttribute::Type::UnsignedNorm: + switch (size) { + case Maxwell::VertexAttribute::Size::Size_8: + return VK_FORMAT_R8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8: + return VK_FORMAT_R8G8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8_8: + return VK_FORMAT_R8G8B8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: + return VK_FORMAT_R8G8B8A8_UNORM; + case Maxwell::VertexAttribute::Size::Size_16: + return VK_FORMAT_R16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16: + return VK_FORMAT_R16G16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16_16: + return VK_FORMAT_R16G16B16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return VK_FORMAT_R16G16B16A16_UNORM; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + } + break; case Maxwell::VertexAttribute::Type::SignedNorm: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: @@ -316,62 +336,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R16G16B16A16_SNORM; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return VK_FORMAT_A2B10G10R10_SNORM_PACK32; - default: - break; } break; - case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_UNORM; + return VK_FORMAT_R8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_UNORM; + return VK_FORMAT_R8G8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_UNORM; + return VK_FORMAT_R8G8B8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_UNORM; + return VK_FORMAT_R8G8B8A8_USCALED; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_UNORM; + return VK_FORMAT_R16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_UNORM; + return VK_FORMAT_R16G16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_UNORM; + return VK_FORMAT_R16G16B16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_UNORM; + return VK_FORMAT_R16G16B16A16_USCALED; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: - return VK_FORMAT_A2B10G10R10_UNORM_PACK32; - default: - break; + return VK_FORMAT_A2B10G10R10_USCALED_PACK32; } break; - case Maxwell::VertexAttribute::Type::SignedInt: + case Maxwell::VertexAttribute::Type::SignedScaled: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_SINT; + return VK_FORMAT_R8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_SINT; + return VK_FORMAT_R8G8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_SINT; + return VK_FORMAT_R8G8B8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_SINT; + return VK_FORMAT_R8G8B8A8_SSCALED; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_SINT; + return VK_FORMAT_R16_SSCALED; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_SINT; + return VK_FORMAT_R16G16_SSCALED; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_SINT; + return VK_FORMAT_R16G16B16_SSCALED; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SINT; - case Maxwell::VertexAttribute::Size::Size_32: - return VK_FORMAT_R32_SINT; - case Maxwell::VertexAttribute::Size::Size_32_32: - return VK_FORMAT_R32G32_SINT; - case Maxwell::VertexAttribute::Size::Size_32_32_32: - return VK_FORMAT_R32G32B32_SINT; - case Maxwell::VertexAttribute::Size::Size_32_32_32_32: - return VK_FORMAT_R32G32B32A32_SINT; - default: - break; + return VK_FORMAT_R16G16B16A16_SSCALED; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_SSCALED_PACK32; } break; case Maxwell::VertexAttribute::Type::UnsignedInt: @@ -400,56 +408,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32_UINT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_UINT; - default: - break; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_UINT_PACK32; } break; - case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::SignedInt: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_USCALED; + return VK_FORMAT_R8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_USCALED; + return VK_FORMAT_R8G8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_USCALED; + return VK_FORMAT_R8G8B8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_USCALED; + return VK_FORMAT_R8G8B8A8_SINT; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_USCALED; + return VK_FORMAT_R16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_USCALED; + return VK_FORMAT_R16G16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_USCALED; + return VK_FORMAT_R16G16B16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_USCALED; - default: - break; + return VK_FORMAT_R16G16B16A16_SINT; + case Maxwell::VertexAttribute::Size::Size_32: + return VK_FORMAT_R32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32: + return VK_FORMAT_R32G32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32_32: + return VK_FORMAT_R32G32B32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32_32_32: + return VK_FORMAT_R32G32B32A32_SINT; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_SINT_PACK32; } break; - case Maxwell::VertexAttribute::Type::SignedScaled: + case Maxwell::VertexAttribute::Type::Float: switch (size) { - case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_SSCALED; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_SSCALED; + return VK_FORMAT_R16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_SSCALED; + return VK_FORMAT_R16G16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_SSCALED; + return VK_FORMAT_R16G16B16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SSCALED; - default: - break; - } - break; - case Maxwell::VertexAttribute::Type::Float: - switch (size) { + return VK_FORMAT_R16G16B16A16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32: return VK_FORMAT_R32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32: @@ -458,16 +460,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SFLOAT; - default: - break; } break; } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 59b441943..2d9b18ed9 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -13,6 +13,7 @@ #include <fmt/format.h> #include "common/dynamic_library.h" +#include "common/file_util.h" #include "common/logging/log.h" #include "common/telemetry.h" #include "core/core.h" @@ -76,7 +77,8 @@ Common::DynamicLibrary OpenVulkanLibrary() { char* libvulkan_env = getenv("LIBVULKAN_PATH"); if (!libvulkan_env || !library.Open(libvulkan_env)) { // Use the libvulkan.dylib from the application bundle. - std::string filename = File::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; + const std::string filename = + FileUtil::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; library.Open(filename.c_str()); } #else @@ -153,11 +155,31 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc } } - static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"}; - vk::Span<const char*> layers = layers_data; - if (!enable_layers) { - layers = {}; + std::vector<const char*> layers; + layers.reserve(1); + if (enable_layers) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + + const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld); + if (!layer_properties) { + LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers"); + layers.clear(); + } + + for (auto layer_it = layers.begin(); layer_it != layers.end();) { + const char* const layer = *layer_it; + const auto it = std::find_if( + layer_properties->begin(), layer_properties->end(), + [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); }); + if (it == layer_properties->end()) { + LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer); + layer_it = layers.erase(layer_it); + } else { + ++layer_it; + } } + vk::Instance instance = vk::Instance::Create(layers, extensions, dld); if (!instance) { LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance"); diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 5f33d9e40..f10f96cd8 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -37,9 +37,9 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch } // Anonymous namespace -CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, - VAddr cpu_addr, std::size_t size) - : VideoCommon::BufferBlock{cpu_addr, size} { +Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_, + VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size) + : VideoCommon::BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} { VkBufferCreateInfo ci; ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; ci.pNext = nullptr; @@ -54,46 +54,17 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me buffer.commit = memory_manager.Commit(buffer.handle, false); } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default; -VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool) - : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system, - CreateStreamBuffer(device, - scheduler)}, - device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ - staging_pool} {} - -VKBufferCache::~VKBufferCache() = default; - -Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size); -} - -VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); -} - -VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) { - size = std::max(size, std::size_t(4)); - const auto& empty = staging_pool.GetUnusedBuffer(size, false); - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, size, 0); - }); - return *empty.handle; -} - -void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const { const auto& staging = staging_pool.GetUnusedBuffer(size, true); std::memcpy(staging.commit->Map(size), data, size); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset, - size](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size}); + + const VkBuffer handle = Handle(); + scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { + cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size}); VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -102,7 +73,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; + barrier.buffer = handle; barrier.offset = offset; barrier.size = size; cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {}, @@ -110,12 +81,12 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st }); } -void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const { const auto& staging = staging_pool.GetUnusedBuffer(size, true); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset, - size](vk::CommandBuffer cmdbuf) { + + const VkBuffer handle = Handle(); + scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = nullptr; @@ -123,7 +94,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; + barrier.buffer = handle; barrier.offset = offset; barrier.size = size; @@ -131,18 +102,20 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {}); - cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size}); + cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size}); }); scheduler.Finish(); std::memcpy(data, staging.commit->Map(size), size); } -void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) const { scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset, - dst_offset, size](vk::CommandBuffer cmdbuf) { + + const VkBuffer dst_buffer = Handle(); + scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset, + size](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size}); std::array<VkBufferMemoryBarrier, 2> barriers; @@ -169,4 +142,30 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t }); } +VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, + const VKDevice& device, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool) + : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system, + CreateStreamBuffer(device, + scheduler)}, + device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ + staging_pool} {} + +VKBufferCache::~VKBufferCache() = default; + +std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr, + size); +} + +VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { + size = std::max(size, std::size_t(4)); + const auto& empty = staging_pool.GetUnusedBuffer(size, false); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, size, 0); + }); + return {*empty.handle, 0, 0}; +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index a54583e7d..3630aca77 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -8,7 +8,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" @@ -24,22 +23,34 @@ class VKDevice; class VKMemoryManager; class VKScheduler; -class CachedBufferBlock final : public VideoCommon::BufferBlock { +class Buffer final : public VideoCommon::BufferBlock { public: - explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, - VAddr cpu_addr, std::size_t size); - ~CachedBufferBlock(); + explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, + VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size); + ~Buffer(); - VkBuffer GetHandle() const { + void Upload(std::size_t offset, std::size_t size, const u8* data) const; + + void Download(std::size_t offset, std::size_t size, u8* data) const; + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) const; + + VkBuffer Handle() const { return *buffer.handle; } + u64 Address() const { + return 0; + } + private: + VKScheduler& scheduler; + VKStagingBufferPool& staging_pool; + VKBuffer buffer; }; -using Buffer = std::shared_ptr<CachedBufferBlock>; - class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> { public: explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, @@ -47,21 +58,10 @@ public: VKScheduler& scheduler, VKStagingBufferPool& staging_pool); ~VKBufferCache(); - VkBuffer GetEmptyBuffer(std::size_t size) override; + BufferInfo GetEmptyBuffer(std::size_t size) override; protected: - VkBuffer ToHandle(const Buffer& buffer) override; - - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; + std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; private: const VKDevice& device; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index b8ccf164f..ea66e621e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -27,6 +27,7 @@ #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/compiler_settings.h" #include "video_core/shader/memory_util.h" +#include "video_core/shader_cache.h" namespace Vulkan { @@ -132,19 +133,18 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con return std::memcmp(&rhs, this, sizeof *this) == 0; } -CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, - GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code, - u32 main_offset) - : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)}, +Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset) + : gpu_addr{gpu_addr}, program_code{std::move(program_code)}, registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset, compiler_settings, registry}, entries{GenerateShaderEntries(shader_ir)} {} -CachedShader::~CachedShader() = default; +Shader::~Shader() = default; -Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( - Core::System& system, Tegra::Engines::ShaderType stage) { - if (stage == Tegra::Engines::ShaderType::Compute) { +Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system, + Tegra::Engines::ShaderType stage) { + if (stage == ShaderType::Compute) { return system.GPU().KeplerCompute(); } else { return system.GPU().Maxwell3D(); @@ -156,16 +156,16 @@ VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasteri VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue, VKRenderPassCache& renderpass_cache) - : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, - descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, - renderpass_cache{renderpass_cache} {} + : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device}, + scheduler{scheduler}, descriptor_pool{descriptor_pool}, + update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {} VKPipelineCache::~VKPipelineCache() = default; -std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { +std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { const auto& gpu = system.GPU().Maxwell3D(); - std::array<Shader, Maxwell::MaxShaderProgram> shaders; + std::array<Shader*, Maxwell::MaxShaderProgram> shaders{}; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto program{static_cast<Maxwell::ShaderProgram>(index)}; @@ -178,24 +178,28 @@ std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { const GPUVAddr program_addr{GetShaderAddress(system, program)}; const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader; - if (!shader) { + + Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); + if (!result) { const auto host_ptr{memory_manager.GetPointer(program_addr)}; // No shader found - create a new one constexpr u32 stage_offset = STAGE_MAIN_OFFSET; - const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 0 : index - 1); + const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1); ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code), + stage_offset); + result = shader.get(); - shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr, - std::move(code), stage_offset); if (cpu_addr) { - Register(shader); + Register(std::move(shader), *cpu_addr, size_in_bytes); } else { - null_shader = shader; + null_shader = std::move(shader); } } - shaders[index] = std::move(shader); + shaders[index] = result; } return last_shaders = shaders; } @@ -236,19 +240,22 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel; + Shader* shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get(); if (!shader) { // No shader found - create a new one const auto host_ptr = memory_manager.GetPointer(program_addr); ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true); - shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute, - program_addr, *cpu_addr, std::move(code), - KERNEL_MAIN_OFFSET); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr, + std::move(code), KERNEL_MAIN_OFFSET); + shader = shader_info.get(); + if (cpu_addr) { - Register(shader); + Register(std::move(shader_info), *cpu_addr, size_in_bytes); } else { - null_kernel = shader; + null_kernel = std::move(shader_info); } } @@ -264,7 +271,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach return *entry; } -void VKPipelineCache::Unregister(const Shader& shader) { +void VKPipelineCache::OnShaderRemoval(Shader* shader) { bool finished = false; const auto Finish = [&] { // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and @@ -296,8 +303,6 @@ void VKPipelineCache::Unregister(const Shader& shader) { Finish(); it = compute_cache.erase(it); } - - RasterizerCache::Unregister(shader); } std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> @@ -332,12 +337,11 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { } const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum); - const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); - const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader; - ASSERT(shader); + const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 - const auto program_type = GetShaderType(program_enum); + const ShaderType program_type = GetShaderType(program_enum); const auto& entries = shader->GetEntries(); program[stage] = { Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 0b5796fef..0a36e5112 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -17,7 +17,6 @@ #include "common/common_types.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" @@ -26,6 +25,7 @@ #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace Core { class System; @@ -41,8 +41,6 @@ class VKFence; class VKScheduler; class VKUpdateDescriptorQueue; -class CachedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; struct GraphicsPipelineCacheKey { @@ -102,21 +100,16 @@ struct hash<Vulkan::ComputePipelineCacheKey> { namespace Vulkan { -class CachedShader final : public RasterizerCacheObject { +class Shader { public: - explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, - VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code, - u32 main_offset); - ~CachedShader(); + explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset); + ~Shader(); GPUVAddr GetGpuAddr() const { return gpu_addr; } - std::size_t GetSizeInBytes() const override { - return program_code.size() * sizeof(u64); - } - VideoCommon::Shader::ShaderIR& GetIR() { return shader_ir; } @@ -144,25 +137,23 @@ private: ShaderEntries entries; }; -class VKPipelineCache final : public RasterizerCache<Shader> { +class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> { public: explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue, VKRenderPassCache& renderpass_cache); - ~VKPipelineCache(); + ~VKPipelineCache() override; - std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); + std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders(); VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key); VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key); protected: - void Unregister(const Shader& shader) override; - - void FlushObjectInner(const Shader& object) override {} + void OnShaderRemoval(Shader* shader) final; private: std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders( @@ -175,10 +166,10 @@ private: VKUpdateDescriptorQueue& update_descriptor_queue; VKRenderPassCache& renderpass_cache; - Shader null_shader{}; - Shader null_kernel{}; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; GraphicsPipelineCacheKey last_graphics_key; VKGraphicsPipeline* last_graphics_pipeline = nullptr; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d86c46412..a8d94eac3 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -38,6 +38,7 @@ #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader_cache.h" namespace Vulkan { @@ -98,7 +99,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) { } std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses( - const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses; for (std::size_t i = 0; i < std::size(addresses); ++i) { addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0; @@ -117,6 +118,17 @@ template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, std::size_t stage, std::size_t index = 0) { const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } if (entry.is_bindless) { const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset); return engine.GetTextureInfo(tex_handle); @@ -131,6 +143,49 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry } } +/// @brief Determine if an attachment to be updated has to preserve contents +/// @param is_clear True when a clear is being executed +/// @param regs 3D registers +/// @return True when the contents have to be preserved +bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) { + if (!is_clear) { + return true; + } + // First we have to make sure all clear masks are enabled. + if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B || + !regs.clear_buffers.A) { + return true; + } + // If scissors are disabled, the whole screen is cleared + if (!regs.clear_flags.scissor) { + return false; + } + // Then we have to confirm scissor testing clears the whole image + const std::size_t index = regs.clear_buffers.RT; + const auto& scissor = regs.scissor_test[0]; + return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width || + scissor.max_y < regs.rt[index].height; +} + +/// @brief Determine if an attachment to be updated has to preserve contents +/// @param is_clear True when a clear is being executed +/// @param regs 3D registers +/// @return True when the contents have to be preserved +bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) { + // If we are not clearing, the contents have to be preserved + if (!is_clear) { + return true; + } + // For depth stencil clears we only have to confirm scissor test covers the whole image + if (!regs.clear_flags.scissor) { + return false; + } + // Make sure the clear cover the whole image + const auto& scissor = regs.scissor_test[0]; + return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width || + scissor.max_y < regs.zeta_height; +} + } // Anonymous namespace class BufferBindings final { @@ -332,7 +387,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { buffer_cache.Unmap(); - const Texceptions texceptions = UpdateAttachments(); + const Texceptions texceptions = UpdateAttachments(false); SetupImageTransitions(texceptions, color_attachments, zeta_attachment); key.renderpass_params = GetRenderPassParams(texceptions); @@ -388,7 +443,7 @@ void RasterizerVulkan::Clear() { return; } - [[maybe_unused]] const auto texceptions = UpdateAttachments(); + [[maybe_unused]] const auto texceptions = UpdateAttachments(true); DEBUG_ASSERT(texceptions.none()); SetupImageTransitions(0, color_attachments, zeta_attachment); @@ -665,9 +720,12 @@ void RasterizerVulkan::FlushWork() { draw_counter = 0; } -RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { +RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) { MICROPROFILE_SCOPE(Vulkan_RenderTargets); - auto& dirty = system.GPU().Maxwell3D().dirty.flags; + auto& maxwell3d = system.GPU().Maxwell3D(); + auto& dirty = maxwell3d.dirty.flags; + auto& regs = maxwell3d.regs; + const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets]; dirty[VideoCommon::Dirty::RenderTargets] = false; @@ -676,7 +734,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { Texceptions texceptions; for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { if (update_rendertargets) { - color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true); + const bool preserve_contents = HasToPreserveColorContents(is_clear, regs); + color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents); } if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) { texceptions[rt] = true; @@ -684,7 +743,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { } if (update_rendertargets) { - zeta_attachment = texture_cache.GetDepthBufferSurface(true); + const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs); + zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents); } if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) { texceptions[ZETA_TEXCEPTION_INDEX] = true; @@ -716,7 +776,7 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers( if (!view) { return false; } - key.views.push_back(view->GetHandle()); + key.views.push_back(view->GetAttachment()); key.width = std::min(key.width, view->GetWidth()); key.height = std::min(key.height, view->GetHeight()); key.layers = std::min(key.layers, view->GetNumLayers()); @@ -776,12 +836,12 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt } void RasterizerVulkan::SetupShaderDescriptors( - const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { texture_cache.GuardSamplers(true); for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { // Skip VertexA stage - const auto& shader = shaders[stage + 1]; + Shader* const shader = shaders[stage + 1]; if (!shader) { continue; } @@ -858,10 +918,10 @@ void RasterizerVulkan::BeginTransformFeedback() { UNIMPLEMENTED_IF(binding.buffer_offset != 0); const GPUVAddr gpu_addr = binding.Address(); - const std::size_t size = binding.buffer_size; - const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) { + scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) { cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size); cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); @@ -913,8 +973,8 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex buffer_bindings.AddVertexBinding(DefaultBuffer(), 0); continue; } - const auto [buffer, offset] = buffer_cache.UploadMemory(start, size); - buffer_bindings.AddVertexBinding(buffer, offset); + const auto info = buffer_cache.UploadMemory(start, size); + buffer_bindings.AddVertexBinding(info.handle, info.offset); } } @@ -936,7 +996,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar break; } const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + VkBuffer buffer = info.handle; + u64 offset = info.offset; std::tie(buffer, offset) = quad_indexed_pass.Assemble( regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset); @@ -950,7 +1012,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar break; } const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + VkBuffer buffer = info.handle; + u64 offset = info.offset; auto format = regs.index_array.format; const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte; @@ -1097,10 +1161,9 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); ASSERT(size <= MaxConstbufferSize); - const auto [buffer_handle, offset] = + const auto info = buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); - - update_descriptor_queue.AddBuffer(buffer_handle, offset, size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, size); } void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { @@ -1114,14 +1177,14 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the // default buffer. static constexpr std::size_t dummy_size = 4; - const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size); - update_descriptor_queue.AddBuffer(buffer, 0, dummy_size); + const auto info = buffer_cache.GetEmptyBuffer(dummy_size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size); return; } - const auto [buffer, offset] = buffer_cache.UploadMemory( + const auto info = buffer_cache.UploadMemory( actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten()); - update_descriptor_queue.AddBuffer(buffer, offset, size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, size); } void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, @@ -1137,12 +1200,12 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu auto view = texture_cache.GetTextureSurface(texture.tic, entry); ASSERT(!view->IsBufferView()); - const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source, - texture.tic.z_source, texture.tic.w_source); + const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); const auto sampler = sampler_cache.GetSampler(texture.tsc); update_descriptor_queue.AddSampledImage(sampler, image_view); - const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; sampled_views.push_back(ImageView{std::move(view), image_layout}); } @@ -1164,10 +1227,11 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima UNIMPLEMENTED_IF(tic.IsBuffer()); - const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + const VkImageView image_view = + view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source); update_descriptor_queue.AddImage(image_view); - const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); *image_layout = VK_IMAGE_LAYOUT_GENERAL; image_views.push_back(ImageView{std::move(view), image_layout}); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 04be37a5e..83e00e7e9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -159,7 +159,10 @@ private: void FlushWork(); - Texceptions UpdateAttachments(); + /// @brief Updates the currently bound attachments + /// @param is_clear True when the framebuffer is updated as a clear + /// @return Bitfield of attachments being used as sampled textures + Texceptions UpdateAttachments(bool is_clear); std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass); @@ -168,7 +171,7 @@ private: bool is_indexed, bool is_instanced); /// Setup descriptors in the graphics pipeline. - void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders); + void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders); void SetupImageTransitions(Texceptions texceptions, const std::array<View, Maxwell::NumRenderTargets>& color_attachments, diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index e6f2fa553..616eacc36 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -9,6 +9,8 @@ #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/textures/texture.h" +using Tegra::Texture::TextureMipmapFilter; + namespace Vulkan { namespace { @@ -63,8 +65,8 @@ vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) c ci.maxAnisotropy = tsc.GetMaxAnisotropy(); ci.compareEnable = tsc.depth_compare_enabled; ci.compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func); - ci.minLod = tsc.GetMinLod(); - ci.maxLod = tsc.GetMaxLod(); + ci.minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod(); + ci.maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod(); ci.borderColor = arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color); ci.unnormalizedCoordinates = VK_FALSE; return device.GetLogical().CreateSampler(ci); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 82ec9180e..56524e6f3 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -9,6 +9,7 @@ #include <utility> #include "common/microprofile.h" +#include "common/thread.h" #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" @@ -133,6 +134,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) { } void VKScheduler::WorkerThread() { + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); std::unique_lock lock{mutex}; do { cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; }); diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index dfddf7ad6..689f0d276 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -35,10 +35,14 @@ public: /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. void Unmap(u64 size); - VkBuffer GetHandle() const { + VkBuffer Handle() const noexcept { return *buffer; } + u64 Address() const noexcept { + return 0; + } + private: struct Watch final { VKFenceWatch fence; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ea487b770..430031665 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP ci.extent = {params.width, params.height, 1}; break; case SurfaceTarget::Texture3D: + ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; ci.extent = {params.width, params.height, params.depth}; break; case SurfaceTarget::TextureBuffer: @@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP return ci; } +u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) { + return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | + (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +} + } // Anonymous namespace CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, @@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, } // TODO(Rodrigo): Move this to a virtual function. - main_view = CreateViewInner( - ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels), - true); + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels)); } CachedSurface::~CachedSurface() = default; @@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() { } View CachedSurface::CreateView(const ViewParams& params) { - return CreateViewInner(params, false); -} - -View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) { // TODO(Rodrigo): Add name decorations - return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy); + return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params); } void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { @@ -342,18 +347,27 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { } CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy) + const ViewParams& params) : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, - base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, - num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) - : VK_IMAGE_VIEW_TYPE_1D} {} + base_level{params.base_level}, num_levels{params.num_levels}, + image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} { + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + base_layer = 0; + num_layers = 1; + base_slice = params.base_layer; + num_slices = params.num_layers; + } else { + base_layer = params.base_layer; + num_layers = params.num_layers; + } +} CachedSurfaceView::~CachedSurfaceView() = default; -VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { +VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); if (last_image_view && last_swizzle == new_swizzle) { return last_image_view; @@ -399,6 +413,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y }); } + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ASSERT(base_slice == 0); + ASSERT(num_slices == params.depth); + } + VkImageViewCreateInfo ci; ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; ci.pNext = nullptr; @@ -417,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y return last_image_view = *image_view; } +VkImageView CachedSurfaceView::GetAttachment() { + if (render_target) { + return *render_target; + } + + VkImageViewCreateInfo ci; + ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + ci.pNext = nullptr; + ci.flags = 0; + ci.image = surface.GetImageHandle(); + ci.format = surface.GetImage().GetFormat(); + ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, + VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY}; + ci.subresourceRange.aspectMask = aspect_mask; + ci.subresourceRange.baseMipLevel = base_level; + ci.subresourceRange.levelCount = num_levels; + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D; + ci.subresourceRange.baseArrayLayer = base_slice; + ci.subresourceRange.layerCount = num_slices; + } else { + ci.viewType = image_view_type; + ci.subresourceRange.baseArrayLayer = base_layer; + ci.subresourceRange.layerCount = num_layers; + } + render_target = device.GetLogical().CreateImageView(ci); + return *render_target; +} + VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, VKResourceManager& resource_manager, VKMemoryManager& memory_manager, VKScheduler& scheduler, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f211ccb1e..807e26c8a 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -91,7 +91,6 @@ protected: void DecorateSurfaceName(); View CreateView(const ViewParams& params) override; - View CreateViewInner(const ViewParams& params, bool is_proxy); private: void UploadBuffer(const std::vector<u8>& staging_buffer); @@ -120,23 +119,20 @@ private: class CachedSurfaceView final : public VideoCommon::ViewBase { public: explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy); + const ViewParams& params); ~CachedSurfaceView(); - VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source); + VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source); + + VkImageView GetAttachment(); bool IsSameSurface(const CachedSurfaceView& rhs) const { return &surface == &rhs.surface; } - VkImageView GetHandle() { - return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G, - Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A); - } - u32 GetWidth() const { return params.GetMipWidth(base_level); } @@ -180,14 +176,6 @@ public: } private: - static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source) { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); - } - // Store a copy of these values to avoid double dereference when reading them const SurfaceParams params; const VkImage image; @@ -196,15 +184,18 @@ private: const VKDevice& device; CachedSurface& surface; - const u32 base_layer; - const u32 num_layers; const u32 base_level; const u32 num_levels; const VkImageViewType image_view_type; + u32 base_layer = 0; + u32 num_layers = 0; + u32 base_slice = 0; + u32 num_slices = 0; VkImageView last_image_view = nullptr; u32 last_swizzle = 0; + vk::ImageView render_target; std::unordered_map<u32, vk::ImageView> view_cache; }; diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 681ecde98..351c048d2 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -24,35 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() { } void VKUpdateDescriptorQueue::Acquire() { - entries.clear(); -} + // Minimum number of entries required. + // This is the maximum number of entries a single draw call migth use. + static constexpr std::size_t MIN_ENTRIES = 0x400; -void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, - VkDescriptorSet set) { - if (payload.size() + entries.size() >= payload.max_size()) { + if (payload.size() + MIN_ENTRIES >= payload.max_size()) { LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread"); scheduler.WaitWorker(); payload.clear(); } + upload_start = &*payload.end(); +} - // TODO(Rodrigo): Rework to write the payload directly - const auto payload_start = payload.data() + payload.size(); - for (const auto& entry : entries) { - if (const auto image = std::get_if<VkDescriptorImageInfo>(&entry)) { - payload.push_back(*image); - } else if (const auto buffer = std::get_if<VkDescriptorBufferInfo>(&entry)) { - payload.push_back(*buffer); - } else if (const auto texel = std::get_if<VkBufferView>(&entry)) { - payload.push_back(*texel); - } else { - UNREACHABLE(); - } - } - - scheduler.Record( - [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) { - logical->UpdateDescriptorSet(set, update_template, payload_start); - }); +void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, + VkDescriptorSet set) { + const void* const data = upload_start; + const vk::Device* const logical = &device.GetLogical(); + scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) { + logical->UpdateDescriptorSet(set, update_template, data); + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index cc7e3dff4..945320c72 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -15,17 +15,13 @@ namespace Vulkan { class VKDevice; class VKScheduler; -class DescriptorUpdateEntry { -public: - explicit DescriptorUpdateEntry() {} - - DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {} +struct DescriptorUpdateEntry { + DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {} - DescriptorUpdateEntry(VkDescriptorBufferInfo buffer) : buffer{buffer} {} + DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {} - DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {} + DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {} -private: union { VkDescriptorImageInfo image; VkDescriptorBufferInfo buffer; @@ -45,32 +41,34 @@ public: void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set); void AddSampledImage(VkSampler sampler, VkImageView image_view) { - entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); } void AddImage(VkImageView image_view) { - entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); } void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) { - entries.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); + payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); } void AddTexelBuffer(VkBufferView texel_buffer) { - entries.emplace_back(texel_buffer); + payload.emplace_back(texel_buffer); } - VkImageLayout* GetLastImageLayout() { - return &std::get<VkDescriptorImageInfo>(entries.back()).imageLayout; + VkImageLayout* LastImageLayout() { + return &payload.back().image.imageLayout; } -private: - using Variant = std::variant<VkDescriptorImageInfo, VkDescriptorBufferInfo, VkBufferView>; + const VkImageLayout* LastImageLayout() const { + return &payload.back().image.imageLayout; + } +private: const VKDevice& device; VKScheduler& scheduler; - boost::container::static_vector<Variant, 0x400> entries; + const DescriptorUpdateEntry* upload_start = nullptr; boost::container::static_vector<DescriptorUpdateEntry, 0x10000> payload; }; diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp index 2ce9b0626..0d485a662 100644 --- a/src/video_core/renderer_vulkan/wrapper.cpp +++ b/src/video_core/renderer_vulkan/wrapper.cpp @@ -153,7 +153,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { bool Load(InstanceDispatch& dld) noexcept { #define X(name) Proc(dld.name, dld, #name) - return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties); + return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) && + X(vkEnumerateInstanceLayerProperties); #undef X } @@ -725,8 +726,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s return supported == VK_TRUE; } -VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const - noexcept { +VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const { VkSurfaceCapabilitiesKHR capabilities; Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities)); return capabilities; @@ -771,4 +771,17 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp return properties; } +std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( + const InstanceDispatch& dld) { + u32 num; + if (dld.vkEnumerateInstanceLayerProperties(&num, nullptr) != VK_SUCCESS) { + return std::nullopt; + } + std::vector<VkLayerProperties> properties(num); + if (dld.vkEnumerateInstanceLayerProperties(&num, properties.data()) != VK_SUCCESS) { + return std::nullopt; + } + return properties; +} + } // namespace Vulkan::vk diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h index 98937a77a..d56fdb3f9 100644 --- a/src/video_core/renderer_vulkan/wrapper.h +++ b/src/video_core/renderer_vulkan/wrapper.h @@ -141,6 +141,7 @@ struct InstanceDispatch { PFN_vkCreateInstance vkCreateInstance; PFN_vkDestroyInstance vkDestroyInstance; PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties; + PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties; PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT; PFN_vkCreateDevice vkCreateDevice; @@ -779,7 +780,7 @@ public: bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const; - VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept; + VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const; std::vector<VkSurfaceFormatKHR> GetSurfaceFormatsKHR(VkSurfaceKHR) const; @@ -996,4 +997,7 @@ private: std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld); +std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( + const InstanceDispatch& dld); + } // namespace Vulkan::vk diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index 848e46874..b2e88fa20 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -13,55 +13,101 @@ namespace VideoCommon::Shader { +using std::move; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +using Tegra::Shader::PredCondition; u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - if (instr.hset2.ftz == 0) { - LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + PredCondition cond; + bool bf; + bool ftz; + bool neg_a; + bool abs_a; + bool neg_b; + bool abs_b; + switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_C: + case OpCode::Id::HSET2_IMM: + cond = instr.hsetp2.cbuf_and_imm.cond; + bf = instr.Bit(53); + ftz = instr.Bit(54); + neg_a = instr.Bit(43); + abs_a = instr.Bit(44); + neg_b = instr.Bit(56); + abs_b = instr.Bit(54); + break; + case OpCode::Id::HSET2_R: + cond = instr.hsetp2.reg.cond; + bf = instr.Bit(49); + ftz = instr.Bit(50); + neg_a = instr.Bit(43); + abs_a = instr.Bit(44); + neg_b = instr.Bit(31); + abs_b = instr.Bit(30); + break; + default: + UNREACHABLE(); } - Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); - op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); - - Node op_b = [&]() { + Node op_b = [this, instr, opcode] { switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_C: + // Inform as unimplemented as this is not tested. + UNIMPLEMENTED_MSG("HSET2_C is not implemented"); + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); case OpCode::Id::HSET2_R: return GetRegister(instr.gpr20); + case OpCode::Id::HSET2_IMM: + return UnpackHalfImmediate(instr, true); default: UNREACHABLE(); - return Immediate(0); + return Node{}; } }(); - op_b = UnpackHalfFloat(op_b, instr.hset2.type_b); - op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b); - const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); + if (!ftz) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + } + + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); + op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a); + + switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_R: + op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b); + [[fallthrough]]; + case OpCode::Id::HSET2_C: + op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b); + break; + default: + break; + } - const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b); + Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); + + Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b); const OperationCode combiner = GetPredicateCombiner(instr.hset2.op); // HSET2 operates on each half float in the pack. std::array<Node, 2> values; for (u32 i = 0; i < 2; ++i) { - const u32 raw_value = instr.hset2.bf ? 0x3c00 : 0xffff; - const Node true_value = Immediate(raw_value << (i * 16)); - const Node false_value = Immediate(0); - - const Node comparison = - Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i)); - const Node predicate = Operation(combiner, comparison, second_pred); + const u32 raw_value = bf ? 0x3c00 : 0xffff; + Node true_value = Immediate(raw_value << (i * 16)); + Node false_value = Immediate(0); + Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i)); + Node predicate = Operation(combiner, comparison, second_pred); values[i] = - Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value); + Operation(OperationCode::Select, predicate, move(true_value), move(false_value)); } - const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]); - SetRegister(bb, instr.gpr0, value); + Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]); + SetRegister(bb, instr.gpr0, move(value)); return pc; } diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index 60b6ad72a..07778dc3e 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, break; case TextureFormat::B5G6R5: case TextureFormat::B6G5R5: + case TextureFormat::BF10GF11RF11: if (component == 0) { return descriptor.b_type; } @@ -119,7 +120,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, } break; } - UNIMPLEMENTED_MSG("texture format not implement={}", format); + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); return ComponentType::FLOAT; } @@ -191,6 +192,14 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 6; } return 0; + case TextureFormat::BF10GF11RF11: + if (component == 1 || component == 2) { + return 11; + } + if (component == 0) { + return 10; + } + return 0; case TextureFormat::G8R24: if (component == 0) { return 8; @@ -211,10 +220,9 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return (component == 0 || component == 1) ? 8 : 0; case TextureFormat::G4R4: return (component == 0 || component == 1) ? 4 : 0; - default: - UNIMPLEMENTED_MSG("texture format not implement={}", format); - return 0; } + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); + return 0; } std::size_t GetImageComponentMask(TextureFormat format) { @@ -235,6 +243,7 @@ std::size_t GetImageComponentMask(TextureFormat format) { case TextureFormat::R32_B24G8: case TextureFormat::B5G6R5: case TextureFormat::B6G5R5: + case TextureFormat::BF10GF11RF11: return std::size_t{R | G | B}; case TextureFormat::R32_G32: case TextureFormat::R16_G16: @@ -248,10 +257,9 @@ std::size_t GetImageComponentMask(TextureFormat format) { case TextureFormat::R8: case TextureFormat::R1: return std::size_t{R}; - default: - UNIMPLEMENTED_MSG("texture format not implement={}", format); - return std::size_t{R | G | B | A}; } + UNIMPLEMENTED_MSG("Texture format not implemented={}", format); + return std::size_t{R | G | B | A}; } std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) { @@ -299,7 +307,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type, return {std::move(original_value), true}; } default: - UNIMPLEMENTED_MSG("Unimplement component type={}", component_type); + UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type); return {std::move(original_value), true}; } } @@ -459,7 +467,7 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { default: break; } - UNIMPLEMENTED_MSG("Unimplemented operation={} type={}", + UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}", static_cast<u64>(instr.suatom_d.operation.Value()), static_cast<u64>(instr.suatom_d.operation_type.Value())); return OperationCode::AtomicImageAdd; diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 8f0bb996e..29ebf65ba 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { return pc; } -ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, - std::optional<u32> buffer) { +ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( + SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) { if (info.IsComplete()) { return info; } - const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset) - : registry.ObtainBoundSampler(offset); if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D); @@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo sampler_info) { - const auto offset = static_cast<u32>(sampler.index.Value()); - const auto info = GetSamplerInfo(sampler_info, offset); + const u32 offset = static_cast<u32>(sampler.index.Value()); + const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), @@ -404,20 +402,19 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, const Node sampler_register = GetRegister(reg); const auto [base_node, tracked_sampler_info] = TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); - ASSERT(base_node != nullptr); - if (base_node == nullptr) { + if (!base_node) { + UNREACHABLE(); return std::nullopt; } - if (const auto bindless_sampler_info = - std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { - const u32 buffer = bindless_sampler_info->GetIndex(); - const u32 offset = bindless_sampler_info->GetOffset(); - info = GetSamplerInfo(info, offset, buffer); + if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { + const u32 buffer = sampler_info->index; + const u32 offset = sampler_info->offset; + info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer = buffer, offset = offset](const Sampler& entry) { + [buffer, offset](const Sampler& entry) { return entry.buffer == buffer && entry.offset == offset; }); if (it != used_samplers.end()) { @@ -431,10 +428,32 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array, *info.is_shadow, *info.is_buffer, false); } - if (const auto array_sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { - const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; - index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); - info = GetSamplerInfo(info, base_offset); + if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) { + const std::pair indices = sampler_info->indices; + const std::pair offsets = sampler_info->offsets; + info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); + + // Try to use an already created sampler if it exists + const auto it = std::find_if( + used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { + return offsets == std::pair{entry.offset, entry.secondary_offset} && + indices == std::pair{entry.buffer, entry.secondary_buffer}; + }); + if (it != used_samplers.end()) { + ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); + return *it; + } + + // Otherwise create a new mapping for this sampler + const u32 next_index = static_cast<u32>(used_samplers.size()); + return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer); + } + if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { + const u32 base_offset = sampler_info->base_offset / 4; + index_var = GetCustomVariable(sampler_info->bindless_var); + info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if( diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp index 074f21691..5071c83ca 100644 --- a/src/video_core/shader/memory_util.cpp +++ b/src/video_core/shader/memory_util.cpp @@ -66,12 +66,12 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_add u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code, const ProgramCode& code_b) { - u64 unique_identifier = boost::hash_value(code); + size_t unique_identifier = boost::hash_value(code); if (is_a) { // VertexA programs include two programs boost::hash_combine(unique_identifier, boost::hash_value(code_b)); } - return unique_identifier; + return static_cast<u64>(unique_identifier); } } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index c5e5165ff..8f230d57a 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -275,10 +275,11 @@ using Node = std::shared_ptr<NodeData>; using Node4 = std::array<Node, 4>; using NodeBlock = std::vector<Node>; -class BindlessSamplerNode; -class ArraySamplerNode; +struct ArraySamplerNode; +struct BindlessSamplerNode; +struct SeparateSamplerNode; -using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>; +using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>; using TrackSampler = std::shared_ptr<TrackSamplerData>; struct Sampler { @@ -288,63 +289,51 @@ struct Sampler { : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_indexed{is_indexed} {} + /// Separate sampler constructor + constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, + Tegra::Shader::TextureType type, bool is_array, bool is_shadow, + bool is_buffer) + : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, + buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, + is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} + /// Bindless samplers constructor constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} - u32 index = 0; ///< Emulated index given for the this sampler. - u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. - u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers). - u32 size = 1; ///< Size of the sampler. + u32 index = 0; ///< Emulated index given for the this sampler. + u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. + u32 secondary_offset = 0; ///< Secondary offset in the const buffer. + u32 buffer = 0; ///< Buffer where the bindless sampler is read. + u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read. + u32 size = 1; ///< Size of the sampler. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) - bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. - bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. - bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. - bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. - bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. + bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. + bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. + bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. + bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_separated = false; ///< Whether the image and sampler is separated or not. }; /// Represents a tracked bindless sampler into a direct const buffer -class ArraySamplerNode final { -public: - explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) - : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetBaseOffset() const { - return base_offset; - } - - constexpr u32 GetIndexVar() const { - return bindless_var; - } - -private: +struct ArraySamplerNode { u32 index; u32 base_offset; u32 bindless_var; }; -/// Represents a tracked bindless sampler into a direct const buffer -class BindlessSamplerNode final { -public: - explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } +/// Represents a tracked separate sampler image pair that was folded statically +struct SeparateSamplerNode { + std::pair<u32, u32> indices; + std::pair<u32, u32> offsets; +}; -private: +/// Represents a tracked bindless sampler into a direct const buffer +struct BindlessSamplerNode { u32 index; u32 offset; }; diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 11231bbea..1e0886185 100644 --- a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) { template <typename T, typename... Args> TrackSampler MakeTrackSampler(Args&&... args) { static_assert(std::is_convertible_v<T, TrackSamplerData>); - return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...)); + return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...}); } template <typename... Args> diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index af70b3f35..cdf274e54 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -93,6 +93,26 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) { return value; } +std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler( + std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) { + SeparateSamplerKey key; + key.buffers = buffers; + key.offsets = offsets; + const auto iter = separate_samplers.find(key); + if (iter != separate_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + + const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first); + const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second); + const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2); + separate_samplers.emplace(key, value); + return value; +} + std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, u32 offset) { const std::pair key = {buffer, offset}; diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 0c80d35fd..231206765 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h @@ -19,8 +19,39 @@ namespace VideoCommon::Shader { +struct SeparateSamplerKey { + std::pair<u32, u32> buffers; + std::pair<u32, u32> offsets; +}; + +} // namespace VideoCommon::Shader + +namespace std { + +template <> +struct hash<VideoCommon::Shader::SeparateSamplerKey> { + std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept { + return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^ + key.offsets.second); + } +}; + +template <> +struct equal_to<VideoCommon::Shader::SeparateSamplerKey> { + bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs, + const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept { + return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets; + } +}; + +} // namespace std + +namespace VideoCommon::Shader { + using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; +using SeparateSamplerMap = + std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>; using BindlessSamplerMap = std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; @@ -73,6 +104,9 @@ public: std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); + std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler( + std::pair<u32, u32> buffers, std::pair<u32, u32> offsets); + std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); /// Inserts a key. @@ -128,6 +162,7 @@ private: Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; KeyMap keys; BoundSamplerMap bound_samplers; + SeparateSamplerMap separate_samplers; BindlessSamplerMap bindless_samplers; u32 bound_buffer; GraphicsInfo graphics_info; diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 15ae152f2..3a98b2104 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -330,8 +330,8 @@ private: OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); /// Queries the missing sampler info from the execution context. - SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset, - std::optional<u32> buffer = std::nullopt); + SamplerInfo GetSamplerInfo(SamplerInfo info, + std::optional<Tegra::Engines::SamplerDescriptor> sampler); /// Accesses a texture sampler. std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); @@ -409,8 +409,14 @@ private: std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; - std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor); + std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor); + + std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf, + const OperationNode& operation, + Node gpr, Node base_offset, + Node tracked, const NodeBlock& code, + s64 cursor); std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index eb97bfd41..d5ed81442 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -14,6 +14,7 @@ namespace VideoCommon::Shader { namespace { + std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, OperationCode operation_code) { for (; cursor >= 0; --cursor) { @@ -63,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { if (const auto operation = std::get_if<OperationNode>(&*node)) { operation->SetAmendIndex(amend_index); return true; - } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + } + if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { conditional->SetAmendIndex(amend_index); return true; } @@ -72,40 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { } // Anonymous namespace -std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor) { +std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor) { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { + const u32 cbuf_index = cbuf->GetIndex(); + // Constant buffer found, test if it's an immediate const auto& offset = cbuf->GetOffset(); if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { - auto track = - MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); + auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue()); return {tracked, track}; } if (const auto operation = std::get_if<OperationNode>(&*offset)) { const u32 bound_buffer = registry.GetBoundBuffer(); - if (bound_buffer != cbuf->GetIndex()) { + if (bound_buffer != cbuf_index) { return {}; } - const auto pair = DecoupleIndirectRead(*operation); - if (!pair) { - return {}; + if (const std::optional pair = DecoupleIndirectRead(*operation)) { + auto [gpr, base_offset] = *pair; + return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked, + code, cursor); } - auto [gpr, base_offset] = *pair; - const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); - const auto& gpu_driver = registry.AccessGuestDriverProfile(); - const u32 bindless_cv = NewCustomVariable(); - Node op = - Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize())); - - const Node cv_node = GetCustomVariable(bindless_cv); - Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); - const std::size_t amend_index = DeclareAmend(std::move(amend_op)); - AmendNodeCv(amend_index, code[cursor]); - // TODO Implement Bindless Index custom variable - auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(), - offset_inm->GetValue(), bindless_cv); - return {tracked, track}; } return {}; } @@ -122,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons return TrackBindlessSampler(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { - for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { - if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); - std::get<0>(found)) { - // Cbuf found in operand. + const OperationNode& op = *operation; + + const OperationCode opcode = operation->GetCode(); + if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) { + ASSERT(op.GetOperandsCount() == 2); + auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor); + auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor); + if (node_a && node_b) { + auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b}, + std::pair{offset_a, offset_b}); + return {tracked, std::move(track)}; + } + } + std::size_t i = op.GetOperandsCount(); + while (i--) { + if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) { + // Constant buffer found in operand. return found; } } @@ -139,6 +141,26 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons return {}; } +std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead( + const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked, + const NodeBlock& code, s64 cursor) { + const auto offset_imm = std::get<ImmediateNode>(*base_offset); + const auto& gpu_driver = registry.AccessGuestDriverProfile(); + const u32 bindless_cv = NewCustomVariable(); + const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize(); + Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size)); + + Node cv_node = GetCustomVariable(bindless_cv); + Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op)); + const std::size_t amend_index = DeclareAmend(std::move(amend_op)); + AmendNodeCv(amend_index, code[cursor]); + + // TODO: Implement bindless index custom variable + auto track = + MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv); + return {tracked, track}; +} + std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h new file mode 100644 index 000000000..2dd270e99 --- /dev/null +++ b/src/video_core/shader_cache.h @@ -0,0 +1,228 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <memory> +#include <mutex> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class T> +class ShaderCache { + static constexpr u64 PAGE_BITS = 14; + + struct Entry { + VAddr addr_start; + VAddr addr_end; + T* data; + + bool is_memory_marked = true; + + constexpr bool Overlaps(VAddr start, VAddr end) const noexcept { + return start < addr_end && addr_start < end; + } + }; + +public: + virtual ~ShaderCache() = default; + + /// @brief Removes shaders inside a given region + /// @note Checks for ranges + /// @param addr Start address of the invalidation + /// @param size Number of bytes of the invalidation + void InvalidateRegion(VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + RemovePendingShaders(); + } + + /// @brief Unmarks a memory region as cached and marks it for removal + /// @param addr Start address of the CPU write operation + /// @param size Number of bytes of the CPU write operation + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + } + + /// @brief Flushes delayed removal operations + void SyncGuestHost() { + std::scoped_lock lock{invalidation_mutex}; + RemovePendingShaders(); + } + + /// @brief Tries to obtain a cached shader starting in a given address + /// @note Doesn't check for ranges, the given address has to be the start of the shader + /// @param addr Start address of the shader, this doesn't cache for region + /// @return Pointer to a valid shader, nullptr when nothing is found + T* TryGet(VAddr addr) const { + std::scoped_lock lock{lookup_mutex}; + + const auto it = lookup_cache.find(addr); + if (it == lookup_cache.end()) { + return nullptr; + } + return it->second->data; + } + +protected: + explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {} + + /// @brief Register in the cache a given entry + /// @param data Shader to store in the cache + /// @param addr Start address of the shader that will be registered + /// @param size Size in bytes of the shader + void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex, lookup_mutex}; + + const VAddr addr_end = addr + size; + Entry* const entry = NewEntry(addr, addr_end, data.get()); + + const u64 page_end = addr_end >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) { + invalidation_cache[page].push_back(entry); + } + + storage.push_back(std::move(data)); + + rasterizer.UpdatePagesCachedCount(addr, size, 1); + } + + /// @brief Called when a shader is going to be removed + /// @param shader Shader that will be removed + /// @pre invalidation_cache is locked + /// @pre lookup_mutex is locked + virtual void OnShaderRemoval([[maybe_unused]] T* shader) {} + +private: + /// @brief Invalidate pages in a given region + /// @pre invalidation_mutex is locked + void InvalidatePagesInRegion(VAddr addr, std::size_t size) { + const VAddr addr_end = addr + size; + const u64 page_end = addr_end >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) { + const auto it = invalidation_cache.find(page); + if (it == invalidation_cache.end()) { + continue; + } + + std::vector<Entry*>& entries = it->second; + InvalidatePageEntries(entries, addr, addr_end); + + // If there's nothing else in this page, remove it to avoid overpopulating the hash map. + if (entries.empty()) { + invalidation_cache.erase(it); + } + } + } + + /// @brief Remove shaders marked for deletion + /// @pre invalidation_mutex is locked + void RemovePendingShaders() { + if (marked_for_removal.empty()) { + return; + } + std::scoped_lock lock{lookup_mutex}; + + std::vector<T*> removed_shaders; + removed_shaders.reserve(marked_for_removal.size()); + + for (Entry* const entry : marked_for_removal) { + if (lookup_cache.erase(entry->addr_start) > 0) { + removed_shaders.push_back(entry->data); + } + } + marked_for_removal.clear(); + + if (!removed_shaders.empty()) { + RemoveShadersFromStorage(std::move(removed_shaders)); + } + } + + /// @brief Invalidates entries in a given range for the passed page + /// @param entries Vector of entries in the page, it will be modified on overlaps + /// @param addr Start address of the invalidation + /// @param addr_end Non-inclusive end address of the invalidation + /// @pre invalidation_mutex is locked + void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) { + auto it = entries.begin(); + while (it != entries.end()) { + Entry* const entry = *it; + if (!entry->Overlaps(addr, addr_end)) { + ++it; + continue; + } + UnmarkMemory(entry); + marked_for_removal.push_back(entry); + + it = entries.erase(it); + } + } + + /// @brief Unmarks an entry from the rasterizer cache + /// @param entry Entry to unmark from memory + void UnmarkMemory(Entry* entry) { + if (!entry->is_memory_marked) { + return; + } + entry->is_memory_marked = false; + + const VAddr addr = entry->addr_start; + const std::size_t size = entry->addr_end - addr; + rasterizer.UpdatePagesCachedCount(addr, size, -1); + } + + /// @brief Removes a vector of shaders from a list + /// @param removed_shaders Shaders to be removed from the storage, it can contain duplicates + /// @pre invalidation_mutex is locked + /// @pre lookup_mutex is locked + void RemoveShadersFromStorage(std::vector<T*> removed_shaders) { + // Remove duplicates + std::sort(removed_shaders.begin(), removed_shaders.end()); + removed_shaders.erase(std::unique(removed_shaders.begin(), removed_shaders.end()), + removed_shaders.end()); + + // Now that there are no duplicates, we can notify removals + for (T* const shader : removed_shaders) { + OnShaderRemoval(shader); + } + + // Remove them from the cache + const auto is_removed = [&removed_shaders](std::unique_ptr<T>& shader) { + return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) != + removed_shaders.end(); + }; + storage.erase(std::remove_if(storage.begin(), storage.end(), is_removed), storage.end()); + } + + /// @brief Creates a new entry in the lookup cache and returns its pointer + /// @pre lookup_mutex is locked + Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) { + auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data}); + Entry* const entry_pointer = entry.get(); + + lookup_cache.emplace(addr, std::move(entry)); + return entry_pointer; + } + + VideoCore::RasterizerInterface& rasterizer; + + mutable std::mutex lookup_mutex; + std::mutex invalidation_mutex; + + std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache; + std::unordered_map<u64, std::vector<Entry*>> invalidation_cache; + std::vector<std::unique_ptr<T>> storage; + std::vector<Entry*> marked_for_removal; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 715f39d0d..0caf3b4f0 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -120,6 +120,9 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap( } const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)}; const auto layer{static_cast<u32>(relative_address / layer_size)}; + if (layer >= params.depth) { + return {}; + } const GPUVAddr mipmap_address = relative_address - layer_size * layer; const auto mipmap_it = Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address); @@ -248,12 +251,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, // Use an extra temporal buffer auto& tmp_buffer = staging_cache.GetBuffer(1); - // Special case for 3D Texture Segments - const bool must_read_current_data = - params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D; tmp_buffer.resize(guest_memory_size); host_ptr = tmp_buffer.data(); - if (must_read_current_data) { + + if (params.target == SurfaceTarget::Texture3D) { + // Special case for 3D texture segments memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); } diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h index 79e10ffbb..173f2edba 100644 --- a/src/video_core/texture_cache/surface_base.h +++ b/src/video_core/texture_cache/surface_base.h @@ -217,8 +217,8 @@ public: } bool IsProtected() const { - // Only 3D Slices are to be protected - return is_target && params.block_depth > 0; + // Only 3D slices are to be protected + return is_target && params.target == SurfaceTarget::Texture3D; } bool IsRenderTarget() const { @@ -250,6 +250,11 @@ public: return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels)); } + TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) { + return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth, + base_level, num_levels)); + } + std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params, const GPUVAddr view_addr, const std::size_t candidate_size, const u32 mipmap, @@ -272,8 +277,8 @@ public: std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr, const std::size_t candidate_size) { if (params.target == SurfaceTarget::Texture3D || - (params.num_levels == 1 && !params.is_layered) || - view_params.target == SurfaceTarget::Texture3D) { + view_params.target == SurfaceTarget::Texture3D || + (params.num_levels == 1 && !params.is_layered)) { return {}; } const auto layer_mipmap{GetLayerMipmap(view_addr)}; diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 884fabffe..0b2b2b8c4 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz params.num_levels = 1; params.emulated_levels = 1; - const bool is_layered = config.layers > 1 && params.block_depth == 0; - params.is_layered = is_layered; - params.depth = is_layered ? config.layers.Value() : 1; - params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; + if (config.memory_layout.is_3d != 0) { + params.depth = config.layers.Value(); + params.is_layered = false; + params.target = SurfaceTarget::Texture3D; + } else if (config.layers > 1) { + params.depth = config.layers.Value(); + params.is_layered = true; + params.target = SurfaceTarget::Texture2DArray; + } else { + params.depth = 1; + params.is_layered = false; + params.target = SurfaceTarget::Texture2D; + } return params; } @@ -237,7 +246,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface( params.width = config.width; params.height = config.height; params.pitch = config.pitch; - // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters + // TODO(Rodrigo): Try to guess texture arrays from parameters params.target = SurfaceTarget::Texture2D; params.depth = 1; params.num_levels = 1; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 6f63217a2..6207d8dfe 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -24,6 +24,7 @@ #include "core/core.h" #include "core/memory.h" #include "core/settings.h" +#include "video_core/compatible_formats.h" #include "video_core/dirty_flags.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/maxwell_3d.h" @@ -47,8 +48,8 @@ class RasterizerInterface; namespace VideoCommon { +using VideoCore::Surface::FormatCompatibility; using VideoCore::Surface::PixelFormat; - using VideoCore::Surface::SurfaceTarget; using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig; @@ -298,15 +299,13 @@ public: const GPUVAddr src_gpu_addr = src_config.Address(); const GPUVAddr dst_gpu_addr = dst_config.Address(); DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr); - const std::optional<VAddr> dst_cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr); - const std::optional<VAddr> src_cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr); - std::pair<TSurface, TView> dst_surface = - GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); - std::pair<TSurface, TView> src_surface = - GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false); - ImageBlit(src_surface.second, dst_surface.second, copy_config); + + const auto& memory_manager = system.GPU().MemoryManager(); + const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr); + const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr); + std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); + TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second; + ImageBlit(src_surface, dst_surface.second, copy_config); dst_surface.first->MarkAsModified(true, Tick()); } @@ -508,12 +507,12 @@ private: return RecycleStrategy::Flush; } // 3D Textures decision - if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) { + if (params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } for (const auto& s : overlaps) { const auto& s_params = s->GetSurfaceParams(); - if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) { + if (s_params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } } @@ -597,7 +596,7 @@ private: } else { new_surface = GetUncachedSurface(gpu_addr, params); } - const auto& final_params = new_surface->GetSurfaceParams(); + const SurfaceParams& final_params = new_surface->GetSurfaceParams(); if (cr_params.type != final_params.type) { if (Settings::IsGPULevelExtreme()) { BufferCopy(current_surface, new_surface); @@ -605,7 +604,7 @@ private: } else { std::vector<CopyParams> bricks = current_surface->BreakDown(final_params); for (auto& brick : bricks) { - ImageCopy(current_surface, new_surface, brick); + TryCopyImage(current_surface, new_surface, brick); } } Unregister(current_surface); @@ -696,7 +695,7 @@ private: } const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height, src_params.depth); - ImageCopy(surface, new_surface, copy_params); + TryCopyImage(surface, new_surface, copy_params); } } if (passed_tests == 0) { @@ -731,51 +730,9 @@ private: */ std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, - const VAddr cpu_addr, + GPUVAddr gpu_addr, VAddr cpu_addr, bool preserve_contents) { - if (params.target == SurfaceTarget::Texture3D) { - bool failed = false; - if (params.num_levels > 1) { - // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach - return std::nullopt; - } - TSurface new_surface = GetUncachedSurface(gpu_addr, params); - bool modified = false; - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.target != SurfaceTarget::Texture2D) { - failed = true; - break; - } - if (src_params.height != params.height) { - failed = true; - break; - } - if (src_params.block_depth != params.block_depth || - src_params.block_height != params.block_height) { - failed = true; - break; - } - const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); - const auto offsets = params.GetBlockOffsetXYZ(offset); - const auto z = std::get<2>(offsets); - modified |= surface->IsModified(); - const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height, - 1); - ImageCopy(surface, new_surface, copy_params); - } - if (failed) { - return std::nullopt; - } - for (const auto& surface : overlaps) { - Unregister(surface); - } - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - auto view = new_surface->GetMainView(); - return {{std::move(new_surface), view}}; - } else { + if (params.target != SurfaceTarget::Texture3D) { for (const auto& surface : overlaps) { if (!surface->MatchTarget(params.target)) { if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { @@ -791,11 +748,60 @@ private: continue; } if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { - return {{surface, surface->GetMainView()}}; + return std::make_pair(surface, surface->GetMainView()); } } return InitializeSurface(gpu_addr, params, preserve_contents); } + + if (params.num_levels > 1) { + // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach + return std::nullopt; + } + + if (overlaps.size() == 1) { + const auto& surface = overlaps[0]; + const SurfaceParams& overlap_params = surface->GetSurfaceParams(); + // Don't attempt to render to textures with more than one level for now + // The texture has to be to the right or the sample address if we want to render to it + if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) { + const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr()); + const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + if (slice < overlap_params.depth) { + auto view = surface->Emplace3DView(slice, params.depth, 0, 1); + return std::make_pair(std::move(surface), std::move(view)); + } + } + } + + TSurface new_surface = GetUncachedSurface(gpu_addr, params); + bool modified = false; + + for (auto& surface : overlaps) { + const SurfaceParams& src_params = surface->GetSurfaceParams(); + if (src_params.target != SurfaceTarget::Texture2D || + src_params.height != params.height || + src_params.block_depth != params.block_depth || + src_params.block_height != params.block_height) { + return std::nullopt; + } + modified |= surface->IsModified(); + + const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); + const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + const u32 width = params.width; + const u32 height = params.height; + const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); + TryCopyImage(surface, new_surface, copy_params); + } + for (const auto& surface : overlaps) { + Unregister(surface); + } + new_surface->MarkAsModified(modified, Tick()); + Register(new_surface); + + TView view = new_surface->GetMainView(); + return std::make_pair(std::move(new_surface), std::move(view)); } /** @@ -873,7 +879,7 @@ private: } } - // Check if it's a 3D texture + // Manage 3D textures if (params.block_depth > 0) { auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); @@ -1048,7 +1054,7 @@ private: void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params, const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) { auto deduced_src = DeduceSurface(src_gpu_addr, src_params); - auto deduced_dst = DeduceSurface(src_gpu_addr, src_params); + auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params); if (deduced_src.Failed() || deduced_dst.Failed()) { return; } @@ -1187,6 +1193,19 @@ private: return {}; } + /// Try to do an image copy logging when formats are incompatible. + void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) { + const SurfaceParams& src_params = src->GetSurfaceParams(); + const SurfaceParams& dst_params = dst->GetSurfaceParams(); + if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) { + LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}", + static_cast<int>(dst_params.pixel_format), + static_cast<int>(src_params.pixel_format)); + return; + } + ImageCopy(src, dst, copy); + } + constexpr PixelFormat GetSiblingFormat(PixelFormat format) const { return siblings_table[static_cast<std::size_t>(format)]; } @@ -1236,6 +1255,7 @@ private: VideoCore::RasterizerInterface& rasterizer; FormatLookupTable format_lookup_table; + FormatCompatibility format_compatibility; u64 ticks{}; |
