Diffstat (limited to 'src/video_core')
220 files changed, 20450 insertions, 9452 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 258d58eba..abcee2a1c 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,13 +1,37 @@ +add_subdirectory(host_shaders) + add_library(video_core STATIC buffer_cache/buffer_block.h buffer_cache/buffer_cache.h + buffer_cache/map_interval.cpp buffer_cache/map_interval.h + cdma_pusher.cpp + cdma_pusher.h + command_classes/codecs/codec.cpp + command_classes/codecs/codec.h + command_classes/codecs/h264.cpp + command_classes/codecs/h264.h + command_classes/codecs/vp9.cpp + command_classes/codecs/vp9.h + command_classes/codecs/vp9_types.h + command_classes/host1x.cpp + command_classes/host1x.h + command_classes/nvdec.cpp + command_classes/nvdec.h + command_classes/nvdec_common.h + command_classes/sync_manager.cpp + command_classes/sync_manager.h + command_classes/vic.cpp + command_classes/vic.h + compatible_formats.cpp + compatible_formats.h dirty_flags.cpp dirty_flags.h dma_pusher.cpp dma_pusher.h engines/const_buffer_engine_interface.h engines/const_buffer_info.h + engines/engine_interface.h engines/engine_upload.cpp engines/engine_upload.h engines/fermi_2d.cpp @@ -23,6 +47,15 @@ add_library(video_core STATIC engines/shader_bytecode.h engines/shader_header.h engines/shader_type.h + macro/macro.cpp + macro/macro.h + macro/macro_hle.cpp + macro/macro_hle.h + macro/macro_interpreter.cpp + macro/macro_interpreter.h + macro/macro_jit_x64.cpp + macro/macro_jit_x64.h + fence_manager.h gpu.cpp gpu.h gpu_asynch.cpp @@ -33,8 +66,6 @@ add_library(video_core STATIC gpu_thread.h guest_driver.cpp guest_driver.h - macro_interpreter.cpp - macro_interpreter.h memory_manager.cpp memory_manager.h morton.cpp @@ -42,15 +73,17 @@ add_library(video_core STATIC query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h - rasterizer_cache.cpp - rasterizer_cache.h rasterizer_interface.h renderer_base.cpp renderer_base.h + renderer_opengl/gl_arb_decompiler.cpp + renderer_opengl/gl_arb_decompiler.h renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_device.cpp renderer_opengl/gl_device.h + renderer_opengl/gl_fence_manager.cpp + renderer_opengl/gl_fence_manager.h renderer_opengl/gl_framebuffer_cache.cpp renderer_opengl/gl_framebuffer_cache.h renderer_opengl/gl_rasterizer.cpp @@ -84,6 +117,9 @@ add_library(video_core STATIC renderer_opengl/utils.h sampler_cache.cpp sampler_cache.h + shader_cache.h + shader_notify.cpp + shader_notify.h shader/decode/arithmetic.cpp shader/decode/arithmetic_immediate.cpp shader/decode/bfe.cpp @@ -114,6 +150,8 @@ add_library(video_core STATIC shader/decode/other.cpp shader/ast.cpp shader/ast.h + shader/async_shaders.cpp + shader/async_shaders.h shader/compiler_settings.cpp shader/compiler_settings.h shader/control_flow.cpp @@ -121,6 +159,8 @@ add_library(video_core STATIC shader/decode.cpp shader/expr.cpp shader/expr.h + shader/memory_util.cpp + shader/memory_util.h shader/node_helper.cpp shader/node_helper.h shader/node.h @@ -160,12 +200,16 @@ if (ENABLE_VULKAN) renderer_vulkan/fixed_pipeline_state.h renderer_vulkan/maxwell_to_vk.cpp renderer_vulkan/maxwell_to_vk.h + renderer_vulkan/nsight_aftermath_tracker.cpp + renderer_vulkan/nsight_aftermath_tracker.h renderer_vulkan/renderer_vulkan.h renderer_vulkan/renderer_vulkan.cpp renderer_vulkan/vk_blit_screen.cpp renderer_vulkan/vk_blit_screen.h renderer_vulkan/vk_buffer_cache.cpp renderer_vulkan/vk_buffer_cache.h + renderer_vulkan/vk_command_pool.cpp + renderer_vulkan/vk_command_pool.h 
renderer_vulkan/vk_compute_pass.cpp renderer_vulkan/vk_compute_pass.h renderer_vulkan/vk_compute_pipeline.cpp @@ -174,10 +218,14 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_descriptor_pool.h renderer_vulkan/vk_device.cpp renderer_vulkan/vk_device.h + renderer_vulkan/vk_fence_manager.cpp + renderer_vulkan/vk_fence_manager.h renderer_vulkan/vk_graphics_pipeline.cpp renderer_vulkan/vk_graphics_pipeline.h renderer_vulkan/vk_image.cpp renderer_vulkan/vk_image.h + renderer_vulkan/vk_master_semaphore.cpp + renderer_vulkan/vk_master_semaphore.h renderer_vulkan/vk_memory_manager.cpp renderer_vulkan/vk_memory_manager.h renderer_vulkan/vk_pipeline_cache.cpp @@ -188,8 +236,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_rasterizer.h renderer_vulkan/vk_renderpass_cache.cpp renderer_vulkan/vk_renderpass_cache.h - renderer_vulkan/vk_resource_manager.cpp - renderer_vulkan/vk_resource_manager.h + renderer_vulkan/vk_resource_pool.cpp + renderer_vulkan/vk_resource_pool.h renderer_vulkan/vk_sampler_cache.cpp renderer_vulkan/vk_sampler_cache.h renderer_vulkan/vk_scheduler.cpp @@ -213,21 +261,55 @@ if (ENABLE_VULKAN) renderer_vulkan/wrapper.cpp renderer_vulkan/wrapper.h ) - - target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) - target_compile_definitions(video_core PRIVATE HAS_VULKAN) endif() create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) -target_link_libraries(video_core PRIVATE glad) +target_link_libraries(video_core PRIVATE glad xbyak) + +if (MSVC) + target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) + target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib) +else() + target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) + target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES}) +endif() + +add_dependencies(video_core host_shaders) +target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) + if (ENABLE_VULKAN) + target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) + target_compile_definitions(video_core PRIVATE HAS_VULKAN) target_link_libraries(video_core PRIVATE sirit) endif() +if (ENABLE_NSIGHT_AFTERMATH) + if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK}) + message(ERROR "Environment variable NSIGHT_AFTERMATH_SDK has to be provided") + endif() + if (NOT WIN32) + message(ERROR "Nsight Aftermath doesn't support non-Windows platforms") + endif() + target_compile_definitions(video_core PRIVATE HAS_NSIGHT_AFTERMATH) + target_include_directories(video_core PRIVATE "$ENV{NSIGHT_AFTERMATH_SDK}/include") +endif() + if (MSVC) target_compile_options(video_core PRIVATE /we4267) else() - target_compile_options(video_core PRIVATE -Werror=conversion -Wno-error=sign-conversion) + target_compile_options(video_core PRIVATE + -Werror=conversion + -Wno-error=sign-conversion + -Werror=pessimizing-move + -Werror=redundant-move + -Werror=switch + -Werror=type-limits + -Werror=unused-variable + + $<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable> + ) endif() diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h index e35ee0b67..e64170e66 100644 --- a/src/video_core/buffer_cache/buffer_block.h +++ b/src/video_core/buffer_cache/buffer_block.h @@ -15,48 +15,47 @@ namespace VideoCommon { class BufferBlock { public: - bool 
Overlaps(const VAddr start, const VAddr end) const { + bool Overlaps(VAddr start, VAddr end) const { return (cpu_addr < end) && (cpu_addr_end > start); } - bool IsInside(const VAddr other_start, const VAddr other_end) const { + bool IsInside(VAddr other_start, VAddr other_end) const { return cpu_addr <= other_start && other_end <= cpu_addr_end; } - std::size_t GetOffset(const VAddr in_addr) { + std::size_t Offset(VAddr in_addr) const { return static_cast<std::size_t>(in_addr - cpu_addr); } - VAddr GetCpuAddr() const { + VAddr CpuAddr() const { return cpu_addr; } - VAddr GetCpuAddrEnd() const { + VAddr CpuAddrEnd() const { return cpu_addr_end; } - void SetCpuAddr(const VAddr new_addr) { + void SetCpuAddr(VAddr new_addr) { cpu_addr = new_addr; cpu_addr_end = new_addr + size; } - std::size_t GetSize() const { + std::size_t Size() const { return size; } - void SetEpoch(u64 new_epoch) { - epoch = new_epoch; + u64 Epoch() const { + return epoch; } - u64 GetEpoch() { - return epoch; + void SetEpoch(u64 new_epoch) { + epoch = new_epoch; } protected: - explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} { - SetCpuAddr(cpu_addr); + explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} { + SetCpuAddr(cpu_addr_); } - ~BufferBlock() = default; private: VAddr cpu_addr{}; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b57c0d4d4..e7edd733f 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -4,7 +4,7 @@ #pragma once -#include <array> +#include <list> #include <memory> #include <mutex> #include <unordered_map> @@ -12,14 +12,17 @@ #include <utility> #include <vector> -#include <boost/icl/interval_map.hpp> +#include <boost/container/small_vector.hpp> #include <boost/icl/interval_set.hpp> -#include <boost/range/iterator_range.hpp> +#include <boost/intrusive/set.hpp> #include "common/alignment.h" +#include "common/assert.h" #include "common/common_types.h" +#include "common/logging/log.h" #include "core/core.h" #include "core/memory.h" +#include "core/settings.h" #include "video_core/buffer_cache/buffer_block.h" #include "video_core/buffer_cache/map_interval.h" #include "video_core/memory_manager.h" @@ -27,105 +30,122 @@ namespace VideoCommon { -using MapInterval = std::shared_ptr<MapIntervalBase>; - -template <typename TBuffer, typename TBufferType, typename StreamBuffer> +template <typename Buffer, typename BufferType, typename StreamBuffer> class BufferCache { + using IntervalSet = boost::icl::interval_set<VAddr>; + using IntervalType = typename IntervalSet::interval_type; + using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>; + + static constexpr u64 WRITE_PAGE_BIT = 11; + static constexpr u64 BLOCK_PAGE_BITS = 21; + static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; + public: - using BufferInfo = std::pair<const TBufferType*, u64>; + struct BufferInfo { + BufferType handle; + u64 offset; + u64 address; + }; BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, bool is_written = false, bool use_fast_cbuf = false) { std::lock_guard lock{mutex}; - const std::optional<VAddr> cpu_addr_opt = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); - - if (!cpu_addr_opt) { - return {GetEmptyBuffer(size), 0}; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + return GetEmptyBuffer(size); } - VAddr cpu_addr = *cpu_addr_opt; - // Cache management 
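The reworked BufferBlock keeps a half-open CPU range [cpu_addr, cpu_addr_end) and exposes Overlaps/IsInside predicates that the cache uses to match uploads against existing blocks. A minimal standalone sketch of those two checks, using hypothetical free functions rather than the class from the change:

// Illustrative sketch of BufferBlock's half-open interval checks; the free
// functions and main() are hypothetical, not part of the committed code.
#include <cassert>
#include <cstdint>

using VAddr = std::uint64_t;

// True when [start, end) intersects [block_start, block_end).
bool Overlaps(VAddr block_start, VAddr block_end, VAddr start, VAddr end) {
    return block_start < end && start < block_end;
}

// True when [other_start, other_end) is fully contained in [block_start, block_end).
bool IsInside(VAddr block_start, VAddr block_end, VAddr other_start, VAddr other_end) {
    return block_start <= other_start && other_end <= block_end;
}

int main() {
    // Block covering [0x1000, 0x3000)
    assert(Overlaps(0x1000, 0x3000, 0x2000, 0x4000));  // partial overlap
    assert(!Overlaps(0x1000, 0x3000, 0x3000, 0x4000)); // touching the end is not an overlap
    assert(IsInside(0x1000, 0x3000, 0x1800, 0x2000));  // fully contained
    return 0;
}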
is a big overhead, so only cache entries with a given size. // TODO: Figure out which size is the best for given games. constexpr std::size_t max_stream_size = 0x800; if (use_fast_cbuf || size < max_stream_size) { - if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) { - auto& memory_manager = system.GPU().MemoryManager(); + if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { + const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size); if (use_fast_cbuf) { - if (memory_manager.IsGranularRange(gpu_addr, size)) { - const auto host_ptr = memory_manager.GetPointer(gpu_addr); - return ConstBufferUpload(host_ptr, size); + u8* dest; + if (is_granular) { + dest = gpu_memory.GetPointer(gpu_addr); } else { staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - return ConstBufferUpload(staging_buffer.data(), size); + dest = staging_buffer.data(); + gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); } + return ConstBufferUpload(dest, size); + } + if (is_granular) { + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); + return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { + std::memcpy(dest, host_ptr, size); + }); } else { - if (memory_manager.IsGranularRange(gpu_addr, size)) { - const auto host_ptr = memory_manager.GetPointer(gpu_addr); - return StreamBufferUpload(host_ptr, size, alignment); - } else { - staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - return StreamBufferUpload(staging_buffer.data(), size, alignment); - } + return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) { + gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); + }); } } } - auto block = GetBlock(cpu_addr, size); - auto map = MapAddress(block, gpu_addr, cpu_addr, size); + Buffer* const block = GetBlock(*cpu_addr, size); + MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size); + if (!map) { + return GetEmptyBuffer(size); + } if (is_written) { map->MarkAsModified(true, GetModifiedTicks()); - if (!map->IsWritten()) { - map->MarkAsWritten(true); - MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); + if (Settings::IsGPULevelHigh() && + Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + MarkForAsyncFlush(map); } - } else { - if (map->IsWritten()) { - WriteBarrier(); + if (!map->is_written) { + map->is_written = true; + MarkRegionAsWritten(map->start, map->end - 1); } } - const u64 offset = static_cast<u64>(block->GetOffset(cpu_addr)); - - return {ToHandle(block), offset}; + return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; } /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. 
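UploadMemory now hands StreamBufferUpload a callable that fills the mapped destination, instead of always taking a raw source pointer, so the copy (memcpy for granular ranges, ReadBlockUnsafe otherwise) is deferred until the stream-buffer slot has been reserved. A condensed sketch of that pattern under assumed names (StreamRing and its fields are stand-ins, not the real stream buffer):

// Sketch of the callable-based stream upload pattern; illustrative only.
#include <cstddef>
#include <cstring>
#include <vector>

struct StreamRing {
    std::vector<unsigned char> storage = std::vector<unsigned char>(0x10000);
    std::size_t offset = 0;

    // Reserve an aligned slot, let the caller fill it, return the slot offset.
    template <typename Callable>
    std::size_t Upload(std::size_t size, std::size_t alignment, Callable&& fill) {
        offset = (offset + alignment - 1) & ~(alignment - 1); // align the write cursor
        const std::size_t uploaded_offset = offset;
        fill(storage.data() + uploaded_offset); // the copy happens only here
        offset += size;
        return uploaded_offset;
    }
};

int main() {
    StreamRing ring;
    const char payload[] = "constant buffer data";
    // The callable captures the source, mirroring the memcpy path in UploadMemory.
    ring.Upload(sizeof(payload), 4, [&](unsigned char* dest) {
        std::memcpy(dest, payload, sizeof(payload));
    });
    return 0;
}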
BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4) { std::lock_guard lock{mutex}; - return StreamBufferUpload(raw_pointer, size, alignment); + return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) { + std::memcpy(dest, raw_pointer, size); + }); } - void Map(std::size_t max_size) { + /// Prepares the buffer cache for data uploading + /// @param max_size Maximum number of bytes that will be uploaded + /// @return True when a stream buffer invalidation was required, false otherwise + bool Map(std::size_t max_size) { std::lock_guard lock{mutex}; + bool invalidated; std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); buffer_offset = buffer_offset_base; + + return invalidated; } - /// Finishes the upload stream, returns true on bindings invalidation. - bool Unmap() { + /// Finishes the upload stream + void Unmap() { std::lock_guard lock{mutex}; - stream_buffer->Unmap(buffer_offset - buffer_offset_base); - return std::exchange(invalidated, false); } + /// Function called at the end of each frame, inteded for deferred operations void TickFrame() { ++epoch; + while (!pending_destruction.empty()) { // Delay at least 4 frames before destruction. // This is due to triple buffering happening on some drivers. static constexpr u64 epochs_to_destroy = 5; - if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) { + if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { break; } - pending_destruction.pop_front(); + pending_destruction.pop(); } } @@ -133,117 +153,193 @@ public: void FlushRegion(VAddr addr, std::size_t size) { std::lock_guard lock{mutex}; - std::vector<MapInterval> objects = GetMapsInRange(addr, size); - std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) { - return a->GetModificationTick() < b->GetModificationTick(); - }); - for (auto& object : objects) { - if (object->IsModified() && object->IsRegistered()) { + VectorMapInterval objects = GetMapsInRange(addr, size); + std::sort(objects.begin(), objects.end(), + [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; }); + for (MapInterval* object : objects) { + if (object->is_modified && object->is_registered) { + mutex.unlock(); FlushMap(object); + mutex.lock(); } } } + bool MustFlushRegion(VAddr addr, std::size_t size) { + std::lock_guard lock{mutex}; + + const VectorMapInterval objects = GetMapsInRange(addr, size); + return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) { + return map->is_modified && map->is_registered; + }); + } + /// Mark the specified region as being invalidated void InvalidateRegion(VAddr addr, u64 size) { std::lock_guard lock{mutex}; - std::vector<MapInterval> objects = GetMapsInRange(addr, size); - for (auto& object : objects) { - if (object->IsRegistered()) { + for (auto& object : GetMapsInRange(addr, size)) { + if (object->is_registered) { Unregister(object); } } } - virtual const TBufferType* GetEmptyBuffer(std::size_t size) = 0; + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{mutex}; -protected: - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - std::unique_ptr<StreamBuffer> stream_buffer) - : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)}, - stream_buffer_handle{this->stream_buffer->GetHandle()} {} + for (MapInterval* object : GetMapsInRange(addr, size)) { + if (object->is_memory_marked && 
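TickFrame drains a queue of retired buffers, but only once they are at least five frames old, so memory a triple-buffering driver may still reference is not freed too early. A minimal sketch of that epoch bookkeeping; Resource and DeferredReclaimer are hypothetical names:

// Epoch-delayed destruction sketch, following the TickFrame logic above.
#include <cstdint>
#include <memory>
#include <queue>

struct Resource {
    std::uint64_t epoch = 0; // frame in which the resource was retired
};

class DeferredReclaimer {
public:
    void Queue(std::shared_ptr<Resource> res) {
        res->epoch = epoch;
        pending.push(std::move(res));
    }

    // Called once per frame; destroys resources retired five or more frames ago.
    void TickFrame() {
        ++epoch;
        static constexpr std::uint64_t epochs_to_destroy = 5;
        while (!pending.empty()) {
            if (pending.front()->epoch + epochs_to_destroy > epoch) {
                break; // still potentially in flight on the GPU/driver
            }
            pending.pop(); // last shared_ptr reference dropped here
        }
    }

private:
    std::uint64_t epoch = 0;
    std::queue<std::shared_ptr<Resource>> pending;
};

int main() {
    DeferredReclaimer reclaimer;
    reclaimer.Queue(std::make_shared<Resource>());
    for (int i = 0; i < 6; ++i) {
        reclaimer.TickFrame(); // the queued resource is released on the 5th tick
    }
    return 0;
}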
object->is_registered) { + UnmarkMemory(object); + object->is_sync_pending = true; + marked_for_unregister.emplace_back(object); + } + } + } - ~BufferCache() = default; + void SyncGuestHost() { + std::lock_guard lock{mutex}; + + for (auto& object : marked_for_unregister) { + if (object->is_registered) { + object->is_sync_pending = false; + Unregister(object); + } + } + marked_for_unregister.clear(); + } + + void CommitAsyncFlushes() { + if (uncommitted_flushes) { + auto commit_list = std::make_shared<std::list<MapInterval*>>(); + for (MapInterval* map : *uncommitted_flushes) { + if (map->is_registered && map->is_modified) { + // TODO(Blinkhawk): Implement backend asynchronous flushing + // AsyncFlushMap(map) + commit_list->push_back(map); + } + } + if (!commit_list->empty()) { + committed_flushes.push_back(commit_list); + } else { + committed_flushes.emplace_back(); + } + } else { + committed_flushes.emplace_back(); + } + uncommitted_flushes.reset(); + } - virtual const TBufferType* ToHandle(const TBuffer& storage) = 0; + bool ShouldWaitAsyncFlushes() const { + return !committed_flushes.empty() && committed_flushes.front() != nullptr; + } - virtual void WriteBarrier() = 0; + bool HasUncommittedFlushes() const { + return uncommitted_flushes != nullptr; + } + + void PopAsyncFlushes() { + if (committed_flushes.empty()) { + return; + } + auto& flush_list = committed_flushes.front(); + if (!flush_list) { + committed_flushes.pop_front(); + return; + } + for (MapInterval* map : *flush_list) { + if (map->is_registered) { + // TODO(Blinkhawk): Replace this for reading the asynchronous flush + FlushMap(map); + } + } + committed_flushes.pop_front(); + } - virtual TBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0; + virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; - virtual void UploadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size, - const u8* data) = 0; +protected: + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + std::unique_ptr<StreamBuffer> stream_buffer_) + : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, + stream_buffer{std::move(stream_buffer_)}, stream_buffer_handle{stream_buffer->Handle()} {} - virtual void DownloadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size, - u8* data) = 0; + ~BufferCache() = default; - virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) = 0; + virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0; virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { return {}; } /// Register an object into the cache - void Register(const MapInterval& new_map, bool inherit_written = false) { - const VAddr cpu_addr = new_map->GetStart(); + MapInterval* Register(MapInterval new_map, bool inherit_written = false) { + const VAddr cpu_addr = new_map.start; if (!cpu_addr) { LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", - new_map->GetGpuAddress()); - return; + new_map.gpu_addr); + return nullptr; } - const std::size_t size = new_map->GetEnd() - new_map->GetStart(); - new_map->MarkAsRegistered(true); - const IntervalType interval{new_map->GetStart(), new_map->GetEnd()}; - mapped_addresses.insert({interval, new_map}); + const std::size_t size = new_map.end - new_map.start; + new_map.is_registered = true; 
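The async-flush plumbing keeps an uncommitted set of modified maps; CommitAsyncFlushes snapshots it into a per-commit list (or an empty placeholder so ordering is preserved), and PopAsyncFlushes later flushes whatever in that list is still registered. A reduced sketch of the two-stage queue, with a plain int standing in for MapInterval* and console output standing in for FlushMap:

// Two-stage flush bookkeeping sketch (uncommitted set -> committed lists); illustrative.
#include <iostream>
#include <list>
#include <memory>
#include <unordered_set>

class AsyncFlushQueue {
public:
    void MarkForFlush(int map) {
        if (!uncommitted) {
            uncommitted = std::make_shared<std::unordered_set<int>>();
        }
        uncommitted->insert(map);
    }

    // Freeze the currently marked maps into one commit entry.
    void Commit() {
        if (uncommitted && !uncommitted->empty()) {
            committed.emplace_back(
                std::make_shared<std::list<int>>(uncommitted->begin(), uncommitted->end()));
        } else {
            committed.emplace_back(); // keep ordering even when nothing was marked
        }
        uncommitted.reset();
    }

    // Flush the oldest commit entry, if any.
    void Pop() {
        if (committed.empty()) {
            return;
        }
        auto& list = committed.front();
        if (list) {
            for (const int map : *list) {
                std::cout << "flush map " << map << '\n'; // stand-in for FlushMap()
            }
        }
        committed.pop_front();
    }

private:
    std::shared_ptr<std::unordered_set<int>> uncommitted;
    std::list<std::shared_ptr<std::list<int>>> committed;
};

int main() {
    AsyncFlushQueue queue;
    queue.MarkForFlush(1);
    queue.MarkForFlush(2);
    queue.Commit();
    queue.Pop();
    return 0;
}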
rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); + new_map.is_memory_marked = true; if (inherit_written) { - MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1); - new_map->MarkAsWritten(true); + MarkRegionAsWritten(new_map.start, new_map.end - 1); + new_map.is_written = true; } + MapInterval* const storage = mapped_addresses_allocator.Allocate(); + *storage = new_map; + mapped_addresses.insert(*storage); + return storage; } - /// Unregisters an object from the cache - void Unregister(MapInterval& map) { - const std::size_t size = map->GetEnd() - map->GetStart(); - rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1); - map->MarkAsRegistered(false); - if (map->IsWritten()) { - UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); + void UnmarkMemory(MapInterval* map) { + if (!map->is_memory_marked) { + return; } - const IntervalType delete_interval{map->GetStart(), map->GetEnd()}; - mapped_addresses.erase(delete_interval); + const std::size_t size = map->end - map->start; + rasterizer.UpdatePagesCachedCount(map->start, size, -1); + map->is_memory_marked = false; } -private: - MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) { - return std::make_shared<MapIntervalBase>(start, end, gpu_addr); + /// Unregisters an object from the cache + void Unregister(MapInterval* map) { + UnmarkMemory(map); + map->is_registered = false; + if (map->is_sync_pending) { + map->is_sync_pending = false; + marked_for_unregister.remove(map); + } + if (map->is_written) { + UnmarkRegionAsWritten(map->start, map->end - 1); + } + const auto it = mapped_addresses.find(*map); + ASSERT(it != mapped_addresses.end()); + mapped_addresses.erase(it); + mapped_addresses_allocator.Release(map); } - MapInterval MapAddress(const TBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr, - const std::size_t size) { - - std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size); +private: + MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { + const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); if (overlaps.empty()) { - auto& memory_manager = system.GPU().MemoryManager(); const VAddr cpu_addr_end = cpu_addr + size; - MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr); - if (memory_manager.IsGranularRange(gpu_addr, size)) { - u8* host_ptr = memory_manager.GetPointer(gpu_addr); - UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr); + if (gpu_memory.IsGranularRange(gpu_addr, size)) { + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); + block->Upload(block->Offset(cpu_addr), size, host_ptr); } else { staging_buffer.resize(size); - memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data()); + gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); + block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); } - Register(new_map); - return new_map; + return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); } const VAddr cpu_addr_end = cpu_addr + size; if (overlaps.size() == 1) { - MapInterval& current_map = overlaps[0]; + MapInterval* const current_map = overlaps[0]; if (current_map->IsInside(cpu_addr, cpu_addr_end)) { return current_map; } @@ -253,57 +349,70 @@ private: bool write_inheritance = false; bool modified_inheritance = false; // Calculate new buffer parameters - for (auto& overlap : overlaps) { - new_start = std::min(overlap->GetStart(), 
new_start); - new_end = std::max(overlap->GetEnd(), new_end); - write_inheritance |= overlap->IsWritten(); - modified_inheritance |= overlap->IsModified(); + for (MapInterval* overlap : overlaps) { + new_start = std::min(overlap->start, new_start); + new_end = std::max(overlap->end, new_end); + write_inheritance |= overlap->is_written; + modified_inheritance |= overlap->is_modified; } GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; for (auto& overlap : overlaps) { Unregister(overlap); } UpdateBlock(block, new_start, new_end, overlaps); - MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr); + + const MapInterval new_map{new_start, new_end, new_gpu_addr}; + MapInterval* const map = Register(new_map, write_inheritance); + if (!map) { + return nullptr; + } if (modified_inheritance) { - new_map->MarkAsModified(true, GetModifiedTicks()); + map->MarkAsModified(true, GetModifiedTicks()); + if (Settings::IsGPULevelHigh() && + Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + MarkForAsyncFlush(map); + } } - Register(new_map, write_inheritance); - return new_map; + return map; } - void UpdateBlock(const TBuffer& block, VAddr start, VAddr end, - std::vector<MapInterval>& overlaps) { + void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { const IntervalType base_interval{start, end}; IntervalSet interval_set{}; interval_set.add(base_interval); for (auto& overlap : overlaps) { - const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()}; + const IntervalType subtract{overlap->start, overlap->end}; interval_set.subtract(subtract); } for (auto& interval : interval_set) { - std::size_t size = interval.upper() - interval.lower(); - if (size > 0) { - staging_buffer.resize(size); - system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); - UploadBlockData(block, block->GetOffset(interval.lower()), size, - staging_buffer.data()); + const std::size_t size = interval.upper() - interval.lower(); + if (size == 0) { + continue; } + staging_buffer.resize(size); + cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); + block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); } } - std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) { + VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { + VectorMapInterval result; if (size == 0) { - return {}; + return result; } - std::vector<MapInterval> objects{}; - const IntervalType interval{addr, addr + size}; - for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) { - objects.push_back(pair.second); + const VAddr addr_end = addr + size; + auto it = mapped_addresses.lower_bound(addr); + if (it != mapped_addresses.begin()) { + --it; } - - return objects; + while (it != mapped_addresses.end() && it->start < addr_end) { + if (it->Overlaps(addr, addr_end)) { + result.push_back(&*it); + } + ++it; + } + return result; } /// Returns a ticks counter used for tracking when cached objects were last modified @@ -311,24 +420,28 @@ private: return ++modified_ticks; } - void FlushMap(MapInterval map) { - std::size_t size = map->GetEnd() - map->GetStart(); - TBuffer block = blocks[map->GetStart() >> block_page_bits]; + void FlushMap(MapInterval* map) { + const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); + ASSERT_OR_EXECUTE(it != blocks.end(), return;); + + std::shared_ptr<Buffer> block = it->second; + + const std::size_t size = map->end - map->start; 
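GetMapsInRange now walks a container ordered by each map's start address: lower_bound on the query start, step back one entry so a map that begins before the range is not missed, then scan forward while starts stay below the query end. A sketch of the same lookup over a std::map (the boost::intrusive::set in the change behaves equivalently for this query; names here are illustrative):

// Overlap query over an ordered container keyed by interval start.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

using VAddr = std::uint64_t;

struct Interval {
    VAddr start;
    VAddr end; // half-open
};

std::vector<Interval> GetMapsInRange(const std::map<VAddr, Interval>& maps, VAddr addr,
                                     std::size_t size) {
    std::vector<Interval> result;
    if (size == 0) {
        return result;
    }
    const VAddr addr_end = addr + size;
    auto it = maps.lower_bound(addr);
    if (it != maps.begin()) {
        --it; // the previous interval may still reach into [addr, addr_end)
    }
    while (it != maps.end() && it->second.start < addr_end) {
        if (addr < it->second.end) {
            result.push_back(it->second);
        }
        ++it;
    }
    return result;
}

int main() {
    std::map<VAddr, Interval> maps;
    maps.emplace(0x1000, Interval{0x1000, 0x2000});
    maps.emplace(0x4000, Interval{0x4000, 0x5000});
    // Query [0x1800, 0x4800): both intervals overlap it.
    std::cout << GetMapsInRange(maps, 0x1800, 0x3000).size() << '\n'; // prints 2
    return 0;
}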
staging_buffer.resize(size); - DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data()); - system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size); + block->Download(block->Offset(map->start), size, staging_buffer.data()); + cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size); map->MarkAsModified(false, 0); } - BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size, - std::size_t alignment) { + template <typename Callable> + BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { AlignBuffer(alignment); const std::size_t uploaded_offset = buffer_offset; - std::memcpy(buffer_ptr, raw_pointer, size); + callable(buffer_ptr); buffer_ptr += size; buffer_offset += size; - return {&stream_buffer_handle, uploaded_offset}; + return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()}; } void AlignBuffer(std::size_t alignment) { @@ -338,151 +451,148 @@ private: buffer_offset = offset_aligned; } - TBuffer EnlargeBlock(TBuffer buffer) { - const std::size_t old_size = buffer->GetSize(); - const std::size_t new_size = old_size + block_page_size; - const VAddr cpu_addr = buffer->GetCpuAddr(); - TBuffer new_buffer = CreateBlock(cpu_addr, new_size); - CopyBlock(buffer, new_buffer, 0, 0, old_size); - buffer->SetEpoch(epoch); - pending_destruction.push_back(buffer); + std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) { + const std::size_t old_size = buffer->Size(); + const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; + const VAddr cpu_addr = buffer->CpuAddr(); + std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size); + new_buffer->CopyFrom(*buffer, 0, 0, old_size); + QueueDestruction(std::move(buffer)); + const VAddr cpu_addr_end = cpu_addr + new_size - 1; - u64 page_start = cpu_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { - blocks[page_start] = new_buffer; - ++page_start; + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { + blocks.insert_or_assign(page_start, new_buffer); } + return new_buffer; } - TBuffer MergeBlocks(TBuffer first, TBuffer second) { - const std::size_t size_1 = first->GetSize(); - const std::size_t size_2 = second->GetSize(); - const VAddr first_addr = first->GetCpuAddr(); - const VAddr second_addr = second->GetCpuAddr(); + std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first, + std::shared_ptr<Buffer> second) { + const std::size_t size_1 = first->Size(); + const std::size_t size_2 = second->Size(); + const VAddr first_addr = first->CpuAddr(); + const VAddr second_addr = second->CpuAddr(); const VAddr new_addr = std::min(first_addr, second_addr); const std::size_t new_size = size_1 + size_2; - TBuffer new_buffer = CreateBlock(new_addr, new_size); - CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1); - CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2); - first->SetEpoch(epoch); - second->SetEpoch(epoch); - pending_destruction.push_back(first); - pending_destruction.push_back(second); + + std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size); + new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); + new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); + QueueDestruction(std::move(first)); + 
QueueDestruction(std::move(second)); + const VAddr cpu_addr_end = new_addr + new_size - 1; - u64 page_start = new_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { - blocks[page_start] = new_buffer; - ++page_start; + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { + blocks.insert_or_assign(page_start, new_buffer); } return new_buffer; } - TBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) { - TBuffer found{}; + Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { + std::shared_ptr<Buffer> found; + const VAddr cpu_addr_end = cpu_addr + size - 1; - u64 page_start = cpu_addr >> block_page_bits; - const u64 page_end = cpu_addr_end >> block_page_bits; - while (page_start <= page_end) { + const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; + for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { auto it = blocks.find(page_start); if (it == blocks.end()) { if (found) { found = EnlargeBlock(found); - } else { - const VAddr start_addr = (page_start << block_page_bits); - found = CreateBlock(start_addr, block_page_size); - blocks[page_start] = found; - } - } else { - if (found) { - if (found == it->second) { - ++page_start; - continue; - } - found = MergeBlocks(found, it->second); - } else { - found = it->second; + continue; } + const VAddr start_addr = page_start << BLOCK_PAGE_BITS; + found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); + blocks.insert_or_assign(page_start, found); + continue; + } + if (!found) { + found = it->second; + continue; + } + if (found != it->second) { + found = MergeBlocks(std::move(found), it->second); } - ++page_start; } - return found; + return found.get(); } - void MarkRegionAsWritten(const VAddr start, const VAddr end) { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { - auto it = written_pages.find(page_start); - if (it != written_pages.end()) { - it->second = it->second + 1; - } else { - written_pages[page_start] = 1; + void MarkRegionAsWritten(VAddr start, VAddr end) { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { + if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { + ++it->second; } - page_start++; } } - void UnmarkRegionAsWritten(const VAddr start, const VAddr end) { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + void UnmarkRegionAsWritten(VAddr start, VAddr end) { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { auto it = written_pages.find(page_start); if (it != written_pages.end()) { if (it->second > 1) { - it->second = it->second - 1; + --it->second; } else { written_pages.erase(it); } } - page_start++; } } - bool IsRegionWritten(const VAddr start, const VAddr end) const { - u64 page_start = start >> write_page_bit; - const u64 page_end = end >> write_page_bit; - while (page_start <= page_end) { + bool IsRegionWritten(VAddr start, VAddr end) const { + const u64 page_end = end >> WRITE_PAGE_BIT; + for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { if (written_pages.count(page_start) > 0) { return true; } - page_start++; } return false; } + void 
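Blocks are tracked in a page table of 2 MiB pages (BLOCK_PAGE_BITS = 21): GetBlock converts a CPU range into a page-index range, creates a block for pages with no entry, enlarges a block that runs past its page, and merges distinct blocks found in the same range. The page-index arithmetic itself reduces to the following hypothetical helper, shown only for the shift-and-loop pattern:

// Page-index arithmetic for mapping a CPU range onto 2 MiB block pages.
// Illustrative only; the real GetBlock also creates, enlarges and merges blocks.
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr std::uint64_t BLOCK_PAGE_BITS = 21; // 2 MiB pages

void VisitBlockPages(std::uint64_t cpu_addr, std::size_t size) {
    const std::uint64_t cpu_addr_end = cpu_addr + size - 1; // inclusive end
    const std::uint64_t page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
    for (std::uint64_t page = cpu_addr >> BLOCK_PAGE_BITS; page <= page_end; ++page) {
        std::cout << "touch block page " << page << " starting at 0x" << std::hex
                  << (page << BLOCK_PAGE_BITS) << std::dec << '\n';
    }
}

int main() {
    VisitBlockPages(0x1FF000, 0x4000); // crosses a 2 MiB boundary -> pages 0 and 1
    return 0;
}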
QueueDestruction(std::shared_ptr<Buffer> buffer) { + buffer->SetEpoch(epoch); + pending_destruction.push(std::move(buffer)); + } + + void MarkForAsyncFlush(MapInterval* map) { + if (!uncommitted_flushes) { + uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>(); + } + uncommitted_flushes->insert(map); + } + VideoCore::RasterizerInterface& rasterizer; - Core::System& system; + Tegra::MemoryManager& gpu_memory; + Core::Memory::Memory& cpu_memory; std::unique_ptr<StreamBuffer> stream_buffer; - TBufferType stream_buffer_handle{}; - - bool invalidated = false; + BufferType stream_buffer_handle; u8* buffer_ptr = nullptr; u64 buffer_offset = 0; u64 buffer_offset_base = 0; - using IntervalSet = boost::icl::interval_set<VAddr>; - using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>; - using IntervalType = typename IntervalCache::interval_type; - IntervalCache mapped_addresses; + MapIntervalAllocator mapped_addresses_allocator; + boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>> + mapped_addresses; - static constexpr u64 write_page_bit = 11; std::unordered_map<u64, u32> written_pages; + std::unordered_map<u64, std::shared_ptr<Buffer>> blocks; - static constexpr u64 block_page_bits = 21; - static constexpr u64 block_page_size = 1ULL << block_page_bits; - std::unordered_map<u64, TBuffer> blocks; - - std::list<TBuffer> pending_destruction; + std::queue<std::shared_ptr<Buffer>> pending_destruction; u64 epoch = 0; u64 modified_ticks = 0; std::vector<u8> staging_buffer; + std::list<MapInterval*> marked_for_unregister; + + std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes; + std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes; + std::recursive_mutex mutex; }; diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp new file mode 100644 index 000000000..62587e18a --- /dev/null +++ b/src/video_core/buffer_cache/map_interval.cpp @@ -0,0 +1,33 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <array> +#include <cstddef> +#include <memory> + +#include "video_core/buffer_cache/map_interval.h" + +namespace VideoCommon { + +MapIntervalAllocator::MapIntervalAllocator() { + FillFreeList(first_chunk); +} + +MapIntervalAllocator::~MapIntervalAllocator() = default; + +void MapIntervalAllocator::AllocateNewChunk() { + *new_chunk = std::make_unique<Chunk>(); + FillFreeList(**new_chunk); + new_chunk = &(*new_chunk)->next; +} + +void MapIntervalAllocator::FillFreeList(Chunk& chunk) { + const std::size_t old_size = free_list.size(); + free_list.resize(old_size + chunk.data.size()); + std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size, + [](MapInterval& interval) { return &interval; }); +} + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h index b0956029d..fe0bcd1d8 100644 --- a/src/video_core/buffer_cache/map_interval.h +++ b/src/video_core/buffer_cache/map_interval.h @@ -4,86 +4,89 @@ #pragma once +#include <array> +#include <cstddef> +#include <memory> +#include <vector> + +#include <boost/intrusive/set_hook.hpp> + #include "common/common_types.h" #include "video_core/gpu.h" namespace VideoCommon { -class MapIntervalBase { -public: - MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) - : start{start}, end{end}, gpu_addr{gpu_addr} {} +struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> { + MapInterval() = default; - void SetCpuAddress(VAddr new_cpu_addr) { - cpu_addr = new_cpu_addr; - } + /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {} - VAddr GetCpuAddress() const { - return cpu_addr; - } + explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept + : start{start_}, end{end_}, gpu_addr{gpu_addr_} {} - GPUVAddr GetGpuAddress() const { - return gpu_addr; + bool IsInside(VAddr other_start, VAddr other_end) const noexcept { + return start <= other_start && other_end <= end; } - bool IsInside(const VAddr other_start, const VAddr other_end) const { - return (start <= other_start && other_end <= end); + bool Overlaps(VAddr other_start, VAddr other_end) const noexcept { + return start < other_end && other_start < end; } - bool operator==(const MapIntervalBase& rhs) const { - return std::tie(start, end) == std::tie(rhs.start, rhs.end); - } - - bool operator!=(const MapIntervalBase& rhs) const { - return !operator==(rhs); - } - - void MarkAsRegistered(const bool registered) { - is_registered = registered; + void MarkAsModified(bool is_modified_, u64 ticks_) noexcept { + is_modified = is_modified_; + ticks = ticks_; } - bool IsRegistered() const { - return is_registered; - } + boost::intrusive::set_member_hook<> member_hook_; + VAddr start = 0; + VAddr end = 0; + GPUVAddr gpu_addr = 0; + u64 ticks = 0; + bool is_written = false; + bool is_modified = false; + bool is_registered = false; + bool is_memory_marked = false; + bool is_sync_pending = false; +}; - VAddr GetStart() const { - return start; +struct MapIntervalCompare { + constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept { + return lhs.start < rhs.start; } +}; - VAddr GetEnd() const { - return end; +class MapIntervalAllocator { +public: + MapIntervalAllocator(); + ~MapIntervalAllocator(); + + MapInterval* Allocate() { + if (free_list.empty()) { + AllocateNewChunk(); + } + MapInterval* const interval = free_list.back(); + free_list.pop_back(); + return 
interval; } - void MarkAsModified(const bool is_modified_, const u64 tick) { - is_modified = is_modified_; - ticks = tick; + void Release(MapInterval* interval) { + free_list.push_back(interval); } - bool IsModified() const { - return is_modified; - } +private: + struct Chunk { + std::unique_ptr<Chunk> next; + std::array<MapInterval, 0x8000> data; + }; - u64 GetModificationTick() const { - return ticks; - } + void AllocateNewChunk(); - void MarkAsWritten(const bool is_written_) { - is_written = is_written_; - } + void FillFreeList(Chunk& chunk); - bool IsWritten() const { - return is_written; - } + std::vector<MapInterval*> free_list; + std::unique_ptr<Chunk>* new_chunk = &first_chunk.next; -private: - VAddr start; - VAddr end; - GPUVAddr gpu_addr; - VAddr cpu_addr{}; - bool is_written{}; - bool is_modified{}; - bool is_registered{}; - u64 ticks{}; + Chunk first_chunk; }; } // namespace VideoCommon diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp new file mode 100644 index 000000000..b60f86260 --- /dev/null +++ b/src/video_core/cdma_pusher.cpp @@ -0,0 +1,171 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
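MapIntervalAllocator replaces the shared_ptr-per-map scheme: intervals live in 0x8000-element chunks chained through a unique_ptr, and a free list of raw pointers serves Allocate/Release, so map objects never move and need no per-object heap allocation. A trimmed-down sketch of the same pool pattern, with a smaller chunk size and a hypothetical Node type:

// Chunked object pool with a pointer free list, mirroring MapIntervalAllocator; illustrative.
#include <array>
#include <memory>
#include <vector>

struct Node {
    int payload = 0;
};

class PoolAllocator {
public:
    PoolAllocator() {
        FillFreeList(first_chunk);
    }

    Node* Allocate() {
        if (free_list.empty()) {
            AllocateNewChunk();
        }
        Node* const node = free_list.back();
        free_list.pop_back();
        return node;
    }

    void Release(Node* node) {
        free_list.push_back(node); // objects are recycled, never freed individually
    }

private:
    struct Chunk {
        std::unique_ptr<Chunk> next;
        std::array<Node, 1024> data;
    };

    void AllocateNewChunk() {
        *new_chunk = std::make_unique<Chunk>();
        FillFreeList(**new_chunk);
        new_chunk = &(*new_chunk)->next;
    }

    void FillFreeList(Chunk& chunk) {
        for (Node& node : chunk.data) {
            free_list.push_back(&node);
        }
    }

    std::vector<Node*> free_list;
    std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
    Chunk first_chunk;
};

int main() {
    PoolAllocator pool;
    Node* const node = pool.Allocate();
    node->payload = 42;
    pool.Release(node);
    return 0;
}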
+// + +#include "command_classes/host1x.h" +#include "command_classes/nvdec.h" +#include "command_classes/vic.h" +#include "common/bit_util.h" +#include "video_core/cdma_pusher.h" +#include "video_core/command_classes/nvdec_common.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra { +CDmaPusher::CDmaPusher(GPU& gpu) + : gpu(gpu), nvdec_processor(std::make_shared<Nvdec>(gpu)), + vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)), + host1x_processor(std::make_unique<Host1x>(gpu)), + nvdec_sync(std::make_unique<SyncptIncrManager>(gpu)), + vic_sync(std::make_unique<SyncptIncrManager>(gpu)) {} + +CDmaPusher::~CDmaPusher() = default; + +void CDmaPusher::Push(ChCommandHeaderList&& entries) { + cdma_queue.push(std::move(entries)); +} + +void CDmaPusher::DispatchCalls() { + while (!cdma_queue.empty()) { + Step(); + } +} + +void CDmaPusher::Step() { + const auto entries{cdma_queue.front()}; + cdma_queue.pop(); + + std::vector<u32> values(entries.size()); + std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32)); + + for (const u32 value : values) { + if (mask != 0) { + const u32 lbs = Common::CountTrailingZeroes32(mask); + mask &= ~(1U << lbs); + ExecuteCommand(static_cast<u32>(offset + lbs), value); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(static_cast<u32>(offset), value); + if (incrementing) { + ++offset; + } + continue; + } + const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value & 0x3f; + offset = (value >> 16) & 0xfff; + current_class = static_cast<ChClassId>((value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value & 0xffff; + offset = (value >> 16) & 0xfff; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value & 0xffff; + offset = (value >> 16) & 0xfff; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value & 0xfff; + offset = (value >> 16) & 0xfff; + ExecuteCommand(static_cast<u32>(offset), data); + break; + } + default: + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode)); + break; + } + } +} + +void CDmaPusher::ExecuteCommand(u32 offset, u32 data) { + switch (current_class) { + case ChClassId::NvDec: + ThiStateWrite(nvdec_thi_state, offset, {data}); + switch (static_cast<ThiMethod>(offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + nvdec_sync->Increment(syncpoint_id); + } else { + nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id); + nvdec_sync->SignalDone(syncpoint_id); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", + static_cast<u32>(nvdec_thi_state.method_0)); + nvdec_processor->ProcessMethod( + static_cast<Tegra::Nvdec::Method>(nvdec_thi_state.method_0), {data}); + break; + default: + break; + } + break; + case ChClassId::GraphicsVic: + ThiStateWrite(vic_thi_state, static_cast<u32>(offset), {data}); + switch (static_cast<ThiMethod>(offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = 
static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + vic_sync->Increment(syncpoint_id); + } else { + vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id); + vic_sync->SignalDone(syncpoint_id); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", + static_cast<u32>(vic_thi_state.method_0), data); + vic_processor->ProcessMethod(static_cast<Tegra::Vic::Method>(vic_thi_state.method_0), + {data}); + break; + default: + break; + } + break; + case ChClassId::Host1x: + // This device is mainly for syncpoint synchronization + LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); + host1x_processor->ProcessMethod(static_cast<Tegra::Host1x::Method>(offset), {data}); + break; + default: + UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class)); + break; + } +} + +void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + sizeof(u32) * offset; + std::memcpy(state_offset, arguments.data(), sizeof(u32) * arguments.size()); +} + +} // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h new file mode 100644 index 000000000..982f309c5 --- /dev/null +++ b/src/video_core/cdma_pusher.h @@ -0,0 +1,138 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include <queue> + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/command_classes/sync_manager.h" + +namespace Tegra { + +class GPU; +class Nvdec; +class Vic; +class Host1x; + +enum class ChSubmissionMode : u32 { + SetClass = 0, + Incrementing = 1, + NonIncrementing = 2, + Mask = 3, + Immediate = 4, + Restart = 5, + Gather = 6, +}; + +enum class ChClassId : u32 { + NoClass = 0x0, + Host1x = 0x1, + VideoEncodeMpeg = 0x20, + VideoEncodeNvEnc = 0x21, + VideoStreamingVi = 0x30, + VideoStreamingIsp = 0x32, + VideoStreamingIspB = 0x34, + VideoStreamingViI2c = 0x36, + GraphicsVic = 0x5d, + Graphics3D = 0x60, + GraphicsGpu = 0x61, + Tsec = 0xe0, + TsecB = 0xe1, + NvJpg = 0xc0, + NvDec = 0xf0 +}; + +enum class ChMethod : u32 { + Empty = 0, + SetMethod = 0x10, + SetData = 0x11, +}; + +union ChCommandHeader { + u32 raw; + BitField<0, 16, u32> value; + BitField<16, 12, ChMethod> method_offset; + BitField<28, 4, ChSubmissionMode> submission_mode; +}; +static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); + +struct ChCommand { + ChClassId class_id{}; + int method_offset{}; + std::vector<u32> arguments; +}; + +using ChCommandHeaderList = std::vector<Tegra::ChCommandHeader>; +using ChCommandList = std::vector<Tegra::ChCommand>; + +struct ThiRegisters { + u32_le increment_syncpt{}; + INSERT_PADDING_WORDS(1); + u32_le increment_syncpt_error{}; + u32_le ctx_switch_incremement_syncpt{}; + INSERT_PADDING_WORDS(4); + u32_le ctx_switch{}; + INSERT_PADDING_WORDS(1); + u32_le ctx_syncpt_eof{}; + INSERT_PADDING_WORDS(5); + u32_le method_0{}; + u32_le method_1{}; + INSERT_PADDING_WORDS(12); + u32_le int_status{}; + u32_le int_mask{}; +}; + +enum class ThiMethod : u32 { + IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32), + SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32), + SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32), +}; + +class CDmaPusher { +public: + explicit 
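CDmaPusher::Step drives a small state machine over the channel command stream: bits 28-31 of each word select a submission mode, bits 16-27 a register offset, and the low bits carry an immediate value, a word count (Incrementing/NonIncrementing), a write mask (Mask), or a class id in bits 6-15 (SetClass). A self-contained sketch of that header decoding, written with plain shifts instead of the BitField union used in the change:

// Manual decode of a Host1x channel command word matching ChCommandHeader's layout;
// the printout and DecodeWord helper are illustrative only.
#include <cstdint>
#include <iostream>

enum class SubmissionMode : std::uint32_t {
    SetClass = 0,
    Incrementing = 1,
    NonIncrementing = 2,
    Mask = 3,
    Immediate = 4,
};

void DecodeWord(std::uint32_t value) {
    const auto mode = static_cast<SubmissionMode>((value >> 28) & 0xf);
    const std::uint32_t offset = (value >> 16) & 0xfff;
    switch (mode) {
    case SubmissionMode::SetClass:
        std::cout << "switch to class 0x" << std::hex << ((value >> 6) & 0x3ff)
                  << ", initial mask 0x" << (value & 0x3f) << std::dec << '\n';
        break;
    case SubmissionMode::Incrementing:
    case SubmissionMode::NonIncrementing:
        std::cout << "write " << (value & 0xffff) << " words starting at offset 0x" << std::hex
                  << offset << std::dec << '\n';
        break;
    case SubmissionMode::Mask:
        std::cout << "masked write, mask 0x" << std::hex << (value & 0xffff) << " at offset 0x"
                  << offset << std::dec << '\n';
        break;
    case SubmissionMode::Immediate:
        std::cout << "immediate write of 0x" << std::hex << (value & 0xfff) << " to offset 0x"
                  << offset << std::dec << '\n';
        break;
    default:
        std::cout << "unhandled submission mode\n";
        break;
    }
}

int main() {
    DecodeWord((4u << 28) | (0x700u << 16) | 0x123u); // Immediate: offset 0x700, data 0x123
    return 0;
}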
CDmaPusher(GPU& gpu); + ~CDmaPusher(); + + /// Push NVDEC command buffer entries into queue + void Push(ChCommandHeaderList&& entries); + + /// Process queued command buffer entries + void DispatchCalls(); + + /// Process one queue element + void Step(); + + /// Invoke command class devices to execute the command based on the current state + void ExecuteCommand(u32 offset, u32 data); + +private: + /// Write arguments value to the ThiRegisters member at the specified offset + void ThiStateWrite(ThiRegisters& state, u32 offset, const std::vector<u32>& arguments); + + GPU& gpu; + + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + std::unique_ptr<Tegra::Vic> vic_processor; + std::unique_ptr<Tegra::Host1x> host1x_processor; + std::unique_ptr<SyncptIncrManager> nvdec_sync; + std::unique_ptr<SyncptIncrManager> vic_sync; + ChClassId current_class{}; + ThiRegisters vic_thi_state{}; + ThiRegisters nvdec_thi_state{}; + + s32 count{}; + s32 offset{}; + s32 mask{}; + bool incrementing{}; + + // Queue of command lists to be processed + std::queue<ChCommandHeaderList> cdma_queue; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp new file mode 100644 index 000000000..1adf3cd13 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.cpp @@ -0,0 +1,115 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> +#include <fstream> +#include <vector> +#include "common/assert.h" +#include "video_core/command_classes/codecs/codec.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +extern "C" { +#include <libavutil/opt.h> +} + +namespace Tegra { + +Codec::Codec(GPU& gpu_) + : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)), + vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {} + +Codec::~Codec() { + if (!initialized) { + return; + } + // Free libav memory + avcodec_send_packet(av_codec_ctx, nullptr); + avcodec_receive_frame(av_codec_ctx, av_frame); + avcodec_flush_buffers(av_codec_ctx); + + av_frame_unref(av_frame); + av_free(av_frame); + avcodec_close(av_codec_ctx); +} + +void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { + LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", static_cast<u32>(codec)); + current_codec = codec; +} + +void Codec::StateWrite(u32 offset, u64 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64); + std::memcpy(state_offset, &arguments, sizeof(u64)); +} + +void Codec::Decode() { + bool is_first_frame = false; + + if (!initialized) { + if (current_codec == NvdecCommon::VideoCodec::H264) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9); + } else { + LOG_ERROR(Service_NVDRV, "Unknown video codec {}", static_cast<u32>(current_codec)); + return; + } + + av_codec_ctx = avcodec_alloc_context3(av_codec); + av_frame = av_frame_alloc(); + av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); + + // TODO(ameerj): libavcodec gpu hw acceleration + + const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); + if (av_error < 0) { + LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); + av_frame_unref(av_frame); + av_free(av_frame); + avcodec_close(av_codec_ctx); + return; + } 
+ initialized = true; + is_first_frame = true; + } + bool vp9_hidden_frame = false; + + AVPacket packet{}; + av_init_packet(&packet); + std::vector<u8> frame_data; + + if (current_codec == NvdecCommon::VideoCodec::H264) { + frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + frame_data = vp9_decoder->ComposeFrameHeader(state); + vp9_hidden_frame = vp9_decoder->WasFrameHidden(); + } + + packet.data = frame_data.data(); + packet.size = static_cast<int>(frame_data.size()); + + avcodec_send_packet(av_codec_ctx, &packet); + + if (!vp9_hidden_frame) { + // Only receive/store visible frames + avcodec_receive_frame(av_codec_ctx, av_frame); + } +} + +AVFrame* Codec::GetCurrentFrame() { + return av_frame; +} + +const AVFrame* Codec::GetCurrentFrame() const { + return av_frame; +} + +NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { + return current_codec; +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h new file mode 100644 index 000000000..5bbe6a332 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.h @@ -0,0 +1,66 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#include <libavcodec/avcodec.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +namespace Tegra { +class GPU; +struct VicRegisters; + +namespace Decoder { +class H264; +class VP9; +} // namespace Decoder + +class Codec { +public: + explicit Codec(GPU& gpu); + ~Codec(); + + /// Sets NVDEC video stream codec + void SetTargetCodec(NvdecCommon::VideoCodec codec); + + /// Populate NvdecRegisters state with argument value at the provided offset + void StateWrite(u32 offset, u64 arguments); + + /// Call decoders to construct headers, decode AVFrame with ffmpeg + void Decode(); + + /// Returns most recently decoded frame + [[nodiscard]] AVFrame* GetCurrentFrame(); + [[nodiscard]] const AVFrame* GetCurrentFrame() const; + + /// Returns the value of current_codec + [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const; + +private: + bool initialized{}; + NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; + + AVCodec* av_codec{nullptr}; + AVCodecContext* av_codec_ctx{nullptr}; + AVFrame* av_frame{nullptr}; + + GPU& gpu; + std::unique_ptr<Decoder::H264> h264_decoder; + std::unique_ptr<Decoder::VP9> vp9_decoder; + + NvdecCommon::NvdecRegisters state{}; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp new file mode 100644 index 000000000..33e063e20 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.cpp @@ -0,0 +1,293 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// 
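Codec::Decode wires the composed bitstream into libavcodec: find the decoder, allocate and open a context on first use, then feed packets with avcodec_send_packet and pull pictures with avcodec_receive_frame. A stripped-down sketch of that flow under the same FFmpeg API the change uses (av_init_packet is the call in the diff; deprecation details vary across FFmpeg versions, and the function name and error handling here are illustrative):

// Minimal libavcodec H.264 decode flow following the shape of Codec::Decode.
// Assumes bitstream holds a valid Annex-B access unit; illustrative only.
#include <cstdint>
#include <vector>

extern "C" {
#include <libavcodec/avcodec.h>
}

bool DecodeOneFrame(const std::vector<std::uint8_t>& bitstream) {
    const AVCodec* const codec = avcodec_find_decoder(AV_CODEC_ID_H264);
    if (!codec) {
        return false;
    }
    AVCodecContext* ctx = avcodec_alloc_context3(codec);
    if (!ctx || avcodec_open2(ctx, codec, nullptr) < 0) {
        return false;
    }
    AVFrame* frame = av_frame_alloc();

    AVPacket packet{};
    av_init_packet(&packet);
    packet.data = const_cast<std::uint8_t*>(bitstream.data());
    packet.size = static_cast<int>(bitstream.size());

    // receive_frame can report EAGAIN until the decoder has enough input,
    // which is why the cache keeps feeding packets frame by frame.
    const bool ok = avcodec_send_packet(ctx, &packet) == 0 &&
                    avcodec_receive_frame(ctx, frame) == 0;

    av_frame_free(&frame);
    avcodec_free_context(&ctx);
    return ok;
}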
furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <array> +#include "common/bit_util.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// ZigZag LUTs from libavcodec. +constexpr std::array<u8, 64> zig_zag_direct{ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, + 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, +}; + +constexpr std::array<u8, 16> zig_zag_scan{ + 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, + 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +}; +} // Anonymous namespace + +H264::H264(GPU& gpu_) : gpu(gpu_) {} + +H264::~H264() = default; + +const std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, + bool is_first_frame) { + H264DecoderContext context{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); + + const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff); + if (!is_first_frame && frame_number != 0) { + frame.resize(context.frame_data_size); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); + } else { + /// Encode header + H264BitWriter writer{}; + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(7, 5); + writer.WriteU(100, 8); + writer.WriteU(0, 8); + writer.WriteU(31, 8); + writer.WriteUe(0); + const auto chroma_format_idc = + static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3); + writer.WriteUe(chroma_format_idc); + if (chroma_format_idc == 3) { + writer.WriteBit(false); + } + + writer.WriteUe(0); + writer.WriteUe(0); + writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(false); // Scaling matrix present flag + + const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3); + writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf)); + writer.WriteUe(order_cnt_type); + if (order_cnt_type == 0) { + writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt); + } else if (order_cnt_type == 1) { + writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + + writer.WriteSe(0); + writer.WriteSe(0); + writer.WriteUe(0); + } + + const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units / + (context.h264_parameter_set.frame_mbs_only_flag ? 
1 : 2); + + writer.WriteUe(16); + writer.WriteBit(false); + writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(pic_height - 1); + writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + + if (!context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0); + } + + writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0); + writer.WriteBit(false); // Frame cropping flag + writer.WriteBit(false); // VUI parameter present flag + + writer.End(); + + // H264 PPS + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(8, 5); + + writer.WriteUe(0); + writer.WriteUe(0); + + writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(false); + writer.WriteUe(0); + writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0); + writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2); + s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f); + pic_init_qp = (pic_init_qp << 26) >> 26; + writer.WriteSe(pic_init_qp); + writer.WriteSe(0); + s32 chroma_qp_index_offset = + static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f); + chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset); + writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0); + writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0); + writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0); + writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + + writer.WriteBit(true); + + for (s32 index = 0; index < 6; index++) { + writer.WriteBit(true); + const auto matrix_x4 = + std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end()); + writer.WriteScalingList(matrix_x4, index * 16, 16); + } + + if (context.h264_parameter_set.transform_8x8_mode_flag) { + for (s32 index = 0; index < 2; index++) { + writer.WriteBit(true); + const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(), + context.scaling_matrix_8.end()); + + writer.WriteScalingList(matrix_x8, index * 64, 64); + } + } + + s32 chroma_qp_index_offset2 = + static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f); + chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset2); + + writer.End(); + + const auto& encoded_header = writer.GetByteArray(); + frame.resize(encoded_header.size() + context.frame_data_size); + std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, + frame.data() + encoded_header.size(), + context.frame_data_size); + } + + return frame; +} + +H264BitWriter::H264BitWriter() = default; + +H264BitWriter::~H264BitWriter() = default; + +void H264BitWriter::WriteU(s32 value, s32 value_sz) { + WriteBits(value, value_sz); +} + +void H264BitWriter::WriteSe(s32 value) { + WriteExpGolombCodedInt(value); +} + +void H264BitWriter::WriteUe(u32 value) { + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::End() { + WriteBit(true); + Flush(); +} + +void H264BitWriter::WriteBit(bool state) { + WriteBits(state ? 
1 : 0, 1); +} + +void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) { + std::vector<u8> scan(count); + if (count == 16) { + std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); + } else { + std::memcpy(scan.data(), zig_zag_direct.data(), scan.size()); + } + u8 last_scale = 8; + + for (s32 index = 0; index < count; index++) { + const u8 value = list[start + scan[index]]; + const s32 delta_scale = static_cast<s32>(value - last_scale); + + WriteSe(delta_scale); + + last_scale = value; + } +} + +std::vector<u8>& H264BitWriter::GetByteArray() { + return byte_array; +} + +const std::vector<u8>& H264BitWriter::GetByteArray() const { + return byte_array; +} + +void H264BitWriter::WriteBits(s32 value, s32 bit_count) { + s32 value_pos = 0; + + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free_bits = GetFreeBufferBits(); + + if (copy_size > free_bits) { + copy_size = free_bits; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void H264BitWriter::WriteExpGolombCodedInt(s32 value) { + const s32 sign = value <= 0 ? 0 : 1; + if (value < 0) { + value = -value; + } + value = (value << 1) - sign; + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::WriteExpGolombCodedUInt(u32 value) { + const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1)); + WriteBits(1, size); + + value -= (1U << (size - 1)) - 1; + WriteBits(static_cast<s32>(value), size - 1); +} + +s32 H264BitWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void H264BitWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast<u8>(buffer)); + + buffer = 0; + buffer_pos = 0; +} +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h new file mode 100644 index 000000000..273449495 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.h @@ -0,0 +1,118 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// + +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +class H264BitWriter { +public: + H264BitWriter(); + ~H264BitWriter(); + + /// The following Write methods are based on clause 9.1 in the H.264 specification. + /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax + void WriteU(s32 value, s32 value_sz); + void WriteSe(s32 value); + void WriteUe(u32 value); + + /// Finalize the bitstream + void End(); + + /// append a bit to the stream, equivalent value to the state parameter + void WriteBit(bool state); + + /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification + /// Writes the scaling matrices of the sream + void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count); + + /// Return the bitstream as a vector. + [[nodiscard]] std::vector<u8>& GetByteArray(); + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + void WriteBits(s32 value, s32 bit_count); + void WriteExpGolombCodedInt(s32 value); + void WriteExpGolombCodedUInt(u32 value); + [[nodiscard]] s32 GetFreeBufferBits(); + void Flush(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class H264 { +public: + explicit H264(GPU& gpu); + ~H264(); + + /// Compose the H264 header of the frame for FFmpeg decoding + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, + bool is_first_frame = false); + +private: + struct H264ParameterSet { + u32 log2_max_pic_order_cnt{}; + u32 delta_pic_order_always_zero_flag{}; + u32 frame_mbs_only_flag{}; + u32 pic_width_in_mbs{}; + u32 pic_height_in_map_units{}; + INSERT_PADDING_WORDS(1); + u32 entropy_coding_mode_flag{}; + u32 bottom_field_pic_order_flag{}; + u32 num_refidx_l0_default_active{}; + u32 num_refidx_l1_default_active{}; + u32 deblocking_filter_control_flag{}; + u32 redundant_pic_count_flag{}; + u32 transform_8x8_mode_flag{}; + INSERT_PADDING_WORDS(9); + u64 flags{}; + u32 frame_number{}; + u32 frame_number2{}; + }; + static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size"); + + struct H264DecoderContext { + INSERT_PADDING_BYTES(0x48); + u32 frame_data_size{}; + INSERT_PADDING_BYTES(0xc); + H264ParameterSet h264_parameter_set{}; + INSERT_PADDING_BYTES(0x100); + std::array<u8, 0x60> scaling_matrix_4; + std::array<u8, 0x80> scaling_matrix_8; + }; + static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size"); + + std::vector<u8> frame; + GPU& gpu; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp new file mode 100644 index 000000000..ab44fdc9e --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.cpp @@ -0,0 +1,1040 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
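+//
+// Composes the VP9 uncompressed and compressed frame headers from NVDEC-provided register
+// state so that the reconstructed bitstream can be handed to the FFmpeg decoder (see
+// Codec::Decode in codec.cpp).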
+ +#include <cstring> // for std::memcpy +#include <numeric> +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// Default compressed header probabilities once frame context resets +constexpr Vp9EntropyProbs default_probs{ + .y_mode_prob{ + 65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78, + 173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29, + }, + .partition_prob{ + 199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0, + 174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0, + 177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0, + 222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0, + }, + .coef_probs{ + 195, 29, 183, 0, 84, 49, 136, 0, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 31, 107, 169, 0, 35, 99, 159, 0, 17, 82, 140, 0, 8, 66, 114, 0, + 2, 44, 76, 0, 1, 19, 32, 0, 40, 132, 201, 0, 29, 114, 187, 0, 13, 91, 157, 0, + 7, 75, 127, 0, 3, 58, 95, 0, 1, 28, 47, 0, 69, 142, 221, 0, 42, 122, 201, 0, + 15, 91, 159, 0, 6, 67, 121, 0, 1, 42, 77, 0, 1, 17, 31, 0, 102, 148, 228, 0, + 67, 117, 204, 0, 17, 82, 154, 0, 6, 59, 114, 0, 2, 39, 75, 0, 1, 15, 29, 0, + 156, 57, 233, 0, 119, 57, 212, 0, 58, 48, 163, 0, 29, 40, 124, 0, 12, 30, 81, 0, + 3, 12, 31, 0, 191, 107, 226, 0, 124, 117, 204, 0, 25, 99, 155, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 29, 148, 210, 0, 37, 126, 194, 0, 8, 93, 157, 0, + 2, 68, 118, 0, 1, 39, 69, 0, 1, 17, 33, 0, 41, 151, 213, 0, 27, 123, 193, 0, + 3, 82, 144, 0, 1, 58, 105, 0, 1, 32, 60, 0, 1, 13, 26, 0, 59, 159, 220, 0, + 23, 126, 198, 0, 4, 88, 151, 0, 1, 66, 114, 0, 1, 38, 71, 0, 1, 18, 34, 0, + 114, 136, 232, 0, 51, 114, 207, 0, 11, 83, 155, 0, 3, 56, 105, 0, 1, 33, 65, 0, + 1, 17, 34, 0, 149, 65, 234, 0, 121, 57, 215, 0, 61, 49, 166, 0, 28, 36, 114, 0, + 12, 25, 76, 0, 3, 16, 42, 0, 214, 49, 220, 0, 132, 63, 188, 0, 42, 65, 137, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 137, 221, 0, 104, 131, 216, 0, + 49, 111, 192, 0, 21, 87, 155, 0, 2, 49, 87, 0, 1, 16, 28, 0, 89, 163, 230, 0, + 90, 137, 220, 0, 29, 100, 183, 0, 10, 70, 135, 0, 2, 42, 81, 0, 1, 17, 33, 0, + 108, 167, 237, 0, 55, 133, 222, 0, 15, 97, 179, 0, 4, 72, 135, 0, 1, 45, 85, 0, + 1, 19, 38, 0, 124, 146, 240, 0, 66, 124, 224, 0, 17, 88, 175, 0, 4, 58, 122, 0, + 1, 36, 75, 0, 1, 18, 37, 0, 141, 79, 241, 0, 126, 70, 227, 0, 66, 58, 182, 0, + 30, 44, 136, 0, 12, 34, 96, 0, 2, 20, 47, 0, 229, 99, 249, 0, 143, 111, 235, 0, + 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 82, 158, 236, 0, + 94, 146, 224, 0, 25, 117, 191, 0, 9, 87, 149, 0, 3, 56, 99, 0, 1, 33, 57, 0, + 83, 167, 237, 0, 68, 145, 222, 0, 10, 103, 177, 0, 2, 72, 131, 0, 1, 41, 79, 0, + 1, 20, 39, 0, 99, 167, 239, 0, 47, 141, 224, 0, 10, 104, 178, 0, 2, 73, 133, 0, + 1, 44, 85, 0, 1, 22, 47, 0, 127, 145, 243, 0, 71, 129, 228, 0, 17, 93, 177, 0, + 3, 61, 124, 0, 1, 41, 84, 0, 1, 21, 52, 0, 157, 78, 244, 0, 140, 72, 231, 0, + 69, 58, 184, 0, 31, 44, 137, 0, 14, 38, 105, 0, 8, 23, 61, 0, 125, 34, 187, 0, + 52, 41, 133, 0, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 109, 153, 0, 51, 102, 147, 0, 23, 87, 128, 0, 8, 67, 101, 0, 1, 41, 63, 0, + 1, 19, 29, 0, 31, 154, 185, 0, 17, 127, 175, 0, 6, 96, 145, 0, 2, 73, 114, 0, + 1, 51, 82, 0, 1, 28, 45, 0, 23, 163, 200, 0, 10, 131, 185, 0, 2, 93, 148, 0, + 1, 67, 111, 0, 1, 41, 69, 0, 1, 14, 24, 0, 29, 176, 217, 0, 12, 145, 201, 0, + 3, 101, 156, 0, 1, 69, 111, 0, 1, 39, 63, 0, 
1, 14, 23, 0, 57, 192, 233, 0, + 25, 154, 215, 0, 6, 109, 167, 0, 3, 78, 118, 0, 1, 48, 69, 0, 1, 21, 29, 0, + 202, 105, 245, 0, 108, 106, 216, 0, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 33, 172, 219, 0, 64, 149, 206, 0, 14, 117, 177, 0, 5, 90, 141, 0, + 2, 61, 95, 0, 1, 37, 57, 0, 33, 179, 220, 0, 11, 140, 198, 0, 1, 89, 148, 0, + 1, 60, 104, 0, 1, 33, 57, 0, 1, 12, 21, 0, 30, 181, 221, 0, 8, 141, 198, 0, + 1, 87, 145, 0, 1, 58, 100, 0, 1, 31, 55, 0, 1, 12, 20, 0, 32, 186, 224, 0, + 7, 142, 198, 0, 1, 86, 143, 0, 1, 58, 100, 0, 1, 31, 55, 0, 1, 12, 22, 0, + 57, 192, 227, 0, 20, 143, 204, 0, 3, 96, 154, 0, 1, 68, 112, 0, 1, 42, 69, 0, + 1, 19, 32, 0, 212, 35, 215, 0, 113, 47, 169, 0, 29, 48, 105, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 74, 129, 203, 0, 106, 120, 203, 0, 49, 107, 178, 0, + 19, 84, 144, 0, 4, 50, 84, 0, 1, 15, 25, 0, 71, 172, 217, 0, 44, 141, 209, 0, + 15, 102, 173, 0, 6, 76, 133, 0, 2, 51, 89, 0, 1, 24, 42, 0, 64, 185, 231, 0, + 31, 148, 216, 0, 8, 103, 175, 0, 3, 74, 131, 0, 1, 46, 81, 0, 1, 18, 30, 0, + 65, 196, 235, 0, 25, 157, 221, 0, 5, 105, 174, 0, 1, 67, 120, 0, 1, 38, 69, 0, + 1, 15, 30, 0, 65, 204, 238, 0, 30, 156, 224, 0, 7, 107, 177, 0, 2, 70, 124, 0, + 1, 42, 73, 0, 1, 18, 34, 0, 225, 86, 251, 0, 144, 104, 235, 0, 42, 99, 181, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 175, 239, 0, 112, 165, 229, 0, + 29, 136, 200, 0, 12, 103, 162, 0, 6, 77, 123, 0, 2, 53, 84, 0, 75, 183, 239, 0, + 30, 155, 221, 0, 3, 106, 171, 0, 1, 74, 128, 0, 1, 44, 76, 0, 1, 17, 28, 0, + 73, 185, 240, 0, 27, 159, 222, 0, 2, 107, 172, 0, 1, 75, 127, 0, 1, 42, 73, 0, + 1, 17, 29, 0, 62, 190, 238, 0, 21, 159, 222, 0, 2, 107, 172, 0, 1, 72, 122, 0, + 1, 40, 71, 0, 1, 18, 32, 0, 61, 199, 240, 0, 27, 161, 226, 0, 4, 113, 180, 0, + 1, 76, 129, 0, 1, 46, 80, 0, 1, 23, 41, 0, 7, 27, 153, 0, 5, 30, 95, 0, + 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 75, 127, 0, + 57, 75, 124, 0, 27, 67, 108, 0, 10, 54, 86, 0, 1, 33, 52, 0, 1, 12, 18, 0, + 43, 125, 151, 0, 26, 108, 148, 0, 7, 83, 122, 0, 2, 59, 89, 0, 1, 38, 60, 0, + 1, 17, 27, 0, 23, 144, 163, 0, 13, 112, 154, 0, 2, 75, 117, 0, 1, 50, 81, 0, + 1, 31, 51, 0, 1, 14, 23, 0, 18, 162, 185, 0, 6, 123, 171, 0, 1, 78, 125, 0, + 1, 51, 86, 0, 1, 31, 54, 0, 1, 14, 23, 0, 15, 199, 227, 0, 3, 150, 204, 0, + 1, 91, 146, 0, 1, 55, 95, 0, 1, 30, 53, 0, 1, 11, 20, 0, 19, 55, 240, 0, + 19, 59, 196, 0, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 41, 166, 207, 0, 104, 153, 199, 0, 31, 123, 181, 0, 14, 101, 152, 0, 5, 72, 106, 0, + 1, 36, 52, 0, 35, 176, 211, 0, 12, 131, 190, 0, 2, 88, 144, 0, 1, 60, 101, 0, + 1, 36, 60, 0, 1, 16, 28, 0, 28, 183, 213, 0, 8, 134, 191, 0, 1, 86, 142, 0, + 1, 56, 96, 0, 1, 30, 53, 0, 1, 12, 20, 0, 20, 190, 215, 0, 4, 135, 192, 0, + 1, 84, 139, 0, 1, 53, 91, 0, 1, 28, 49, 0, 1, 11, 20, 0, 13, 196, 216, 0, + 2, 137, 192, 0, 1, 86, 143, 0, 1, 57, 99, 0, 1, 32, 56, 0, 1, 13, 24, 0, + 211, 29, 217, 0, 96, 47, 156, 0, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 78, 120, 193, 0, 111, 116, 186, 0, 46, 102, 164, 0, 15, 80, 128, 0, + 2, 49, 76, 0, 1, 18, 28, 0, 71, 161, 203, 0, 42, 132, 192, 0, 10, 98, 150, 0, + 3, 69, 109, 0, 1, 44, 70, 0, 1, 18, 29, 0, 57, 186, 211, 0, 30, 140, 196, 0, + 4, 93, 146, 0, 1, 62, 102, 0, 1, 38, 65, 0, 1, 16, 27, 0, 47, 199, 217, 0, + 14, 145, 196, 0, 1, 88, 142, 0, 1, 57, 98, 0, 1, 36, 62, 0, 1, 15, 26, 0, + 26, 219, 229, 0, 5, 155, 207, 0, 1, 94, 151, 0, 1, 60, 104, 0, 1, 36, 62, 0, + 1, 16, 28, 0, 233, 29, 248, 0, 146, 47, 220, 0, 43, 52, 140, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, 100, 163, 232, 0, 179, 161, 222, 0, 63, 142, 204, 0, + 37, 113, 174, 0, 26, 89, 137, 0, 18, 68, 97, 0, 85, 181, 230, 0, 32, 146, 209, 0, + 7, 100, 164, 0, 3, 71, 121, 0, 1, 45, 77, 0, 1, 18, 30, 0, 65, 187, 230, 0, + 20, 148, 207, 0, 2, 97, 159, 0, 1, 68, 116, 0, 1, 40, 70, 0, 1, 14, 29, 0, + 40, 194, 227, 0, 8, 147, 204, 0, 1, 94, 155, 0, 1, 65, 112, 0, 1, 39, 66, 0, + 1, 14, 26, 0, 16, 208, 228, 0, 3, 151, 207, 0, 1, 98, 160, 0, 1, 67, 117, 0, + 1, 41, 74, 0, 1, 17, 31, 0, 17, 38, 140, 0, 7, 34, 80, 0, 1, 17, 29, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 75, 128, 0, 41, 76, 128, 0, + 26, 66, 116, 0, 12, 52, 94, 0, 2, 32, 55, 0, 1, 10, 16, 0, 50, 127, 154, 0, + 37, 109, 152, 0, 16, 82, 121, 0, 5, 59, 85, 0, 1, 35, 54, 0, 1, 13, 20, 0, + 40, 142, 167, 0, 17, 110, 157, 0, 2, 71, 112, 0, 1, 44, 72, 0, 1, 27, 45, 0, + 1, 11, 17, 0, 30, 175, 188, 0, 9, 124, 169, 0, 1, 74, 116, 0, 1, 48, 78, 0, + 1, 30, 49, 0, 1, 11, 18, 0, 10, 222, 223, 0, 2, 150, 194, 0, 1, 83, 128, 0, + 1, 48, 79, 0, 1, 27, 45, 0, 1, 11, 17, 0, 36, 41, 235, 0, 29, 36, 193, 0, + 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 165, 222, 0, + 177, 162, 215, 0, 110, 135, 195, 0, 57, 113, 168, 0, 23, 83, 120, 0, 10, 49, 61, 0, + 85, 190, 223, 0, 36, 139, 200, 0, 5, 90, 146, 0, 1, 60, 103, 0, 1, 38, 65, 0, + 1, 18, 30, 0, 72, 202, 223, 0, 23, 141, 199, 0, 2, 86, 140, 0, 1, 56, 97, 0, + 1, 36, 61, 0, 1, 16, 27, 0, 55, 218, 225, 0, 13, 145, 200, 0, 1, 86, 141, 0, + 1, 57, 99, 0, 1, 35, 61, 0, 1, 13, 22, 0, 15, 235, 212, 0, 1, 132, 184, 0, + 1, 84, 139, 0, 1, 57, 97, 0, 1, 34, 56, 0, 1, 14, 23, 0, 181, 21, 201, 0, + 61, 37, 123, 0, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 47, 106, 172, 0, 95, 104, 173, 0, 42, 93, 159, 0, 18, 77, 131, 0, 4, 50, 81, 0, + 1, 17, 23, 0, 62, 147, 199, 0, 44, 130, 189, 0, 28, 102, 154, 0, 18, 75, 115, 0, + 2, 44, 65, 0, 1, 12, 19, 0, 55, 153, 210, 0, 24, 130, 194, 0, 3, 93, 146, 0, + 1, 61, 97, 0, 1, 31, 50, 0, 1, 10, 16, 0, 49, 186, 223, 0, 17, 148, 204, 0, + 1, 96, 142, 0, 1, 53, 83, 0, 1, 26, 44, 0, 1, 11, 17, 0, 13, 217, 212, 0, + 2, 136, 180, 0, 1, 78, 124, 0, 1, 50, 83, 0, 1, 29, 49, 0, 1, 14, 23, 0, + 197, 13, 247, 0, 82, 17, 222, 0, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 126, 186, 247, 0, 234, 191, 243, 0, 176, 177, 234, 0, 104, 158, 220, 0, + 66, 128, 186, 0, 55, 90, 137, 0, 111, 197, 242, 0, 46, 158, 219, 0, 9, 104, 171, 0, + 2, 65, 125, 0, 1, 44, 80, 0, 1, 17, 91, 0, 104, 208, 245, 0, 39, 168, 224, 0, + 3, 109, 162, 0, 1, 79, 124, 0, 1, 50, 102, 0, 1, 43, 102, 0, 84, 220, 246, 0, + 31, 177, 231, 0, 2, 115, 180, 0, 1, 79, 134, 0, 1, 55, 77, 0, 1, 60, 79, 0, + 43, 243, 240, 0, 8, 180, 217, 0, 1, 115, 166, 0, 1, 84, 121, 0, 1, 51, 67, 0, + 1, 16, 6, 0, + }, + .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144}, + .inter_mode_prob{ + 2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94, + 66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0, + }, + .intra_inter_prob{9, 102, 187, 225}, + .comp_inter_prob{9, 102, 187, 225, 0}, + .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247}, + .comp_ref_prob{50, 126, 123, 221, 226}, + .tx_32x32_prob{3, 136, 37, 5, 52, 13}, + .tx_16x16_prob{20, 152, 15, 101}, + .tx_8x8_prob{100, 66}, + .skip_probs{192, 128, 64}, + .joints{32, 64, 96}, + .sign{128, 128}, + .classes{ + 224, 144, 192, 168, 192, 176, 192, 198, 198, 245, + 216, 128, 176, 160, 176, 176, 192, 198, 198, 208, + }, + .class_0{216, 208}, + .prob_bits{ + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + 136, 140, 148, 160, 176, 192, 224, 234, 
234, 240, + }, + .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64}, + .fr{64, 96, 64, 64, 96, 64}, + .class_0_hp{160, 160}, + .high_precision{128, 128}, +}; + +constexpr std::array<s32, 256> norm_lut{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +constexpr std::array<s32, 254> map_lut{ + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177, + 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 19, +}; + +// 6.2.14 Tile size calculation + +[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 min_log2 = 0; + + while ((64 << min_log2) < sb64_cols) { + min_log2++; + } + + return min_log2; +} + +[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 max_log2 = 1; + + while ((sb64_cols >> max_log2) >= 4) { + max_log2++; + } + + return max_log2 - 1; +} + +// Recenters probability. Based on section 6.3.6 of VP9 Specification +[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) { + if (new_prob > old_prob * 2) { + return new_prob; + } + + if (new_prob >= old_prob) { + return (new_prob - old_prob) * 2; + } + + return (old_prob - new_prob) * 2 - 1; +} + +// Adjusts old_prob depending on new_prob. 
Based on section 6.3.5 of VP9 Specification +[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) { + new_prob--; + old_prob--; + + std::size_t index{}; + + if (old_prob * 2 <= 0xff) { + index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); + } else { + index = static_cast<std::size_t>( + std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); + } + + return map_lut[index]; +} +} // Anonymous namespace + +VP9::VP9(GPU& gpu) : gpu(gpu) {} + +VP9::~VP9() = default; + +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + + writer.Write(update, diff_update_probability); + + if (update) { + WriteProbabilityDelta(writer, new_prob, old_prob); + } +} +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); ++offset) { + WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]); + } +} + +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) { + WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]); + WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]); + WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]); + } +} + +void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const int delta = RemapProbability(new_prob, old_prob); + + EncodeTermSubExp(writer, delta); +} + +void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { + if (WriteLessThan(writer, value, 16)) { + writer.Write(value, 4); + } else if (WriteLessThan(writer, value, 32)) { + writer.Write(value - 16, 4); + } else if (WriteLessThan(writer, value, 64)) { + writer.Write(value - 32, 5); + } else { + value -= 64; + + constexpr s32 size = 8; + + const s32 mask = (1 << size) - 191; + + const s32 delta = value - mask; + + if (delta < 0) { + writer.Write(value, size - 1); + } else { + writer.Write(delta / 2 + mask, size - 1); + writer.Write(delta & 1, 1); + } + } +} + +bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) { + const bool is_lt = value < test; + writer.Write(!is_lt); + return is_lt; +} + +void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array<u8, 2304>& new_prob, + const std::array<u8, 2304>& old_prob) { + // Note: There's 1 byte added on each packet for alignment, + // this byte is ignored when doing updates. 
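+    // Layout: coef_probs holds four tx-size blocks of 2 (plane types) * 2 (inter/intra) *
+    // 6 (coefficient bands) * 6 (contexts) entries, each entry being 3 probabilities plus the
+    // alignment byte, i.e. 2 * 2 * 6 * 6 * 4 = 576 bytes per block and 2304 bytes in total.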
+ constexpr s32 block_bytes = 2 * 2 * 6 * 6 * 4; + + const auto needs_update = [&](s32 base_index) -> bool { + s32 index = base_index; + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 6; k++) { + for (s32 l = 0; l < 6; l++) { + if (new_prob[index + 0] != old_prob[index + 0] || + new_prob[index + 1] != old_prob[index + 1] || + new_prob[index + 2] != old_prob[index + 2]) { + return true; + } + + index += 4; + } + } + } + } + return false; + }; + + for (s32 block_index = 0; block_index < 4; block_index++) { + const s32 base_index = block_index * block_bytes; + const bool update = needs_update(base_index); + writer.Write(update); + + if (update) { + s32 index = base_index; + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 6; k++) { + for (s32 l = 0; l < 6; l++) { + if (k != 0 || l < 3) { + WriteProbabilityUpdate(writer, new_prob[index + 0], + old_prob[index + 0]); + WriteProbabilityUpdate(writer, new_prob[index + 1], + old_prob[index + 1]); + WriteProbabilityUpdate(writer, new_prob[index + 2], + old_prob[index + 2]); + } + index += 4; + } + } + } + } + } + + if (block_index == tx_mode) { + break; + } + } +} + +void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + writer.Write(update, diff_update_probability); + + if (update) { + writer.Write(new_prob >> 1, 7); + } +} + +Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) { + PictureInfo picture_info{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = picture_info.Convert(); + + InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + + // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following + // order: last, golden, altref, current. It may be worthwhile to track the updates done here + // to avoid buffering frame data needed for reference frame updating in the header composition. 
+ std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64)); + + return vp9_info; +} + +void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { + EntropyProbs entropy{}; + gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + entropy.Convert(dst); +} + +Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) { + Vp9FrameContainer frame{}; + { + gpu.SyncGuestHost(); + frame.info = GetVp9PictureInfo(state); + + frame.bit_stream.resize(frame.info.bitstream_size); + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.bit_stream.data(), + frame.info.bitstream_size); + } + // Buffer two frames, saving the last show frame info + if (!next_next_frame.bit_stream.empty()) { + Vp9FrameContainer temp{ + .info = frame.info, + .bit_stream = frame.bit_stream, + }; + next_next_frame.info.show_frame = frame.info.last_frame_shown; + frame.info = next_next_frame.info; + frame.bit_stream = next_next_frame.bit_stream; + next_next_frame = std::move(temp); + + if (!next_frame.bit_stream.empty()) { + Vp9FrameContainer temp2{ + .info = frame.info, + .bit_stream = frame.bit_stream, + }; + next_frame.info.show_frame = frame.info.last_frame_shown; + frame.info = next_frame.info; + frame.bit_stream = next_frame.bit_stream; + next_frame = std::move(temp2); + } else { + next_frame.info = frame.info; + next_frame.bit_stream = frame.bit_stream; + } + } else { + next_next_frame.info = frame.info; + next_next_frame.bit_stream = frame.bit_stream; + } + return frame; +} + +std::vector<u8> VP9::ComposeCompressedHeader() { + VpxRangeEncoder writer{}; + + if (!current_frame_info.lossless) { + if (static_cast<u32>(current_frame_info.transform_mode) >= 3) { + writer.Write(3, 2); + writer.Write(current_frame_info.transform_mode == 4); + } else { + writer.Write(current_frame_info.transform_mode, 2); + } + } + + if (current_frame_info.transform_mode == 4) { + // tx_mode_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob, + prev_frame_probs.tx_8x8_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob, + prev_frame_probs.tx_16x16_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob, + prev_frame_probs.tx_32x32_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob; + prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob; + prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob; + } + } + // read_coef_probs() in the spec + WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode, + current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs); + // read_skip_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs, + prev_frame_probs.skip_probs); + + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs; + prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs; + } + + if (!current_frame_info.intra_only) { + // read_inter_probs() in the spec + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob, + prev_frame_probs.inter_mode_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob; + } + + if (current_frame_info.interp_filter 
== 4) { + // read_interp_filter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob, + prev_frame_probs.switchable_interp_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.switchable_interp_prob = + current_frame_info.entropy.switchable_interp_prob; + } + } + + // read_is_inter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob, + prev_frame_probs.intra_inter_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob; + } + // frame_reference_mode() in the spec + if ((current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[2] & 1) || + (current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[3] & 1)) { + if (current_frame_info.reference_mode >= 1) { + writer.Write(1, 1); + writer.Write(current_frame_info.reference_mode == 2); + } else { + writer.Write(0, 1); + } + } + + // frame_reference_mode_probs() in the spec + if (current_frame_info.reference_mode == 2) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob, + prev_frame_probs.comp_inter_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob; + } + } + + if (current_frame_info.reference_mode != 1) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob, + prev_frame_probs.single_ref_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob; + } + } + + if (current_frame_info.reference_mode != 0) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob, + prev_frame_probs.comp_ref_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob; + } + } + + // read_y_mode_probs + for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size(); + ++index) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index], + prev_frame_probs.y_mode_prob[index]); + } + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob; + } + // read_partition_probs + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob, + prev_frame_probs.partition_prob); + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob; + } + + // mv_probs + for (s32 i = 0; i < 3; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i], + prev_frame_probs.joints[i]); + } + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.joints = current_frame_info.entropy.joints; + } + + for (s32 i = 0; i < 2; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i], + prev_frame_probs.sign[i]); + + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index], + prev_frame_probs.classes[index]); + } + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i], + prev_frame_probs.class_0[i]); 
+ + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index], + prev_frame_probs.prob_bits[index]); + } + } + + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 3; k++) { + const int index = i * 2 * 3 + j * 3 + k; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index], + prev_frame_probs.class_0_fr[index]); + } + } + + for (s32 j = 0; j < 3; j++) { + const int index = i * 3 + j; + + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index], + prev_frame_probs.fr[index]); + } + } + + if (current_frame_info.allow_high_precision_mv) { + for (s32 index = 0; index < 2; index++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index], + prev_frame_probs.class_0_hp[index]); + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index], + prev_frame_probs.high_precision[index]); + } + } + + // save previous probs + if (current_frame_info.show_frame && !current_frame_info.is_key_frame) { + prev_frame_probs.sign = current_frame_info.entropy.sign; + prev_frame_probs.classes = current_frame_info.entropy.classes; + prev_frame_probs.class_0 = current_frame_info.entropy.class_0; + prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits; + prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr; + prev_frame_probs.fr = current_frame_info.entropy.fr; + prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp; + prev_frame_probs.high_precision = current_frame_info.entropy.high_precision; + } + } + + writer.End(); + return writer.GetBuffer(); +} + +VpxBitStreamWriter VP9::ComposeUncompressedHeader() { + VpxBitStreamWriter uncomp_writer{}; + + uncomp_writer.WriteU(2, 2); // Frame marker. + uncomp_writer.WriteU(0, 2); // Profile. + uncomp_writer.WriteBit(false); // Show existing frame. + uncomp_writer.WriteBit(!current_frame_info.is_key_frame); // is key frame? + uncomp_writer.WriteBit(current_frame_info.show_frame); // show frame? + uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // error reslience + + if (current_frame_info.is_key_frame) { + uncomp_writer.WriteU(frame_sync_code, 24); + uncomp_writer.WriteU(0, 3); // Color space. + uncomp_writer.WriteU(0, 1); // Color range. + uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); + uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); + uncomp_writer.WriteBit(false); // Render and frame size different. + + // Reset context + prev_frame_probs = default_probs; + swap_next_golden = false; + loop_filter_ref_deltas.fill(0); + loop_filter_mode_deltas.fill(0); + + // allow frames offsets to stabilize before checking for golden frames + grace_period = 4; + + // On key frames, all frame slots are set to the current frame, + // so the value of the selected slot doesn't really matter. + frame_ctxs.fill({current_frame_number, false, default_probs}); + + // intra only, meaning the frame can be recreated with no other references + current_frame_info.intra_only = true; + + } else { + + if (!current_frame_info.show_frame) { + uncomp_writer.WriteBit(current_frame_info.intra_only); + if (!current_frame_info.last_frame_was_key) { + swap_next_golden = !swap_next_golden; + } + } else { + current_frame_info.intra_only = false; + } + if (!current_frame_info.error_resilient_mode) { + uncomp_writer.WriteU(0, 2); // Reset frame context. 
+ } + + // Last, Golden, Altref frames + std::array<s32, 3> ref_frame_index{0, 1, 2}; + + // Set when next frame is hidden + // altref and golden references are swapped + if (swap_next_golden) { + ref_frame_index = std::array<s32, 3>{0, 2, 1}; + } + + // update Last Frame + u64 refresh_frame_flags = 1; + + // golden frame may refresh, determined if the next golden frame offset is changed + bool golden_refresh = false; + if (grace_period <= 0) { + for (s32 index = 1; index < 3; ++index) { + if (current_frame_info.frame_offsets[index] != + next_frame.info.frame_offsets[index]) { + current_frame_info.refresh_frame[index] = true; + golden_refresh = true; + grace_period = 3; + } + } + } + + if (current_frame_info.show_frame && + (!next_frame.info.show_frame || next_frame.info.is_key_frame)) { + // Update golden frame + refresh_frame_flags = swap_next_golden ? 2 : 4; + } + + if (!current_frame_info.show_frame) { + // Update altref + refresh_frame_flags = swap_next_golden ? 2 : 4; + } else if (golden_refresh) { + refresh_frame_flags = 3; + } + + if (current_frame_info.intra_only) { + uncomp_writer.WriteU(frame_sync_code, 24); + uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); + uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); + uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); + uncomp_writer.WriteBit(false); // Render and frame size different. + } else { + uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8); + + for (s32 index = 1; index < 4; index++) { + uncomp_writer.WriteU(ref_frame_index[index - 1], 3); + uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1); + } + + uncomp_writer.WriteBit(true); // Frame size with refs. + uncomp_writer.WriteBit(false); // Render and frame size different. + uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv); + uncomp_writer.WriteBit(current_frame_info.interp_filter == 4); + + if (current_frame_info.interp_filter != 4) { + uncomp_writer.WriteU(current_frame_info.interp_filter, 2); + } + } + } + + if (!current_frame_info.error_resilient_mode) { + uncomp_writer.WriteBit(true); // Refresh frame context. where do i get this info from? + uncomp_writer.WriteBit(true); // Frame parallel decoding mode. + } + + int frame_ctx_idx = 0; + if (!current_frame_info.show_frame) { + frame_ctx_idx = 1; + } + + uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index. 
+ prev_frame_probs = + frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header + frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy}; + + uncomp_writer.WriteU(current_frame_info.first_level, 6); + uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); + uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled); + + if (current_frame_info.mode_ref_delta_enabled) { + // check if ref deltas are different, update accordingly + std::array<bool, 4> update_loop_filter_ref_deltas; + std::array<bool, 2> update_loop_filter_mode_deltas; + + bool loop_filter_delta_update = false; + + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + const s8 old_deltas = loop_filter_ref_deltas[index]; + const s8 new_deltas = current_frame_info.ref_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_ref_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + const s8 old_deltas = loop_filter_mode_deltas[index]; + const s8 new_deltas = current_frame_info.mode_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_mode_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + uncomp_writer.WriteBit(loop_filter_delta_update); + + if (loop_filter_delta_update) { + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]); + + if (update_loop_filter_ref_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6); + } + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]); + + if (update_loop_filter_mode_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6); + } + } + // save new deltas + loop_filter_ref_deltas = current_frame_info.ref_deltas; + loop_filter_mode_deltas = current_frame_info.mode_deltas; + } + } + + uncomp_writer.WriteU(current_frame_info.base_q_index, 8); + + uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); + + uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). + + const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); + const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); + + const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2; + const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1; + + // If it's less than the maximum, we need to add an extra 0 on the bitstream + // to indicate that it should stop reading. 
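+    // For example, with min_tile_cols_log2 = 0 and log2_tile_cols = 2, tile_cols_log2_diff is 2
+    // and tile_cols_log2_inc_mask is 0b11, so the branch below emits the bits 1 1 0: two
+    // increment bits followed by the extra 0 stop bit.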
+ if (current_frame_info.log2_tile_cols < max_tile_cols_log2) { + uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1); + } else { + uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff); + } + + const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0; + + uncomp_writer.WriteBit(tile_rows_log2_is_nonzero); + + if (tile_rows_log2_is_nonzero) { + uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1); + } + + return uncomp_writer; +} + +const std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) { + std::vector<u8> bitstream; + { + Vp9FrameContainer curr_frame = GetCurrentFrame(state); + current_frame_info = curr_frame.info; + bitstream = std::move(curr_frame.bit_stream); + } + + // The uncompressed header routine sets PrevProb parameters needed for the compressed header + auto uncomp_writer = ComposeUncompressedHeader(); + std::vector<u8> compressed_header = ComposeCompressedHeader(); + + uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16); + uncomp_writer.Flush(); + std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray(); + + // Write headers and frame to buffer + frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(), + compressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(), + bitstream.data(), bitstream.size()); + + // keep track of frame number + current_frame_number++; + grace_period--; + + // don't display hidden frames + hidden = !current_frame_info.show_frame; + return frame; +} + +VpxRangeEncoder::VpxRangeEncoder() { + Write(false); +} + +VpxRangeEncoder::~VpxRangeEncoder() = default; + +void VpxRangeEncoder::Write(s32 value, s32 value_size) { + for (s32 bit = value_size - 1; bit >= 0; bit--) { + Write(((value >> bit) & 1) != 0); + } +} + +void VpxRangeEncoder::Write(bool bit) { + Write(bit, half_probability); +} + +void VpxRangeEncoder::Write(bool bit, s32 probability) { + u32 local_range = range; + const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8); + local_range = split; + + if (bit) { + low_value += split; + local_range = range - split; + } + + s32 shift = norm_lut[local_range]; + local_range <<= shift; + count += shift; + + if (count >= 0) { + const s32 offset = shift - count; + + if (((low_value << (offset - 1)) >> 31) != 0) { + const s32 current_pos = static_cast<s32>(base_stream.GetPosition()); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + while (PeekByte() == 0xff) { + base_stream.WriteByte(0); + + base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos); + } + base_stream.WriteByte(static_cast<u8>((PeekByte() + 1))); + base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); + } + base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset)))); + + low_value <<= offset; + shift = count; + low_value &= 0xffffff; + count -= 8; + } + + low_value <<= shift; + range = local_range; +} + +void VpxRangeEncoder::End() { + for (std::size_t index = 0; index < 32; ++index) { + Write(false); + } +} + +u8 VpxRangeEncoder::PeekByte() { + const u8 value = base_stream.ReadByte(); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + + return value; +} + +VpxBitStreamWriter::VpxBitStreamWriter() = default; + +VpxBitStreamWriter::~VpxBitStreamWriter() = default; + 
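+// The fixed-width writers below emit the low value_size bits of value, most significant bit
+// first; WriteS appends the sign as a trailing bit, so e.g. WriteS(-3, 6) writes the 6-bit
+// magnitude 000011 followed by a 1 sign bit (7 bits in total).
+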
+void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) { + WriteBits(value, value_size); +} + +void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) { + const bool sign = value < 0; + if (sign) { + value = -value; + } + + WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1); +} + +void VpxBitStreamWriter::WriteDeltaQ(u32 value) { + const bool delta_coded = value != 0; + WriteBit(delta_coded); + + if (delta_coded) { + WriteBits(value, 4); + } +} + +void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) { + s32 value_pos = 0; + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free = GetFreeBufferBits(); + + if (copy_size > free) { + copy_size = free; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void VpxBitStreamWriter::WriteBit(bool state) { + WriteBits(state ? 1 : 0, 1); +} + +s32 VpxBitStreamWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void VpxBitStreamWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast<u8>(buffer)); + buffer = 0; + buffer_pos = 0; +} + +std::vector<u8>& VpxBitStreamWriter::GetByteArray() { + return byte_array; +} + +const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const { + return byte_array; +} + +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h new file mode 100644 index 000000000..e2504512c --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.h @@ -0,0 +1,196 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <vector> + +#include "common/common_types.h" +#include "common/stream.h" +#include "video_core/command_classes/codecs/vp9_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +enum class FrameType { KeyFrame = 0, InterFrame = 1 }; +namespace Decoder { + +/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the +/// VP9 header bitstreams. 
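+/// Typical flow (see VP9::ComposeFrameHeader): the uncompressed header is written with
+/// VpxBitStreamWriter, the compressed header with VpxRangeEncoder, and both are prepended
+/// to the raw frame bitstream before decoding.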
+ +class VpxRangeEncoder { +public: + VpxRangeEncoder(); + ~VpxRangeEncoder(); + + VpxRangeEncoder(const VpxRangeEncoder&) = delete; + VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete; + + VpxRangeEncoder(VpxRangeEncoder&&) = default; + VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default; + + /// Writes the rightmost value_size bits from value into the stream + void Write(s32 value, s32 value_size); + + /// Writes a single bit with half probability + void Write(bool bit); + + /// Writes a bit to the base_stream encoded with probability + void Write(bool bit, s32 probability); + + /// Signal the end of the bitstream + void End(); + + [[nodiscard]] std::vector<u8>& GetBuffer() { + return base_stream.GetBuffer(); + } + + [[nodiscard]] const std::vector<u8>& GetBuffer() const { + return base_stream.GetBuffer(); + } + +private: + u8 PeekByte(); + Common::Stream base_stream{}; + u32 low_value{}; + u32 range{0xff}; + s32 count{-24}; + s32 half_probability{128}; +}; + +class VpxBitStreamWriter { +public: + VpxBitStreamWriter(); + ~VpxBitStreamWriter(); + + VpxBitStreamWriter(const VpxBitStreamWriter&) = delete; + VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete; + + VpxBitStreamWriter(VpxBitStreamWriter&&) = default; + VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default; + + /// Write an unsigned integer value + void WriteU(u32 value, u32 value_size); + + /// Write a signed integer value + void WriteS(s32 value, u32 value_size); + + /// Based on 6.2.10 of VP9 Spec, writes a delta coded value + void WriteDeltaQ(u32 value); + + /// Write a single bit. + void WriteBit(bool state); + + /// Pushes current buffer into buffer_array, resets buffer + void Flush(); + + /// Returns byte_array + [[nodiscard]] std::vector<u8>& GetByteArray(); + + /// Returns const byte_array + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + /// Write bit_count bits from value into buffer + void WriteBits(u32 value, u32 bit_count); + + /// Gets next available position in buffer, invokes Flush() if buffer is full + s32 GetFreeBufferBits(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class VP9 { +public: + explicit VP9(GPU& gpu); + ~VP9(); + + VP9(const VP9&) = delete; + VP9& operator=(const VP9&) = delete; + + VP9(VP9&&) = default; + VP9& operator=(VP9&&) = delete; + + /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec + /// documentation + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state); + + /// Returns true if the most recent frame was a hidden frame. 
+ [[nodiscard]] bool WasFrameHidden() const { + return hidden; + } + +private: + /// Generates compressed header probability updates in the bitstream writer + template <typename T, std::size_t N> + void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob); + + /// Generates compressed header probability updates in the bitstream writer + /// If probs are not equal, WriteProbabilityDelta is invoked + void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Generates compressed header probability deltas in the bitstream writer + void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Inverse of 6.3.4 Decode term subexp + void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value); + + /// Writes if the value is less than the test value + bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test); + + /// Writes probability updates for the Coef probabilities + void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array<u8, 2304>& new_prob, + const std::array<u8, 2304>& old_prob); + + /// Write probabilities for 4-byte aligned structures + template <typename T, std::size_t N> + void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob); + + /// Write motion vector probability updates. 6.3.17 in the spec + void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Returns VP9 information from NVDEC provided offset and size + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state); + + /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct + void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); + + /// Returns frame to be decoded after buffering + [[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state); + + /// Use NVDEC providied information to compose the headers for the current frame + [[nodiscard]] std::vector<u8> ComposeCompressedHeader(); + [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); + + GPU& gpu; + std::vector<u8> frame; + + std::array<s8, 4> loop_filter_ref_deltas{}; + std::array<s8, 2> loop_filter_mode_deltas{}; + + bool hidden = false; + s64 current_frame_number = -2; // since we buffer 2 frames + s32 grace_period = 6; // frame offsets need to stabilize + std::array<FrameContexts, 4> frame_ctxs{}; + Vp9FrameContainer next_frame{}; + Vp9FrameContainer next_next_frame{}; + bool swap_next_golden{}; + + Vp9PictureInfo current_frame_info{}; + Vp9EntropyProbs prev_frame_probs{}; + + s32 diff_update_probability = 252; + s32 frame_sync_code = 0x498342; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h new file mode 100644 index 000000000..4f0b05d22 --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9_types.h @@ -0,0 +1,366 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
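+//
+// Declares the raw NVDEC-facing layouts (PictureInfo, EntropyProbs) together with the unpacked
+// structures they convert into (Vp9PictureInfo, Vp9EntropyProbs) for VP9 header composition.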
+ +#pragma once + +#include <array> +#include <cstring> +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; + +namespace Decoder { +struct Vp9FrameDimensions { + s16 width{}; + s16 height{}; + s16 luma_pitch{}; + s16 chroma_pitch{}; +}; +static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size"); + +enum FrameFlags : u32 { + IsKeyFrame = 1 << 0, + LastFrameIsKeyFrame = 1 << 1, + FrameSizeChanged = 1 << 2, + ErrorResilientMode = 1 << 3, + LastShowFrame = 1 << 4, + IntraOnly = 1 << 5, +}; + +enum class MvJointType { + MvJointZero = 0, /* Zero vector */ + MvJointHnzvz = 1, /* Vert zero, hor nonzero */ + MvJointHzvnz = 2, /* Hor zero, vert nonzero */ + MvJointHnzvnz = 3, /* Both components nonzero */ +}; +enum class MvClassType { + MvClass0 = 0, /* (0, 2] integer pel */ + MvClass1 = 1, /* (2, 4] integer pel */ + MvClass2 = 2, /* (4, 8] integer pel */ + MvClass3 = 3, /* (8, 16] integer pel */ + MvClass4 = 4, /* (16, 32] integer pel */ + MvClass5 = 5, /* (32, 64] integer pel */ + MvClass6 = 6, /* (64, 128] integer pel */ + MvClass7 = 7, /* (128, 256] integer pel */ + MvClass8 = 8, /* (256, 512] integer pel */ + MvClass9 = 9, /* (512, 1024] integer pel */ + MvClass10 = 10, /* (1024,2048] integer pel */ +}; + +enum class BlockSize { + Block4x4 = 0, + Block4x8 = 1, + Block8x4 = 2, + Block8x8 = 3, + Block8x16 = 4, + Block16x8 = 5, + Block16x16 = 6, + Block16x32 = 7, + Block32x16 = 8, + Block32x32 = 9, + Block32x64 = 10, + Block64x32 = 11, + Block64x64 = 12, + BlockSizes = 13, + BlockInvalid = BlockSizes +}; + +enum class PredictionMode { + DcPred = 0, // Average of above and left pixels + VPred = 1, // Vertical + HPred = 2, // Horizontal + D45Pred = 3, // Directional 45 deg = round(arctan(1 / 1) * 180 / pi) + D135Pred = 4, // Directional 135 deg = 180 - 45 + D117Pred = 5, // Directional 117 deg = 180 - 63 + D153Pred = 6, // Directional 153 deg = 180 - 27 + D207Pred = 7, // Directional 207 deg = 180 + 27 + D63Pred = 8, // Directional 63 deg = round(arctan(2 / 1) * 180 / pi) + TmPred = 9, // True-motion + NearestMv = 10, + NearMv = 11, + ZeroMv = 12, + NewMv = 13, + MbModeCount = 14 +}; + +enum class TxSize { + Tx4x4 = 0, // 4x4 transform + Tx8x8 = 1, // 8x8 transform + Tx16x16 = 2, // 16x16 transform + Tx32x32 = 3, // 32x32 transform + TxSizes = 4 +}; + +enum class TxMode { + Only4X4 = 0, // Only 4x4 transform used + Allow8X8 = 1, // Allow block transform size up to 8x8 + Allow16X16 = 2, // Allow block transform size up to 16x16 + Allow32X32 = 3, // Allow block transform size up to 32x32 + TxModeSelect = 4, // Transform specified for each block + TxModes = 5 +}; + +enum class reference_mode { + SingleReference = 0, + CompoundReference = 1, + ReferenceModeSelect = 2, + ReferenceModes = 3 +}; + +struct Segmentation { + u8 enabled{}; + u8 update_map{}; + u8 temporal_update{}; + u8 abs_delta{}; + std::array<u32, 8> feature_mask{}; + std::array<std::array<s16, 4>, 8> feature_data{}; +}; +static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); + +struct LoopFilter { + u8 mode_ref_delta_enabled{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; +}; +static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size"); + +struct Vp9EntropyProbs { + std::array<u8, 36> y_mode_prob{}; + std::array<u8, 64> partition_prob{}; + std::array<u8, 2304> coef_probs{}; + std::array<u8, 8> switchable_interp_prob{}; + std::array<u8, 28> inter_mode_prob{}; + std::array<u8, 
4> intra_inter_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 10> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + std::array<u8, 6> tx_32x32_prob{}; + std::array<u8, 4> tx_16x16_prob{}; + std::array<u8, 2> tx_8x8_prob{}; + std::array<u8, 3> skip_probs{}; + std::array<u8, 3> joints{}; + std::array<u8, 2> sign{}; + std::array<u8, 20> classes{}; + std::array<u8, 2> class_0{}; + std::array<u8, 20> prob_bits{}; + std::array<u8, 12> class_0_fr{}; + std::array<u8, 6> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; +}; +static_assert(sizeof(Vp9EntropyProbs) == 0x9F4, "Vp9EntropyProbs is an invalid size"); + +struct Vp9PictureInfo { + bool is_key_frame{}; + bool intra_only{}; + bool last_frame_was_key{}; + bool frame_size_changed{}; + bool error_resilient_mode{}; + bool last_frame_shown{}; + bool show_frame{}; + std::array<s8, 4> ref_frame_sign_bias{}; + s32 base_q_index{}; + s32 y_dc_delta_q{}; + s32 uv_dc_delta_q{}; + s32 uv_ac_delta_q{}; + bool lossless{}; + s32 transform_mode{}; + bool allow_high_precision_mv{}; + s32 interp_filter{}; + s32 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + s32 log2_tile_cols{}; + s32 log2_tile_rows{}; + bool segment_enabled{}; + bool segment_map_update{}; + bool segment_map_temporal_update{}; + s32 segment_abs_delta{}; + std::array<u32, 8> segment_feature_enable{}; + std::array<std::array<s16, 4>, 8> segment_feature_data{}; + bool mode_ref_delta_enabled{}; + bool use_prev_in_find_mv_refs{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; + Vp9EntropyProbs entropy{}; + Vp9FrameDimensions frame_size{}; + u8 first_level{}; + u8 sharpness_level{}; + u32 bitstream_size{}; + std::array<u64, 4> frame_offsets{}; + std::array<bool, 4> refresh_frame{}; +}; + +struct Vp9FrameContainer { + Vp9PictureInfo info{}; + std::vector<u8> bit_stream; +}; + +struct PictureInfo { + INSERT_PADDING_WORDS(12); + u32 bitstream_size{}; + INSERT_PADDING_WORDS(5); + Vp9FrameDimensions last_frame_size{}; + Vp9FrameDimensions golden_frame_size{}; + Vp9FrameDimensions alt_frame_size{}; + Vp9FrameDimensions current_frame_size{}; + u32 vp9_flags{}; + std::array<s8, 4> ref_frame_sign_bias{}; + u8 first_level{}; + u8 sharpness_level{}; + u8 base_q_index{}; + u8 y_dc_delta_q{}; + u8 uv_ac_delta_q{}; + u8 uv_dc_delta_q{}; + u8 lossless{}; + u8 tx_mode{}; + u8 allow_high_precision_mv{}; + u8 interp_filter{}; + u8 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + u8 log2_tile_cols{}; + u8 log2_tile_rows{}; + Segmentation segmentation{}; + LoopFilter loop_filter{}; + INSERT_PADDING_BYTES(5); + u32 surface_params{}; + INSERT_PADDING_WORDS(3); + + [[nodiscard]] Vp9PictureInfo Convert() const { + return { + .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0, + .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0, + .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0, + .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0, + .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0, + .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0, + .ref_frame_sign_bias = ref_frame_sign_bias, + .base_q_index = base_q_index, + .y_dc_delta_q = y_dc_delta_q, + .uv_dc_delta_q = uv_dc_delta_q, + .uv_ac_delta_q = uv_ac_delta_q, + .lossless = lossless != 0, + .transform_mode = tx_mode, + .allow_high_precision_mv = allow_high_precision_mv != 0, + .interp_filter = interp_filter, + .reference_mode = reference_mode, + 
.comp_fixed_ref = comp_fixed_ref, + .comp_var_ref = comp_var_ref, + .log2_tile_cols = log2_tile_cols, + .log2_tile_rows = log2_tile_rows, + .segment_enabled = segmentation.enabled != 0, + .segment_map_update = segmentation.update_map != 0, + .segment_map_temporal_update = segmentation.temporal_update != 0, + .segment_abs_delta = segmentation.abs_delta, + .segment_feature_enable = segmentation.feature_mask, + .segment_feature_data = segmentation.feature_data, + .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, + .use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) && + !(vp9_flags == (FrameFlags::FrameSizeChanged)) && + !(vp9_flags == (FrameFlags::IntraOnly)) && + (vp9_flags == (FrameFlags::LastShowFrame)) && + !(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)), + .ref_deltas = loop_filter.ref_deltas, + .mode_deltas = loop_filter.mode_deltas, + .frame_size = current_frame_size, + .first_level = first_level, + .sharpness_level = sharpness_level, + .bitstream_size = bitstream_size, + }; + } +}; +static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); + +struct EntropyProbs { + INSERT_PADDING_BYTES(1024); + std::array<std::array<u8, 4>, 7> inter_mode_prob{}; + std::array<u8, 4> intra_inter_prob{}; + INSERT_PADDING_BYTES(80); + std::array<std::array<u8, 1>, 2> tx_8x8_prob{}; + std::array<std::array<u8, 2>, 2> tx_16x16_prob{}; + std::array<std::array<u8, 3>, 2> tx_32x32_prob{}; + std::array<u8, 4> y_mode_prob_e8{}; + std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{}; + INSERT_PADDING_BYTES(64); + std::array<std::array<u8, 4>, 16> partition_prob{}; + INSERT_PADDING_BYTES(10); + std::array<std::array<u8, 2>, 4> switchable_interp_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 4> skip_probs{}; + std::array<u8, 3> joints{}; + std::array<u8, 2> sign{}; + std::array<std::array<u8, 1>, 2> class_0{}; + std::array<std::array<u8, 3>, 2> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; + std::array<std::array<u8, 10>, 2> classes{}; + std::array<std::array<std::array<u8, 3>, 2>, 2> class_0_fr{}; + std::array<std::array<u8, 10>, 2> pred_bits{}; + std::array<std::array<u8, 2>, 5> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + INSERT_PADDING_BYTES(17); + std::array<std::array<std::array<std::array<std::array<std::array<u8, 4>, 6>, 6>, 2>, 2>, 4> + coef_probs{}; + + void Convert(Vp9EntropyProbs& fc) { + std::memcpy(fc.inter_mode_prob.data(), inter_mode_prob.data(), fc.inter_mode_prob.size()); + + std::memcpy(fc.intra_inter_prob.data(), intra_inter_prob.data(), + fc.intra_inter_prob.size()); + + std::memcpy(fc.tx_8x8_prob.data(), tx_8x8_prob.data(), fc.tx_8x8_prob.size()); + std::memcpy(fc.tx_16x16_prob.data(), tx_16x16_prob.data(), fc.tx_16x16_prob.size()); + std::memcpy(fc.tx_32x32_prob.data(), tx_32x32_prob.data(), fc.tx_32x32_prob.size()); + + for (s32 i = 0; i < 4; i++) { + for (s32 j = 0; j < 9; j++) { + fc.y_mode_prob[j + 9 * i] = j < 8 ? 
y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i]; + } + } + + std::memcpy(fc.partition_prob.data(), partition_prob.data(), fc.partition_prob.size()); + + std::memcpy(fc.switchable_interp_prob.data(), switchable_interp_prob.data(), + fc.switchable_interp_prob.size()); + std::memcpy(fc.comp_inter_prob.data(), comp_inter_prob.data(), fc.comp_inter_prob.size()); + std::memcpy(fc.skip_probs.data(), skip_probs.data(), fc.skip_probs.size()); + + std::memcpy(fc.joints.data(), joints.data(), fc.joints.size()); + + std::memcpy(fc.sign.data(), sign.data(), fc.sign.size()); + std::memcpy(fc.class_0.data(), class_0.data(), fc.class_0.size()); + std::memcpy(fc.fr.data(), fr.data(), fc.fr.size()); + std::memcpy(fc.class_0_hp.data(), class_0_hp.data(), fc.class_0_hp.size()); + std::memcpy(fc.high_precision.data(), high_precision.data(), fc.high_precision.size()); + std::memcpy(fc.classes.data(), classes.data(), fc.classes.size()); + std::memcpy(fc.class_0_fr.data(), class_0_fr.data(), fc.class_0_fr.size()); + std::memcpy(fc.prob_bits.data(), pred_bits.data(), fc.prob_bits.size()); + std::memcpy(fc.single_ref_prob.data(), single_ref_prob.data(), fc.single_ref_prob.size()); + std::memcpy(fc.comp_ref_prob.data(), comp_ref_prob.data(), fc.comp_ref_prob.size()); + + std::memcpy(fc.coef_probs.data(), coef_probs.data(), fc.coef_probs.size()); + } +}; +static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size"); + +enum class Ref { Last, Golden, AltRef }; + +struct RefPoolElement { + s64 frame{}; + Ref ref{}; + bool refresh{}; +}; + +struct FrameContexts { + s64 from{}; + bool adapted{}; + Vp9EntropyProbs probs{}; +}; + +}; // namespace Decoder +}; // namespace Tegra diff --git a/src/video_core/command_classes/host1x.cpp b/src/video_core/command_classes/host1x.cpp new file mode 100644 index 000000000..c4dd4881a --- /dev/null +++ b/src/video_core/command_classes/host1x.cpp @@ -0,0 +1,39 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/command_classes/host1x.h" +#include "video_core/gpu.h" + +Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {} + +Tegra::Host1x::~Host1x() = default; + +void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u32); + std::memcpy(state_offset, &arguments, sizeof(u32)); +} + +void Tegra::Host1x::ProcessMethod(Method method, const std::vector<u32>& arguments) { + StateWrite(static_cast<u32>(method), arguments[0]); + switch (method) { + case Method::WaitSyncpt: + Execute(arguments[0]); + break; + case Method::LoadSyncptPayload32: + syncpoint_value = arguments[0]; + break; + case Method::WaitSyncpt32: + Execute(arguments[0]); + break; + default: + UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method)); + break; + } +} + +void Tegra::Host1x::Execute(u32 data) { + // This method waits on a valid syncpoint. + // TODO: Implement when proper Async is in place +} diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h new file mode 100644 index 000000000..013eaa0c1 --- /dev/null +++ b/src/video_core/command_classes/host1x.h @@ -0,0 +1,78 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
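StateWrite in host1x.cpp above addresses the register block by 32-bit word index and patches it with std::memcpy. A minimal, self-contained sketch of that pattern, using a made-up two-register layout in place of the real Host1xClassRegisters:

#include <cstdint>
#include <cstring>
#include <iostream>

namespace {
// Hypothetical register block; the real layout is Host1xClassRegisters in host1x.h.
struct Regs {
    std::uint32_t incr_syncpt; // word 0
    std::uint32_t wait_syncpt; // word 1
};

// Offsets are given in 32-bit words, so scale to bytes before copying the argument in.
void StateWrite(Regs& state, std::uint32_t offset, std::uint32_t argument) {
    auto* const dst = reinterpret_cast<std::uint8_t*>(&state) + offset * sizeof(std::uint32_t);
    std::memcpy(dst, &argument, sizeof(argument));
}
} // namespace

int main() {
    Regs regs{};
    StateWrite(regs, 1, 0xCAFE);                       // lands in regs.wait_syncpt
    std::cout << std::hex << regs.wait_syncpt << '\n'; // prints "cafe"
    return 0;
}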
+ +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; +class Nvdec; + +class Host1x { +public: + struct Host1xClassRegisters { + u32 incr_syncpt{}; + u32 incr_syncpt_ctrl{}; + u32 incr_syncpt_error{}; + INSERT_PADDING_WORDS(5); + u32 wait_syncpt{}; + u32 wait_syncpt_base{}; + u32 wait_syncpt_incr{}; + u32 load_syncpt_base{}; + u32 incr_syncpt_base{}; + u32 clear{}; + u32 wait{}; + u32 wait_with_interrupt{}; + u32 delay_use{}; + u32 tick_count_high{}; + u32 tick_count_low{}; + u32 tick_ctrl{}; + INSERT_PADDING_WORDS(23); + u32 ind_ctrl{}; + u32 ind_off2{}; + u32 ind_off{}; + std::array<u32, 31> ind_data{}; + INSERT_PADDING_WORDS(1); + u32 load_syncpoint_payload32{}; + u32 stall_ctrl{}; + u32 wait_syncpt32{}; + u32 wait_syncpt_base32{}; + u32 load_syncpt_base32{}; + u32 incr_syncpt_base32{}; + u32 stall_count_high{}; + u32 stall_count_low{}; + u32 xref_ctrl{}; + u32 channel_xref_high{}; + u32 channel_xref_low{}; + }; + static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size"); + + enum class Method : u32 { + WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4, + LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4, + WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4, + }; + + explicit Host1x(GPU& gpu); + ~Host1x(); + + /// Writes the method into the state, Invoke Execute() if encountered + void ProcessMethod(Method method, const std::vector<u32>& arguments); + +private: + /// For Host1x, execute is waiting on a syncpoint previously written into the state + void Execute(u32 data); + + /// Write argument into the provided offset + void StateWrite(u32 offset, u32 arguments); + + u32 syncpoint_value{}; + Host1xClassRegisters state{}; + GPU& gpu; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp new file mode 100644 index 000000000..8ca7a7b06 --- /dev/null +++ b/src/video_core/command_classes/nvdec.cpp @@ -0,0 +1,52 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
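The Method enumerators in host1x.h above are derived from the register layout itself: the byte offset of a register, divided by the word size, is the method index seen on the pushbuffer. A self-contained example of the same offsetof technique; the struct and names here are illustrative, not the real register block:

#include <cstddef>
#include <cstdint>
#include <iostream>

struct ExampleRegs {
    std::uint32_t incr_syncpt;      // word 0
    std::uint32_t incr_syncpt_ctrl; // word 1
    std::uint32_t wait_syncpt;      // word 2
};

enum class ExampleMethod : std::uint32_t {
    // Byte offset divided by 4 yields the word-indexed method number.
    WaitSyncpt = offsetof(ExampleRegs, wait_syncpt) / sizeof(std::uint32_t),
};
static_assert(static_cast<std::uint32_t>(ExampleMethod::WaitSyncpt) == 2);

int main() {
    std::cout << static_cast<std::uint32_t>(ExampleMethod::WaitSyncpt) << '\n'; // prints 2
    return 0;
}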
+ +#include "common/assert.h" +#include "video_core/command_classes/nvdec.h" +#include "video_core/gpu.h" + +namespace Tegra { + +Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {} + +Nvdec::~Nvdec() = default; + +void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) { + if (method == Method::SetVideoCodec) { + codec->StateWrite(static_cast<u32>(method), arguments[0]); + } else { + codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8); + } + + switch (method) { + case Method::SetVideoCodec: + codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0])); + break; + case Method::Execute: + Execute(); + break; + } +} + +AVFrame* Nvdec::GetFrame() { + return codec->GetCurrentFrame(); +} + +const AVFrame* Nvdec::GetFrame() const { + return codec->GetCurrentFrame(); +} + +void Nvdec::Execute() { + switch (codec->GetCurrentCodec()) { + case NvdecCommon::VideoCodec::H264: + case NvdecCommon::VideoCodec::Vp9: + codec->Decode(); + break; + default: + UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec())); + break; + } +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h new file mode 100644 index 000000000..eec4443f9 --- /dev/null +++ b/src/video_core/command_classes/nvdec.h @@ -0,0 +1,39 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <vector> +#include "common/common_types.h" +#include "video_core/command_classes/codecs/codec.h" + +namespace Tegra { +class GPU; + +class Nvdec { +public: + enum class Method : u32 { + SetVideoCodec = 0x80, + Execute = 0xc0, + }; + + explicit Nvdec(GPU& gpu); + ~Nvdec(); + + /// Writes the method into the state, Invoke Execute() if encountered + void ProcessMethod(Method method, const std::vector<u32>& arguments); + + /// Return most recently decoded frame + [[nodiscard]] AVFrame* GetFrame(); + [[nodiscard]] const AVFrame* GetFrame() const; + +private: + /// Invoke codec to decode a frame + void Execute(); + + GPU& gpu; + std::unique_ptr<Codec> codec; +}; +} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec_common.h b/src/video_core/command_classes/nvdec_common.h new file mode 100644 index 000000000..01b5e086d --- /dev/null +++ b/src/video_core/command_classes/nvdec_common.h @@ -0,0 +1,48 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
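Nvdec::ProcessMethod above stores most register arguments shifted left by 8, i.e. the pushbuffer passes GPU addresses in 256-byte units. A small self-contained illustration of that convention; the values are made up:

#include <cstdint>
#include <iostream>

// Arguments encode addresses divided by 256; shifting left by 8 restores the
// byte-granular GPU virtual address, as done for the nvdec register offsets above.
constexpr std::uint64_t ToGpuAddress(std::uint32_t argument) {
    return static_cast<std::uint64_t>(argument) << 8;
}
static_assert(ToGpuAddress(0x1000) == 0x100000);

int main() {
    std::cout << std::hex << ToGpuAddress(0x12345) << '\n'; // prints "1234500"
    return 0;
}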
+ +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra::NvdecCommon { + +struct NvdecRegisters { + INSERT_PADDING_WORDS(256); + u64 set_codec_id{}; + INSERT_PADDING_WORDS(254); + u64 set_platform_id{}; + u64 picture_info_offset{}; + u64 frame_bitstream_offset{}; + u64 frame_number{}; + u64 h264_slice_data_offsets{}; + u64 h264_mv_dump_offset{}; + INSERT_PADDING_WORDS(6); + u64 frame_stats_offset{}; + u64 h264_last_surface_luma_offset{}; + u64 h264_last_surface_chroma_offset{}; + std::array<u64, 17> surface_luma_offset{}; + std::array<u64, 17> surface_chroma_offset{}; + INSERT_PADDING_WORDS(132); + u64 vp9_entropy_probs_offset{}; + u64 vp9_backward_updates_offset{}; + u64 vp9_last_frame_segmap_offset{}; + u64 vp9_curr_frame_segmap_offset{}; + INSERT_PADDING_WORDS(2); + u64 vp9_last_frame_mvs_offset{}; + u64 vp9_curr_frame_mvs_offset{}; + INSERT_PADDING_WORDS(2); +}; +static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size"); + +enum class VideoCodec : u32 { + None = 0x0, + H264 = 0x3, + Vp8 = 0x5, + H265 = 0x7, + Vp9 = 0x9, +}; + +} // namespace Tegra::NvdecCommon diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp new file mode 100644 index 000000000..19dc9e0ab --- /dev/null +++ b/src/video_core/command_classes/sync_manager.cpp @@ -0,0 +1,60 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// + +#include <algorithm> +#include "sync_manager.h" +#include "video_core/gpu.h" + +namespace Tegra { +SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {} +SyncptIncrManager::~SyncptIncrManager() = default; + +void SyncptIncrManager::Increment(u32 id) { + increments.emplace_back(0, 0, id, true); + IncrementAllDone(); +} + +u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) { + const u32 handle = current_id++; + increments.emplace_back(handle, class_id, id); + return handle; +} + +void SyncptIncrManager::SignalDone(u32 handle) { + const auto done_incr = + std::find_if(increments.begin(), increments.end(), + [handle](const SyncptIncr& incr) { return incr.id == handle; }); + if (done_incr != increments.cend()) { + done_incr->complete = true; + } + IncrementAllDone(); +} + +void SyncptIncrManager::IncrementAllDone() { + std::size_t done_count = 0; + for (; done_count < increments.size(); ++done_count) { + if (!increments[done_count].complete) { + break; + } + gpu.IncrementSyncPoint(increments[done_count].syncpt_id); + } + increments.erase(increments.begin(), increments.begin() + done_count); +} +} // namespace Tegra diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h new file mode 100644 index 000000000..2c321ec58 --- /dev/null +++ b/src/video_core/command_classes/sync_manager.h @@ -0,0 +1,64 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#pragma once + +#include <mutex> +#include <vector> +#include "common/common_types.h" + +namespace Tegra { +class GPU; +struct SyncptIncr { + u32 id; + u32 class_id; + u32 syncpt_id; + bool complete; + + SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false) + : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {} +}; + +class SyncptIncrManager { +public: + explicit SyncptIncrManager(GPU& gpu); + ~SyncptIncrManager(); + + /// Add syncpoint id and increment all + void Increment(u32 id); + + /// Returns a handle to increment later + u32 IncrementWhenDone(u32 class_id, u32 id); + + /// Marks the increment for the given handle as complete, then runs IncrementAllDone + void SignalDone(u32 handle); + + /// Increment all sequential pending increments that are already done.
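/// Completed entries are consumed from the front of the queue, so increments reach the GPU strictly in submission order.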
+ void IncrementAllDone(); + +private: + std::vector<SyncptIncr> increments; + std::mutex increment_lock; + u32 current_id{}; + + GPU& gpu; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp new file mode 100644 index 000000000..5b52da277 --- /dev/null +++ b/src/video_core/command_classes/vic.cpp @@ -0,0 +1,180 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include "common/assert.h" +#include "video_core/command_classes/nvdec.h" +#include "video_core/command_classes/vic.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/texture_cache/surface_params.h" + +extern "C" { +#include <libswscale/swscale.h> +} + +namespace Tegra { + +Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) + : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {} +Vic::~Vic() = default; + +void Vic::VicStateWrite(u32 offset, u32 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32); + std::memcpy(state_offset, &arguments, sizeof(u32)); +} + +void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { + LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method)); + VicStateWrite(static_cast<u32>(method), arguments[0]); + const u64 arg = static_cast<u64>(arguments[0]) << 8; + switch (method) { + case Method::Execute: + Execute(); + break; + case Method::SetConfigStructOffset: + config_struct_address = arg; + break; + case Method::SetOutputSurfaceLumaOffset: + output_surface_luma_address = arg; + break; + case Method::SetOutputSurfaceChromaUOffset: + output_surface_chroma_u_address = arg; + break; + case Method::SetOutputSurfaceChromaVOffset: + output_surface_chroma_v_address = arg; + break; + default: + break; + } +} + +void Vic::Execute() { + if (output_surface_luma_address == 0) { + LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}", + vic_state.output_surface.luma_offset); + return; + } + const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; + const VideoPixelFormat pixel_format = + static_cast<VideoPixelFormat>(config.pixel_format.Value()); + switch (pixel_format) { + case VideoPixelFormat::BGRA8: + case VideoPixelFormat::RGBA8: { + LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + const auto* frame = nvdec_processor->GetFrame(); + + if (!frame || frame->width == 0 || frame->height == 0) { + return; + } + if (scaler_ctx == nullptr || frame->width != scaler_width || + frame->height != scaler_height) { + const AVPixelFormat target_format = + (pixel_format == VideoPixelFormat::RGBA8) ? 
AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA; + + sws_freeContext(scaler_ctx); + scaler_ctx = nullptr; + + // FFmpeg returns all frames in YUV420, convert it into expected format + scaler_ctx = + sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width, + frame->height, target_format, 0, nullptr, nullptr, nullptr); + + scaler_width = frame->width; + scaler_height = frame->height; + } + // Get Converted frame + const std::size_t linear_size = frame->width * frame->height * 4; + + using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; + AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free}; + + const int converted_stride{frame->width * 4}; + u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; + + sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, + &converted_frame_buf_addr, &converted_stride); + + const u32 blk_kind = static_cast<u32>(config.block_linear_kind); + if (blk_kind != 0) { + // swizzle pitch linear to block linear + const u32 block_height = static_cast<u32>(config.block_linear_height_log2); + const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, + block_height, 0); + std::vector<u8> swizzled_data(size); + Tegra::Texture::CopySwizzledData(frame->width, frame->height, 1, 4, 4, + swizzled_data.data(), converted_frame_buffer.get(), + false, block_height, 0, 1); + + gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); + gpu.Maxwell3D().OnMemoryWrite(); + } else { + // send pitch linear frame + gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, + linear_size); + gpu.Maxwell3D().OnMemoryWrite(); + } + break; + } + case VideoPixelFormat::Yuv420: { + LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); + + const auto* frame = nvdec_processor->GetFrame(); + + if (!frame || frame->width == 0 || frame->height == 0) { + return; + } + + const std::size_t surface_width = config.surface_width_minus1 + 1; + const std::size_t surface_height = config.surface_height_minus1 + 1; + const std::size_t half_width = surface_width / 2; + const std::size_t half_height = config.surface_height_minus1 / 2; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xff; + + const auto* luma_ptr = frame->data[0]; + const auto* chroma_b_ptr = frame->data[1]; + const auto* chroma_r_ptr = frame->data[2]; + const auto stride = frame->linesize[0]; + const auto half_stride = frame->linesize[1]; + + std::vector<u8> luma_buffer(aligned_width * surface_height); + std::vector<u8> chroma_buffer(aligned_width * half_height); + + // Populate luma buffer + for (std::size_t y = 0; y < surface_height - 1; ++y) { + std::size_t src = y * stride; + std::size_t dst = y * aligned_width; + + std::size_t size = surface_width; + + for (std::size_t offset = 0; offset < size; ++offset) { + luma_buffer[dst + offset] = luma_ptr[src + offset]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), + luma_buffer.size()); + + // Populate chroma buffer from both channels with interleaving. 
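// The decoded frame keeps U and V in separate planes; interleaving them byte by byte
// below produces the semi-planar chroma layout written to the chroma output address.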
+ for (std::size_t y = 0; y < half_height; ++y) { + std::size_t src = y * half_stride; + std::size_t dst = y * aligned_width; + + for (std::size_t x = 0; x < half_width; ++x) { + chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; + chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), + chroma_buffer.size()); + gpu.Maxwell3D().OnMemoryWrite(); + break; + } + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value()); + break; + } +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h new file mode 100644 index 000000000..8c4e284a1 --- /dev/null +++ b/src/video_core/command_classes/vic.h @@ -0,0 +1,110 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +struct SwsContext; + +namespace Tegra { +class GPU; +class Nvdec; + +struct PlaneOffsets { + u32 luma_offset{}; + u32 chroma_u_offset{}; + u32 chroma_v_offset{}; +}; + +struct VicRegisters { + INSERT_PADDING_WORDS(64); + u32 nop{}; + INSERT_PADDING_WORDS(15); + u32 pm_trigger{}; + INSERT_PADDING_WORDS(47); + u32 set_application_id{}; + u32 set_watchdog_timer{}; + INSERT_PADDING_WORDS(17); + u32 context_save_area{}; + u32 context_switch{}; + INSERT_PADDING_WORDS(43); + u32 execute{}; + INSERT_PADDING_WORDS(63); + std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{}; + u32 picture_index{}; + u32 control_params{}; + u32 config_struct_offset{}; + u32 filter_struct_offset{}; + u32 palette_offset{}; + u32 hist_offset{}; + u32 context_id{}; + u32 fce_ucode_size{}; + PlaneOffsets output_surface{}; + u32 fce_ucode_offset{}; + INSERT_PADDING_WORDS(4); + std::array<u32, 8> slot_context_id{}; + INSERT_PADDING_WORDS(16); +}; +static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size"); + +class Vic { +public: + enum class Method : u32 { + Execute = 0xc0, + SetControlParams = 0x1c1, + SetConfigStructOffset = 0x1c2, + SetOutputSurfaceLumaOffset = 0x1c8, + SetOutputSurfaceChromaUOffset = 0x1c9, + SetOutputSurfaceChromaVOffset = 0x1ca + }; + + explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor); + ~Vic(); + + /// Write to the device state. 
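/// Offset arguments (config struct and output surfaces) arrive in 256-byte units and are
/// shifted left by 8 before being stored as GPU addresses.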
+ void ProcessMethod(Method method, const std::vector<u32>& arguments); + +private: + void Execute(); + + void VicStateWrite(u32 offset, u32 arguments); + VicRegisters vic_state{}; + + enum class VideoPixelFormat : u64_le { + RGBA8 = 0x1f, + BGRA8 = 0x20, + Yuv420 = 0x44, + }; + + union VicConfig { + u64_le raw{}; + BitField<0, 7, u64_le> pixel_format; + BitField<7, 2, u64_le> chroma_loc_horiz; + BitField<9, 2, u64_le> chroma_loc_vert; + BitField<11, 4, u64_le> block_linear_kind; + BitField<15, 4, u64_le> block_linear_height_log2; + BitField<19, 3, u64_le> reserved0; + BitField<22, 10, u64_le> reserved1; + BitField<32, 14, u64_le> surface_width_minus1; + BitField<46, 14, u64_le> surface_height_minus1; + }; + + GPU& gpu; + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + + GPUVAddr config_struct_address{}; + GPUVAddr output_surface_luma_address{}; + GPUVAddr output_surface_chroma_u_address{}; + GPUVAddr output_surface_chroma_v_address{}; + + SwsContext* scaler_ctx{}; + s32 scaler_width{}; + s32 scaler_height{}; +}; + +} // namespace Tegra diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp new file mode 100644 index 000000000..b06c32c84 --- /dev/null +++ b/src/video_core/compatible_formats.cpp @@ -0,0 +1,155 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <bitset> +#include <cstddef> + +#include "video_core/compatible_formats.h" +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +namespace { + +// Compatibility table taken from Table 3.X.2 in: +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt + +constexpr std::array VIEW_CLASS_128_BITS = { + PixelFormat::R32G32B32A32_FLOAT, + PixelFormat::R32G32B32A32_UINT, + PixelFormat::R32G32B32A32_SINT, +}; + +constexpr std::array VIEW_CLASS_96_BITS = { + PixelFormat::R32G32B32_FLOAT, +}; +// Missing formats: +// PixelFormat::RGB32UI, +// PixelFormat::RGB32I, + +constexpr std::array VIEW_CLASS_64_BITS = { + PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_UINT, + PixelFormat::R32G32_SINT, PixelFormat::R16G16B16A16_FLOAT, + PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM, + PixelFormat::R16G16B16A16_UINT, PixelFormat::R16G16B16A16_SINT, +}; + +// TODO: How should we handle 48 bits? + +constexpr std::array VIEW_CLASS_32_BITS = { + PixelFormat::R16G16_FLOAT, PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT, + PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT, PixelFormat::R32_UINT, + PixelFormat::R16G16_SINT, PixelFormat::R32_SINT, PixelFormat::A8B8G8R8_UNORM, + PixelFormat::R16G16_UNORM, PixelFormat::A8B8G8R8_SNORM, PixelFormat::R16G16_SNORM, + PixelFormat::A8B8G8R8_SRGB, PixelFormat::E5B9G9R9_FLOAT, PixelFormat::B8G8R8A8_UNORM, + PixelFormat::B8G8R8A8_SRGB, PixelFormat::A8B8G8R8_UINT, PixelFormat::A8B8G8R8_SINT, + PixelFormat::A2B10G10R10_UINT, +}; + +// TODO: How should we handle 24 bits? 
+ +constexpr std::array VIEW_CLASS_16_BITS = { + PixelFormat::R16_FLOAT, PixelFormat::R8G8_UINT, PixelFormat::R16_UINT, + PixelFormat::R16_SINT, PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM, + PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM, PixelFormat::R8G8_SINT, +}; + +constexpr std::array VIEW_CLASS_8_BITS = { + PixelFormat::R8_UINT, + PixelFormat::R8_UNORM, + PixelFormat::R8_SINT, + PixelFormat::R8_SNORM, +}; + +constexpr std::array VIEW_CLASS_RGTC1_RED = { + PixelFormat::BC4_UNORM, + PixelFormat::BC4_SNORM, +}; + +constexpr std::array VIEW_CLASS_RGTC2_RG = { + PixelFormat::BC5_UNORM, + PixelFormat::BC5_SNORM, +}; + +constexpr std::array VIEW_CLASS_BPTC_UNORM = { + PixelFormat::BC7_UNORM, + PixelFormat::BC7_SRGB, +}; + +constexpr std::array VIEW_CLASS_BPTC_FLOAT = { + PixelFormat::BC6H_SFLOAT, + PixelFormat::BC6H_UFLOAT, +}; + +// Compatibility table taken from Table 4.X.1 in: +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt + +constexpr std::array COPY_CLASS_128_BITS = { + PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT, + PixelFormat::BC2_UNORM, PixelFormat::BC2_SRGB, PixelFormat::BC3_UNORM, + PixelFormat::BC3_SRGB, PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM, + PixelFormat::BC7_UNORM, PixelFormat::BC7_SRGB, PixelFormat::BC6H_SFLOAT, + PixelFormat::BC6H_UFLOAT, +}; +// Missing formats: +// PixelFormat::RGBA32I +// COMPRESSED_RG_RGTC2 + +constexpr std::array COPY_CLASS_64_BITS = { + PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT, + PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM, + PixelFormat::R16G16B16A16_SINT, PixelFormat::R32G32_UINT, + PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_SINT, + PixelFormat::BC1_RGBA_UNORM, PixelFormat::BC1_RGBA_SRGB, +}; +// Missing formats: +// COMPRESSED_RGB_S3TC_DXT1_EXT +// COMPRESSED_SRGB_S3TC_DXT1_EXT +// COMPRESSED_RGBA_S3TC_DXT1_EXT +// COMPRESSED_SIGNED_RED_RGTC1 + +void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) { + compatiblity[format_a][format_b] = true; + compatiblity[format_b][format_a] = true; +} + +void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) { + Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b)); +} + +template <typename Range> +void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) { + for (auto it_a = range.begin(); it_a != range.end(); ++it_a) { + for (auto it_b = it_a; it_b != range.end(); ++it_b) { + Enable(compatibility, *it_a, *it_b); + } + } +} + +} // Anonymous namespace + +FormatCompatibility::FormatCompatibility() { + for (size_t i = 0; i < MaxPixelFormat; ++i) { + // Identity is allowed + Enable(view, i, i); + } + + EnableRange(view, VIEW_CLASS_128_BITS); + EnableRange(view, VIEW_CLASS_96_BITS); + EnableRange(view, VIEW_CLASS_64_BITS); + EnableRange(view, VIEW_CLASS_32_BITS); + EnableRange(view, VIEW_CLASS_16_BITS); + EnableRange(view, VIEW_CLASS_8_BITS); + EnableRange(view, VIEW_CLASS_RGTC1_RED); + EnableRange(view, VIEW_CLASS_RGTC2_RG); + EnableRange(view, VIEW_CLASS_BPTC_UNORM); + EnableRange(view, VIEW_CLASS_BPTC_FLOAT); + + copy = view; + EnableRange(copy, COPY_CLASS_128_BITS); + EnableRange(copy, COPY_CLASS_64_BITS); +} + +} // namespace VideoCore::Surface diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h new file mode 100644 index 000000000..51766349b --- /dev/null +++ b/src/video_core/compatible_formats.h @@ -0,0 
+1,34 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <bitset> +#include <cstddef> + +#include "video_core/surface.h" + +namespace VideoCore::Surface { + +class FormatCompatibility { +public: + using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>; + + explicit FormatCompatibility(); + + bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept { + return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; + } + + bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept { + return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; + } + +private: + Table view; + Table copy; +}; + +} // namespace VideoCore::Surface diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 713c14182..d8801b1f5 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/cityhash.h" #include "common/microprofile.h" #include "core/core.h" #include "core/memory.h" @@ -12,7 +13,7 @@ namespace Tegra { -DmaPusher::DmaPusher(GPU& gpu) : gpu(gpu) {} +DmaPusher::DmaPusher(Core::System& system, GPU& gpu) : gpu{gpu}, system{system} {} DmaPusher::~DmaPusher() = default; @@ -21,17 +22,22 @@ MICROPROFILE_DEFINE(DispatchCalls, "GPU", "Execute command buffer", MP_RGB(128, void DmaPusher::DispatchCalls() { MICROPROFILE_SCOPE(DispatchCalls); + gpu.SyncGuestHost(); // On entering GPU code, assume all memory may be touched by the ARM core. gpu.Maxwell3D().OnMemoryWrite(); dma_pushbuffer_subindex = 0; - while (Core::System::GetInstance().IsPoweredOn()) { + dma_state.is_last_call = true; + + while (system.IsPoweredOn()) { if (!Step()) { break; } } gpu.FlushCommands(); + gpu.SyncGuestHost(); + gpu.OnCommandListEnd(); } bool DmaPusher::Step() { @@ -40,44 +46,59 @@ bool DmaPusher::Step() { return false; } - const CommandList& command_list{dma_pushbuffer.front()}; - ASSERT_OR_EXECUTE(!command_list.empty(), { - // Somehow the command_list is empty, in order to avoid a crash - // We ignore it and assume its size is 0. - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - return true; - }); - const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]}; - GPUVAddr dma_get = command_list_header.addr; - GPUVAddr dma_put = dma_get + command_list_header.size * sizeof(u32); - bool non_main = command_list_header.is_non_main; - - if (dma_pushbuffer_subindex >= command_list.size()) { - // We've gone through the current list, remove it from the queue - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - } + CommandList& command_list{dma_pushbuffer.front()}; - if (command_list_header.size == 0) { - return true; - } + ASSERT_OR_EXECUTE( + command_list.command_lists.size() || command_list.prefetch_command_list.size(), { + // Somehow the command_list is empty, in order to avoid a crash + // We ignore it and assume its size is 0. 
+ dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + return true; + }); - // Push buffer non-empty, read a word - command_headers.resize(command_list_header.size); - gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); + if (command_list.prefetch_command_list.size()) { + // Prefetched command list from nvdrv, used for things like synchronization + command_headers = std::move(command_list.prefetch_command_list); + dma_pushbuffer.pop(); + } else { + const CommandListHeader command_list_header{ + command_list.command_lists[dma_pushbuffer_subindex++]}; + const GPUVAddr dma_get = command_list_header.addr; + + if (dma_pushbuffer_subindex >= command_list.command_lists.size()) { + // We've gone through the current list, remove it from the queue + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + } - for (const CommandHeader& command_header : command_headers) { + if (command_list_header.size == 0) { + return true; + } - // now, see if we're in the middle of a command - if (dma_state.length_pending) { - // Second word of long non-inc methods command - method count - dma_state.length_pending = 0; - dma_state.method_count = command_header.method_count_; - } else if (dma_state.method_count) { + // Push buffer non-empty, read a word + command_headers.resize(command_list_header.size); + gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), + command_list_header.size * sizeof(u32)); + } + for (std::size_t index = 0; index < command_headers.size();) { + const CommandHeader& command_header = command_headers[index]; + + if (dma_state.method_count) { // Data word of methods command - CallMethod(command_header.argument); + if (dma_state.non_incrementing) { + const u32 max_write = static_cast<u32>( + std::min<std::size_t>(index + dma_state.method_count, command_headers.size()) - + index); + CallMultiMethod(&command_header.argument, max_write); + dma_state.method_count -= max_write; + dma_state.is_last_call = true; + index += max_write; + continue; + } else { + dma_state.is_last_call = dma_state.method_count <= 1; + CallMethod(command_header.argument); + } if (!dma_state.non_incrementing) { dma_state.method++; @@ -117,11 +138,7 @@ bool DmaPusher::Step() { break; } } - } - - if (!non_main) { - // TODO (degasus): This is dead code, as dma_mget is never read. 
- dma_mget = dma_put; + index++; } return true; @@ -134,7 +151,22 @@ void DmaPusher::SetState(const CommandHeader& command_header) { } void DmaPusher::CallMethod(u32 argument) const { - gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count}); + if (dma_state.method < non_puller_methods) { + gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count}); + } else { + subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument, + dma_state.is_last_call); + } +} + +void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const { + if (dma_state.method < non_puller_methods) { + gpu.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods, + dma_state.method_count); + } else { + subchannels[dma_state.subchannel]->CallMultiMethod(dma_state.method, base_start, + num_methods, dma_state.method_count); + } } } // namespace Tegra diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 6ab06518f..96ac267f7 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -4,14 +4,22 @@ #pragma once +#include <array> #include <vector> #include <queue> #include "common/bit_field.h" #include "common/common_types.h" +#include "video_core/engines/engine_interface.h" + +namespace Core { +class System; +} namespace Tegra { +class GPU; + enum class SubmissionMode : u32 { IncreasingOld = 0, Increasing = 1, @@ -21,6 +29,31 @@ enum class SubmissionMode : u32 { IncreaseOnce = 5 }; +// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence +// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. +// So the values you see in docs might be multiplied by 4. +enum class BufferMethods : u32 { + BindObject = 0x0, + Nop = 0x2, + SemaphoreAddressHigh = 0x4, + SemaphoreAddressLow = 0x5, + SemaphoreSequence = 0x6, + SemaphoreTrigger = 0x7, + NotifyIntr = 0x8, + WrcacheFlush = 0x9, + Unk28 = 0xA, + UnkCacheFlush = 0xB, + RefCnt = 0x14, + SemaphoreAcquire = 0x1A, + SemaphoreRelease = 0x1B, + FenceValue = 0x1C, + FenceAction = 0x1D, + WaitForInterrupt = 0x1E, + Unk7c = 0x1F, + Yield = 0x20, + NonPullerMethods = 0x40, +}; + struct CommandListHeader { union { u64 raw; @@ -43,9 +76,23 @@ union CommandHeader { static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); -class GPU; - -using CommandList = std::vector<Tegra::CommandListHeader>; +inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) { + CommandHeader result{}; + result.method.Assign(static_cast<u32>(method)); + result.arg_count.Assign(arg_count); + result.mode.Assign(mode); + return result; +} + +struct CommandList final { + CommandList() = default; + explicit CommandList(std::size_t size) : command_lists(size) {} + explicit CommandList(std::vector<Tegra::CommandHeader>&& prefetch_command_list) + : prefetch_command_list{std::move(prefetch_command_list)} {} + + std::vector<Tegra::CommandListHeader> command_lists; + std::vector<Tegra::CommandHeader> prefetch_command_list; +}; /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the @@ -54,9 +101,9 @@ using CommandList = std::vector<Tegra::CommandListHeader>; * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for * details on this implementation. 
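 * Methods below the non-puller threshold are handled by the puller itself; higher methods
 * are forwarded to the engine bound to the addressed subchannel.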
*/ -class DmaPusher { +class DmaPusher final { public: - explicit DmaPusher(GPU& gpu); + explicit DmaPusher(Core::System& system, GPU& gpu); ~DmaPusher(); void Push(CommandList&& entries) { @@ -65,14 +112,19 @@ public: void DispatchCalls(); + void BindSubchannel(Tegra::Engines::EngineInterface* engine, u32 subchannel_id) { + subchannels[subchannel_id] = engine; + } + private: + static constexpr u32 non_puller_methods = 0x40; + static constexpr u32 max_subchannels = 8; bool Step(); void SetState(const CommandHeader& command_header); void CallMethod(u32 argument) const; - - GPU& gpu; + void CallMultiMethod(const u32* base_start, u32 num_methods) const; std::vector<CommandHeader> command_headers; ///< Buffer for list of commands fetched at once @@ -85,13 +137,18 @@ private: u32 method_count; ///< Current method count u32 length_pending; ///< Large NI command length pending bool non_incrementing; ///< Current command's NI flag + bool is_last_call; }; DmaState dma_state{}; bool dma_increment_once{}; - GPUVAddr dma_mget{}; ///< main pushbuffer last read address bool ib_enable{true}; ///< IB mode enabled + + std::array<Tegra::Engines::EngineInterface*, max_subchannels> subchannels{}; + + GPU& gpu; + Core::System& system; }; } // namespace Tegra diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index ebe139504..f46e81bb7 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -93,6 +93,7 @@ public: virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0; virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const = 0; + virtual SamplerDescriptor AccessSampler(u32 handle) const = 0; virtual u32 GetBoundBuffer() const = 0; virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h new file mode 100644 index 000000000..18a9db7e6 --- /dev/null +++ b/src/video_core/engines/engine_interface.h @@ -0,0 +1,22 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <type_traits> +#include "common/common_types.h" + +namespace Tegra::Engines { + +class EngineInterface { +public: + /// Write the value to the register identified by method. + virtual void CallMethod(u32 method, u32 method_argument, bool is_last_call) = 0; + + /// Write multiple values to the register identified by method. 
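/// Used by the DMA pusher to hand off runs of non-incrementing method data in one call.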
+ virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) = 0; +}; + +} // namespace Tegra::Engines diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 85d308e26..9409c4075 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -10,15 +10,21 @@ namespace Tegra::Engines { -Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} +Fermi2D::Fermi2D() = default; -void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { - ASSERT_MSG(method_call.method < Regs::NUM_REGS, +Fermi2D::~Fermi2D() = default; + +void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { + rasterizer = &rasterizer_; +} + +void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { + ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Fermi2D register, increase the size of the Regs structure"); - regs.reg_array[method_call.method] = method_call.argument; + regs.reg_array[method] = method_argument; - switch (method_call.method) { + switch (method) { // Trigger the surface copy on the last register write. This is blit_src_y, but this is 64-bit, // so trigger on the second 32-bit write. case FERMI2D_REG_INDEX(blit_src_y) + 1: { @@ -28,7 +34,13 @@ void Fermi2D::CallMethod(const GPU::MethodCall& method_call) { } } -std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) { +void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) { + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } +} + +static std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) { const u32 line_a = src_2 - src_1; const u32 line_b = dst_2 - dst_1; const u32 excess = std::max<s32>(0, line_a - src_line + src_1); @@ -75,13 +87,13 @@ void Fermi2D::HandleSurfaceCopy() { const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2}; const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2, dst_blit_y2}; - Config copy_config; - copy_config.operation = regs.operation; - copy_config.filter = regs.blit_control.filter; - copy_config.src_rect = src_rect; - copy_config.dst_rect = dst_rect; - - if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) { + const Config copy_config{ + .operation = regs.operation, + .filter = regs.blit_control.filter, + .src_rect = src_rect, + .dst_rect = dst_rect, + }; + if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) { UNIMPLEMENTED(); } } diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index dba342c70..0909709ec 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -10,6 +10,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/math_util.h" +#include "video_core/engines/engine_interface.h" #include "video_core/gpu.h" namespace Tegra { @@ -31,13 +32,20 @@ namespace Tegra::Engines { #define FERMI2D_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32)) -class Fermi2D final { +class Fermi2D final : public EngineInterface { public: - explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer); - ~Fermi2D() = default; + explicit Fermi2D(); + ~Fermi2D(); + + /// Binds a rasterizer to this engine. 
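/// The rasterizer is no longer passed at construction, so this must be called before any
/// surface copy is processed.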
+ void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); /// Write the value to the register identified by method. - void CallMethod(const GPU::MethodCall& method_call); + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; + + /// Write multiple values to the register identified by method. + void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; enum class Origin : u32 { Center = 0, @@ -137,14 +145,14 @@ public: } regs{}; struct Config { - Operation operation; - Filter filter; + Operation operation{}; + Filter filter{}; Common::Rectangle<u32> src_rect; Common::Rectangle<u32> dst_rect; }; private: - VideoCore::RasterizerInterface& rasterizer; + VideoCore::RasterizerInterface* rasterizer; /// Performs the copy from the source surface to the destination surface as configured in the /// registers. diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 368c75a66..898370739 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -16,28 +16,28 @@ namespace Tegra::Engines { -KeplerCompute::KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager) - : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, upload_state{ - memory_manager, - regs.upload} {} +KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_) + : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {} KeplerCompute::~KeplerCompute() = default; -void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { - ASSERT_MSG(method_call.method < Regs::NUM_REGS, +void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { + rasterizer = &rasterizer_; +} + +void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) { + ASSERT_MSG(method < Regs::NUM_REGS, "Invalid KeplerCompute register, increase the size of the Regs structure"); - regs.reg_array[method_call.method] = method_call.argument; + regs.reg_array[method] = method_argument; - switch (method_call.method) { + switch (method) { case KEPLER_COMPUTE_REG_INDEX(exec_upload): { upload_state.ProcessExec(regs.exec_upload.linear != 0); break; } case KEPLER_COMPUTE_REG_INDEX(data_upload): { - const bool is_last_call = method_call.IsLastCall(); - upload_state.ProcessData(method_call.argument, is_last_call); + upload_state.ProcessData(method_argument, is_last_call); if (is_last_call) { system.GPU().Maxwell3D().OnMemoryWrite(); } @@ -51,6 +51,13 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { } } +void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) { + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } +} + Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const { const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value(); ASSERT(cbuf_mask[regs.tex_cb_index]); @@ -86,8 +93,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con ASSERT(stage == ShaderType::Compute); const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; + return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} - const 
Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); @@ -95,11 +105,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con } VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() { - return rasterizer.AccessGuestDriverProfile(); + return rasterizer->AccessGuestDriverProfile(); } const VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() const { - return rasterizer.AccessGuestDriverProfile(); + return rasterizer->AccessGuestDriverProfile(); } void KeplerCompute::ProcessLaunch() { @@ -110,7 +120,7 @@ void KeplerCompute::ProcessLaunch() { const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start; LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr); - rasterizer.DispatchCompute(code_addr); + rasterizer->DispatchCompute(code_addr); } Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const { diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index eeb79c56f..7f2500aab 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -11,6 +11,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "video_core/engines/const_buffer_engine_interface.h" +#include "video_core/engines/engine_interface.h" #include "video_core/engines/engine_upload.h" #include "video_core/engines/shader_type.h" #include "video_core/gpu.h" @@ -39,12 +40,14 @@ namespace Tegra::Engines { #define KEPLER_COMPUTE_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32)) -class KeplerCompute final : public ConstBufferEngineInterface { +class KeplerCompute final : public ConstBufferEngineInterface, public EngineInterface { public: - explicit KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager); + explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager); ~KeplerCompute(); + /// Binds a rasterizer to this engine. + void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + static constexpr std::size_t NumConstBuffers = 8; struct Regs { @@ -200,7 +203,11 @@ public: "KeplerCompute LaunchParams has wrong size"); /// Write the value to the register identified by method. - void CallMethod(const GPU::MethodCall& method_call); + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; + + /// Write multiple values to the register identified by method. 
+ void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; Texture::FullTextureInfo GetTexture(std::size_t offset) const; @@ -214,6 +221,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } @@ -223,11 +232,6 @@ public: const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; private: - Core::System& system; - VideoCore::RasterizerInterface& rasterizer; - MemoryManager& memory_manager; - Upload::State upload_state; - void ProcessLaunch(); /// Retrieves information about a specific TIC entry from the TIC buffer. @@ -235,6 +239,11 @@ private: /// Retrieves information about a specific TSC entry from the TSC buffer. Texture::TSCEntry GetTSCEntry(u32 tsc_index) const; + + Core::System& system; + MemoryManager& memory_manager; + VideoCore::RasterizerInterface* rasterizer = nullptr; + Upload::State upload_state; }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 597872e43..dc71b2eec 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -19,20 +19,19 @@ KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) KeplerMemory::~KeplerMemory() = default; -void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { - ASSERT_MSG(method_call.method < Regs::NUM_REGS, +void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) { + ASSERT_MSG(method < Regs::NUM_REGS, "Invalid KeplerMemory register, increase the size of the Regs structure"); - regs.reg_array[method_call.method] = method_call.argument; + regs.reg_array[method] = method_argument; - switch (method_call.method) { + switch (method) { case KEPLERMEMORY_REG_INDEX(exec): { upload_state.ProcessExec(regs.exec.linear != 0); break; } case KEPLERMEMORY_REG_INDEX(data): { - const bool is_last_call = method_call.IsLastCall(); - upload_state.ProcessData(method_call.argument, is_last_call); + upload_state.ProcessData(method_argument, is_last_call); if (is_last_call) { system.GPU().Maxwell3D().OnMemoryWrite(); } @@ -41,4 +40,11 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { } } +void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) { + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } +} + } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 396fb6e86..5b7f71a00 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -10,6 +10,7 @@ #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/engines/engine_interface.h" #include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" @@ -32,13 +33,17 @@ namespace Tegra::Engines { #define KEPLERMEMORY_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32)) -class KeplerMemory final { +class KeplerMemory final : public EngineInterface { public: KeplerMemory(Core::System& system, MemoryManager& memory_manager); ~KeplerMemory(); /// Write the value to the register identified by 
method. - void CallMethod(const GPU::MethodCall& method_call); + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; + + /// Write multiple values to the register identified by method. + void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; struct Regs { static constexpr size_t NUM_REGS = 0x7F; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index ba63b44b4..6287df633 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -22,15 +22,19 @@ using VideoCore::QueryType; /// First register id that is actually a Macro call. constexpr u32 MacroRegistersStart = 0xE00; -Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager) - : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, - macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { +Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_) + : system{system_}, memory_manager{memory_manager_}, macro_engine{GetMacroEngine(*this)}, + upload_state{memory_manager, regs.upload} { dirty.flags.flip(); - InitializeRegisterDefaults(); } +Maxwell3D::~Maxwell3D() = default; + +void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { + rasterizer = &rasterizer_; +} + void Maxwell3D::InitializeRegisterDefaults() { // Initializes registers to their default values - what games expect them to be at boot. This is // for certain registers that may not be explicitly set by games. @@ -44,6 +48,12 @@ void Maxwell3D::InitializeRegisterDefaults() { viewport.depth_range_near = 0.0f; viewport.depth_range_far = 1.0f; } + for (auto& viewport : regs.viewport_transform) { + viewport.swizzle.x.Assign(Regs::ViewportSwizzle::PositiveX); + viewport.swizzle.y.Assign(Regs::ViewportSwizzle::PositiveY); + viewport.swizzle.z.Assign(Regs::ViewportSwizzle::PositiveZ); + viewport.swizzle.w.Assign(Regs::ViewportSwizzle::PositiveW); + } // Doom and Bomberman seems to use the uninitialized registers and just enable blend // so initialize blend registers with sane values @@ -92,11 +102,19 @@ void Maxwell3D::InitializeRegisterDefaults() { color_mask.A.Assign(1); } + for (auto& format : regs.vertex_attrib_format) { + format.constant.Assign(1); + } + // NVN games expect these values to be enabled at boot regs.rasterize_enable = 1; regs.rt_separate_frag_data = 1; regs.framebuffer_srgb = 1; + regs.line_width_aliased = 1.0f; + regs.line_width_smooth = 1.0f; regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise; + regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill; + regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill; shadow_state = regs; @@ -106,7 +124,113 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } -void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) { +void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) { + if (executing_macro == 0) { + // A macro call must begin by writing the macro method's register, not its argument. 
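
The CallMultiMethod overloads above fall back to one CallMethod per argument, and `methods_pending - i <= 1` marks only the final write of the batch as the last call. A self-contained sketch of that decomposition (the callback, method number and argument values are made up for illustration):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Fallback used by engines without a dedicated multi-write path: each argument
// becomes an individual register write, and only the final write of the whole
// batch carries is_last_call = true (methods_pending counts down across calls).
void CallMultiMethod(std::uint32_t method, const std::uint32_t* base_start, std::uint32_t amount,
                     std::uint32_t methods_pending,
                     void (*call_method)(std::uint32_t, std::uint32_t, bool)) {
    for (std::size_t i = 0; i < amount; ++i) {
        const bool is_last_call = methods_pending - static_cast<std::uint32_t>(i) <= 1;
        call_method(method, base_start[i], is_last_call);
    }
}

int main() {
    const std::uint32_t args[3] = {10, 20, 30};
    // Three writes pending: only the third call reports is_last_call == true.
    CallMultiMethod(0x6c, args, 3, 3, [](std::uint32_t method, std::uint32_t argument, bool last) {
        std::printf("method=0x%x argument=%u last=%d\n", static_cast<unsigned>(method),
                    static_cast<unsigned>(argument), last ? 1 : 0);
    });
}
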
+ ASSERT_MSG((method % 2) == 0, + "Can't start macro execution by writing to the ARGS register"); + executing_macro = method; + } + + macro_params.insert(macro_params.end(), base_start, base_start + amount); + + // Call the macro when there are no more parameters in the command buffer + if (is_last_call) { + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); + } +} + +u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) { + // Keep track of the register value in shadow_state when requested. + const auto control = shadow_state.shadow_ram_control; + if (control == Regs::ShadowRamControl::Track || + control == Regs::ShadowRamControl::TrackWithFilter) { + shadow_state.reg_array[method] = argument; + return argument; + } + if (control == Regs::ShadowRamControl::Replay) { + return shadow_state.reg_array[method]; + } + return argument; +} + +void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) { + if (regs.reg_array[method] == argument) { + return; + } + regs.reg_array[method] = argument; + + for (const auto& table : dirty.tables) { + dirty.flags[table[method]] = true; + } +} + +void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, + bool is_last_call) { + switch (method) { + case MAXWELL3D_REG_INDEX(wait_for_idle): + return rasterizer->WaitForIdle(); + case MAXWELL3D_REG_INDEX(shadow_ram_control): + shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument); + return; + case MAXWELL3D_REG_INDEX(macros.data): + return macro_engine->AddCode(regs.macros.upload_address, argument); + case MAXWELL3D_REG_INDEX(macros.bind): + return ProcessMacroBind(argument); + case MAXWELL3D_REG_INDEX(firmware[4]): + return ProcessFirmwareCall4(); + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): + return StartCBData(method); + case MAXWELL3D_REG_INDEX(cb_bind[0]): + return ProcessCBBind(0); + case MAXWELL3D_REG_INDEX(cb_bind[1]): + return ProcessCBBind(1); + case MAXWELL3D_REG_INDEX(cb_bind[2]): + return ProcessCBBind(2); + case MAXWELL3D_REG_INDEX(cb_bind[3]): + return ProcessCBBind(3); + case MAXWELL3D_REG_INDEX(cb_bind[4]): + return ProcessCBBind(4); + case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): + return DrawArrays(); + case MAXWELL3D_REG_INDEX(clear_buffers): + return ProcessClearBuffers(); + case MAXWELL3D_REG_INDEX(query.query_get): + return ProcessQueryGet(); + case MAXWELL3D_REG_INDEX(condition.mode): + return ProcessQueryCondition(); + case MAXWELL3D_REG_INDEX(counter_reset): + return ProcessCounterReset(); + case MAXWELL3D_REG_INDEX(sync_info): + return ProcessSyncPoint(); + case MAXWELL3D_REG_INDEX(exec_upload): + return upload_state.ProcessExec(regs.exec_upload.linear != 0); + case MAXWELL3D_REG_INDEX(data_upload): + 
upload_state.ProcessData(argument, is_last_call); + if (is_last_call) { + OnMemoryWrite(); + } + return; + } +} + +void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) { // Reset the current macro. executing_macro = 0; @@ -115,18 +239,16 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3 ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size()); // Execute the current macro. - macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters); + macro_engine->Execute(*this, macro_positions[entry], parameters); if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } } -void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { - const u32 method = method_call.method; - +void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { if (method == cb_data_state.current) { - regs.reg_array[method] = method_call.argument; - ProcessCBData(method_call.argument); + regs.reg_array[method] = method_argument; + ProcessCBData(method_argument); return; } else if (cb_data_state.current != null_cb_data) { FinishCBData(); @@ -141,61 +263,27 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. if (method >= MacroRegistersStart) { - // We're trying to execute a macro - if (executing_macro == 0) { - // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, - "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; - } - - macro_params.push_back(method_call.argument); - - // Call the macro when there are no more parameters in the command buffer - if (method_call.IsLastCall()) { - CallMacroMethod(executing_macro, macro_params.size(), macro_params.data()); - macro_params.clear(); - } + ProcessMacro(method, &method_argument, 1, is_last_call); return; } ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register, increase the size of the Regs structure"); - u32 arg = method_call.argument; - // Keep track of the register value in shadow_state when requested. - if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track || - shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) { - shadow_state.reg_array[method] = arg; - } else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) { - arg = shadow_state.reg_array[method]; - } - - if (regs.reg_array[method] != arg) { - regs.reg_array[method] = arg; + const u32 argument = ProcessShadowRam(method, method_argument); + ProcessDirtyRegisters(method, argument); + ProcessMethodCall(method, argument, method_argument, is_last_call); +} - for (const auto& table : dirty.tables) { - dirty.flags[table[method]] = true; - } +void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) { + // Methods after 0xE00 are special, they're actually triggers for some microcode that was + // uploaded to the GPU during initialization. 
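
ProcessShadowRam above models the hardware shadow RAM: Track and TrackWithFilter record every write into the shadow state, Replay discards the incoming argument and substitutes the recorded value, and any other mode passes the argument through. A compact sketch of those semantics over a plain array; the enumerator values are illustrative assumptions, only the mode names come from the code above:

#include <array>
#include <cassert>
#include <cstdint>

// Enumerator values here are illustrative, not taken from the register headers.
enum class ShadowRamControl : std::uint32_t { Track, TrackWithFilter, Passthrough, Replay };

struct ShadowState {
    ShadowRamControl control = ShadowRamControl::Track;
    std::array<std::uint32_t, 0x100> reg_array{}; // Shadow copy of a (truncated) register file.
};

// Returns the argument that should actually be written to the live registers.
std::uint32_t ProcessShadowRam(ShadowState& shadow, std::uint32_t method, std::uint32_t argument) {
    if (shadow.control == ShadowRamControl::Track ||
        shadow.control == ShadowRamControl::TrackWithFilter) {
        shadow.reg_array[method] = argument; // Record the write.
        return argument;
    }
    if (shadow.control == ShadowRamControl::Replay) {
        return shadow.reg_array[method];     // Ignore the argument, replay the recorded value.
    }
    return argument;                         // Passthrough.
}

int main() {
    ShadowState shadow;
    ProcessShadowRam(shadow, 0x10, 42);              // Tracked.
    shadow.control = ShadowRamControl::Replay;
    assert(ProcessShadowRam(shadow, 0x10, 7) == 42); // Replayed value wins over the argument.
}
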
+ if (method >= MacroRegistersStart) { + ProcessMacro(method, base_start, amount, amount == methods_pending); + return; } - switch (method) { - case MAXWELL3D_REG_INDEX(shadow_ram_control): { - shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_call.argument); - break; - } - case MAXWELL3D_REG_INDEX(macros.data): { - ProcessMacroUpload(arg); - break; - } - case MAXWELL3D_REG_INDEX(macros.bind): { - ProcessMacroBind(arg); - break; - } - case MAXWELL3D_REG_INDEX(firmware[4]): { - ProcessFirmwareCall4(); - break; - } case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): @@ -211,67 +299,13 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - StartCBData(method); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[0]): { - ProcessCBBind(0); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[1]): { - ProcessCBBind(1); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[2]): { - ProcessCBBind(2); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[3]): { - ProcessCBBind(3); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[4]): { - ProcessCBBind(4); - break; - } - case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): { - DrawArrays(); - break; - } - case MAXWELL3D_REG_INDEX(clear_buffers): { - ProcessClearBuffers(); - break; - } - case MAXWELL3D_REG_INDEX(query.query_get): { - ProcessQueryGet(); - break; - } - case MAXWELL3D_REG_INDEX(condition.mode): { - ProcessQueryCondition(); - break; - } - case MAXWELL3D_REG_INDEX(counter_reset): { - ProcessCounterReset(); - break; - } - case MAXWELL3D_REG_INDEX(sync_info): { - ProcessSyncPoint(); - break; - } - case MAXWELL3D_REG_INDEX(exec_upload): { - upload_state.ProcessExec(regs.exec_upload.linear != 0); - break; - } - case MAXWELL3D_REG_INDEX(data_upload): { - const bool is_last_call = method_call.IsLastCall(); - upload_state.ProcessData(arg, is_last_call); - if (is_last_call) { - OnMemoryWrite(); - } + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): + ProcessCBMultiData(method, base_start, amount); break; - } default: + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } break; } } @@ -300,16 +334,15 @@ void Maxwell3D::StepInstance(const MMEDrawMode expected_mode, const u32 count) { StepInstance(expected_mode, count); } -void Maxwell3D::CallMethodFromMME(const GPU::MethodCall& method_call) { - const u32 method = method_call.method; +void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) { if (mme_inline[method]) { - regs.reg_array[method] = method_call.argument; + regs.reg_array[method] = method_argument; if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count) || method == MAXWELL3D_REG_INDEX(index_array.count)) { const MMEDrawMode expected_mode = method == MAXWELL3D_REG_INDEX(vertex_buffer.count) ? 
MMEDrawMode::Array : MMEDrawMode::Indexed; - StepInstance(expected_mode, method_call.argument); + StepInstance(expected_mode, method_argument); } else if (method == MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)) { mme_draw.instance_mode = (regs.draw.instance_next != 0) || (regs.draw.instance_cont != 0); @@ -321,7 +354,7 @@ void Maxwell3D::CallMethodFromMME(const GPU::MethodCall& method_call) { if (mme_draw.current_mode != MMEDrawMode::Undefined) { FlushMMEInlineDraw(); } - CallMethod(method_call); + CallMethod(method, method_argument, true); } } @@ -337,7 +370,7 @@ void Maxwell3D::FlushMMEInlineDraw() { const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed; if (ShouldExecute()) { - rasterizer.Draw(is_indexed, true); + rasterizer->Draw(is_indexed, true); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -358,9 +391,7 @@ void Maxwell3D::FlushMMEInlineDraw() { } void Maxwell3D::ProcessMacroUpload(u32 data) { - ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), - "upload_address exceeded macro_memory size!"); - macro_memory[regs.macros.upload_address++] = data; + macro_engine->AddCode(regs.macros.upload_address++, data); } void Maxwell3D::ProcessMacroBind(u32 data) { @@ -395,12 +426,17 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { void Maxwell3D::ProcessQueryGet() { // TODO(Subv): Support the other query units. - ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop, - "Units other than CROP are unimplemented"); + if (regs.query.query_get.unit != Regs::QueryUnit::Crop) { + LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented"); + } switch (regs.query.query_get.operation) { case Regs::QueryOperation::Release: - StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0); + if (regs.query.query_get.fence == 1) { + rasterizer->SignalSemaphore(regs.query.QueryAddress(), regs.query.query_sequence); + } else { + StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0); + } break; case Regs::QueryOperation::Acquire: // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that @@ -465,11 +501,11 @@ void Maxwell3D::ProcessQueryCondition() { void Maxwell3D::ProcessCounterReset() { switch (regs.counter_reset) { case Regs::CounterReset::SampleCnt: - rasterizer.ResetCounter(QueryType::SamplesPassed); + rasterizer->ResetCounter(QueryType::SamplesPassed); break; default: - LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}", - static_cast<int>(regs.counter_reset)); + LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", + static_cast<int>(regs.counter_reset)); break; } } @@ -479,7 +515,7 @@ void Maxwell3D::ProcessSyncPoint() { const u32 increment = regs.sync_info.increment.Value(); [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value(); if (increment) { - system.GPU().IncrementSyncPoint(sync_point); + rasterizer->SignalSyncPoint(sync_point); } } @@ -502,7 +538,7 @@ void Maxwell3D::DrawArrays() { const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count}; if (ShouldExecute()) { - rasterizer.Draw(is_indexed, false); + rasterizer->Draw(is_indexed, false); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -522,12 +558,12 @@ std::optional<u64> Maxwell3D::GetQueryResult() { return 0; case Regs::QuerySelect::SamplesPassed: // Deferred. 
- rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, - system.GPU().GetTicks()); - return {}; + rasterizer->Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, + system.GPU().GetTicks()); + return std::nullopt; default: - UNIMPLEMENTED_MSG("Unimplemented query select type {}", - static_cast<u32>(regs.query.query_get.select.Value())); + LOG_DEBUG(HW_GPU, "Unimplemented query select type {}", + static_cast<u32>(regs.query.query_get.select.Value())); return 1; } } @@ -562,6 +598,28 @@ void Maxwell3D::StartCBData(u32 method) { ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]); } +void Maxwell3D::ProcessCBMultiData(u32 method, const u32* start_base, u32 amount) { + if (cb_data_state.current != method) { + if (cb_data_state.current != null_cb_data) { + FinishCBData(); + } + constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]); + cb_data_state.start_pos = regs.const_buffer.cb_pos; + cb_data_state.id = method - first_cb_data; + cb_data_state.current = method; + cb_data_state.counter = 0; + } + const std::size_t id = cb_data_state.id; + const std::size_t size = amount; + std::size_t i = 0; + for (; i < size; i++) { + cb_data_state.buffer[id][cb_data_state.counter] = start_base[i]; + cb_data_state.counter++; + } + // Increment the current buffer position. + regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4 * amount; +} + void Maxwell3D::FinishCBData() { // Write the input value to the current const buffer at the current position. const GPUVAddr buffer_address = regs.const_buffer.BufferAddress(); @@ -628,7 +686,7 @@ void Maxwell3D::ProcessClearBuffers() { regs.clear_buffers.R == regs.clear_buffers.B && regs.clear_buffers.R == regs.clear_buffers.A); - rasterizer.Clear(); + rasterizer->Clear(); } u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const { @@ -650,8 +708,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; const auto& tex_info_buffer = shader.const_buffers[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.address + offset; + return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); @@ -659,11 +720,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b } VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() { - return rasterizer.AccessGuestDriverProfile(); + return rasterizer->AccessGuestDriverProfile(); } const VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() const { - return rasterizer.AccessGuestDriverProfile(); + return rasterizer->AccessGuestDriverProfile(); } } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 5cf6a4cc3..1cbe8fe67 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -19,10 +19,11 @@ #include "common/math_util.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/const_buffer_info.h" +#include 
"video_core/engines/engine_interface.h" #include "video_core/engines/engine_upload.h" #include "video_core/engines/shader_type.h" #include "video_core/gpu.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro.h" #include "video_core/textures/texture.h" namespace Core { @@ -48,11 +49,13 @@ namespace Tegra::Engines { #define MAXWELL3D_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32)) -class Maxwell3D final : public ConstBufferEngineInterface { +class Maxwell3D final : public ConstBufferEngineInterface, public EngineInterface { public: - explicit Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager); - ~Maxwell3D() = default; + explicit Maxwell3D(Core::System& system, MemoryManager& memory_manager); + ~Maxwell3D(); + + /// Binds a rasterizer to this engine. + void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); /// Register structure of the Maxwell3D engine. /// TODO(Subv): This structure will need to be made bigger as more registers are discovered. @@ -575,6 +578,17 @@ public: Replay = 3, }; + enum class ViewportSwizzle : u32 { + PositiveX = 0, + NegativeX = 1, + PositiveY = 2, + NegativeY = 3, + PositiveZ = 4, + NegativeZ = 5, + PositiveW = 6, + NegativeW = 7, + }; + struct RenderTargetConfig { u32 address_high; u32 address_low; @@ -586,6 +600,7 @@ public: BitField<4, 3, u32> block_height; BitField<8, 3, u32> block_depth; BitField<12, 1, InvMemoryLayout> type; + BitField<16, 1, u32> is_3d; } memory_layout; union { BitField<0, 16, u32> layers; @@ -618,7 +633,14 @@ public: f32 translate_x; f32 translate_y; f32 translate_z; - INSERT_UNION_PADDING_WORDS(2); + union { + u32 raw; + BitField<0, 3, ViewportSwizzle> x; + BitField<4, 3, ViewportSwizzle> y; + BitField<8, 3, ViewportSwizzle> z; + BitField<12, 3, ViewportSwizzle> w; + } swizzle; + INSERT_UNION_PADDING_WORDS(1); Common::Rectangle<f32> GetRect() const { return { @@ -627,7 +649,7 @@ public: GetX() + GetWidth(), // right GetY() // bottom }; - }; + } f32 GetX() const { return std::max(0.0f, translate_x - std::fabs(scale_x)); @@ -709,7 +731,9 @@ public: union { struct { - INSERT_UNION_PADDING_WORDS(0x45); + INSERT_UNION_PADDING_WORDS(0x44); + + u32 wait_for_idle; struct { u32 upload_address; @@ -1149,7 +1173,7 @@ public: /// Returns whether the vertex array specified by index is supposed to be /// accessed per instance or not. - bool IsInstancingEnabled(u32 index) const { + bool IsInstancingEnabled(std::size_t index) const { return is_instanced[index]; } } instanced_arrays; @@ -1179,6 +1203,7 @@ public: BitField<0, 1, u32> depth_range_0_1; BitField<3, 1, u32> depth_clamp_near; BitField<4, 1, u32> depth_clamp_far; + BitField<11, 1, u32> depth_clamp_disabled; } view_volume_clip_control; INSERT_UNION_PADDING_WORDS(0x1F); @@ -1259,7 +1284,8 @@ public: GPUVAddr LimitAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) | - limit_low); + limit_low) + + 1; } } vertex_array_limit[NumVertexArrays]; @@ -1356,10 +1382,14 @@ public: u32 GetRegisterValue(u32 method) const; /// Write the value to the register identified by method. - void CallMethod(const GPU::MethodCall& method_call); + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; + + /// Write multiple values to the register identified by method. + void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; /// Write the value to the register identified by method. 
- void CallMethodFromMME(const GPU::MethodCall& method_call); + void CallMethodFromMME(u32 method, u32 method_argument); void FlushMMEInlineDraw(); @@ -1376,6 +1406,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } @@ -1384,17 +1416,16 @@ public: const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override; - /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than - /// we've seen used. - using MacroMemory = std::array<u32, 0x40000>; + bool ShouldExecute() const { + return execute_on; + } - /// Gets a reference to macro memory. - const MacroMemory& GetMacroMemory() const { - return macro_memory; + VideoCore::RasterizerInterface& Rasterizer() { + return *rasterizer; } - bool ShouldExecute() const { - return execute_on; + const VideoCore::RasterizerInterface& Rasterizer() const { + return *rasterizer; } /// Notify a memory write has happened. @@ -1430,27 +1461,31 @@ public: private: void InitializeRegisterDefaults(); - Core::System& system; + void ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call); - VideoCore::RasterizerInterface& rasterizer; + u32 ProcessShadowRam(u32 method, u32 argument); + void ProcessDirtyRegisters(u32 method, u32 argument); + + void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call); + + Core::System& system; MemoryManager& memory_manager; + VideoCore::RasterizerInterface* rasterizer = nullptr; + /// Start offsets of each macro in macro_memory std::array<u32, 0x80> macro_positions = {}; std::array<bool, Regs::NUM_REGS> mme_inline{}; - /// Memory for macro code - MacroMemory macro_memory; - /// Macro method that is currently being executed / being fed parameters. u32 executing_macro = 0; /// Parameters that have been submitted to the macro call so far. std::vector<u32> macro_params; /// Interpreter for the macro codes uploaded to the GPU. - MacroInterpreter macro_interpreter; + std::unique_ptr<MacroEngine> macro_engine; static constexpr u32 null_cb_data = 0xFFFFFFFF; struct { @@ -1479,7 +1514,7 @@ private: * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ - void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters); + void CallMacroMethod(u32 method, const std::vector<u32>& parameters); /// Handles writes to the macro uploading register. void ProcessMacroUpload(u32 data); @@ -1511,6 +1546,7 @@ private: /// Handles a write to the CB_DATA[i] register. void StartCBData(u32 method); void ProcessCBData(u32 value); + void ProcessCBMultiData(u32 method, const u32* start_base, u32 amount); void FinishCBData(); /// Handles a write to the CB_BIND register. 
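
Macro dispatch in CallMacroMethod maps a trigger method back to a slot in macro_positions with ((method - 0xE00) >> 1) modulo the table size: even method numbers select a macro, and the odd ones in between feed its arguments. A worked example of that mapping (MacroEntry is a stand-in name for the expression used inline in the engine):

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t MacroRegistersStart = 0xE00;

// Maps a macro trigger method to an index into the bound-macro table
// (start offsets of each uploaded macro), as computed by CallMacroMethod.
constexpr std::size_t MacroEntry(std::uint32_t method, std::size_t num_entries) {
    return ((method - MacroRegistersStart) >> 1) % num_entries;
}

int main() {
    constexpr std::size_t num_entries = 0x80; // Size of macro_positions in the engine.
    // Method 0xE00 selects macro slot 0, 0xE02 selects slot 1, and so on;
    // the odd methods in between (0xE01, 0xE03, ...) carry the macro's parameters.
    std::printf("0xE00 -> %zu\n", MacroEntry(0xE00, num_entries));
    std::printf("0xE02 -> %zu\n", MacroEntry(0xE02, num_entries));
    std::printf("0xE24 -> %zu\n", MacroEntry(0xE24, num_entries));
}
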
@@ -1530,6 +1566,7 @@ private: static_assert(offsetof(Maxwell3D::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") +ASSERT_REG_POSITION(wait_for_idle, 0x44); ASSERT_REG_POSITION(macros, 0x45); ASSERT_REG_POSITION(shadow_ram_control, 0x49); ASSERT_REG_POSITION(upload, 0x60); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index c2610f992..8fa359d0a 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -14,43 +14,45 @@ namespace Tegra::Engines { +using namespace Texture; + MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager) : system{system}, memory_manager{memory_manager} {} -void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { - ASSERT_MSG(method_call.method < Regs::NUM_REGS, - "Invalid MaxwellDMA register, increase the size of the Regs structure"); - - regs.reg_array[method_call.method] = method_call.argument; +void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) { + ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register"); -#define MAXWELLDMA_REG_INDEX(field_name) \ - (offsetof(Tegra::Engines::MaxwellDMA::Regs, field_name) / sizeof(u32)) + regs.reg_array[method] = method_argument; - switch (method_call.method) { - case MAXWELLDMA_REG_INDEX(exec): { - HandleCopy(); - break; - } + if (method == offsetof(Regs, launch_dma) / sizeof(u32)) { + Launch(); } - -#undef MAXWELLDMA_REG_INDEX } -void MaxwellDMA::HandleCopy() { - LOG_TRACE(HW_GPU, "Requested a DMA copy"); +void MaxwellDMA::CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) { + for (size_t i = 0; i < amount; ++i) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } +} - const GPUVAddr source = regs.src_address.Address(); - const GPUVAddr dest = regs.dst_address.Address(); +void MaxwellDMA::Launch() { + LOG_TRACE(Render_OpenGL, "DMA copy 0x{:x} -> 0x{:x}", static_cast<GPUVAddr>(regs.offset_in), + static_cast<GPUVAddr>(regs.offset_out)); // TODO(Subv): Perform more research and implement all features of this engine. - ASSERT(regs.exec.enable_swizzle == 0); - ASSERT(regs.exec.query_mode == Regs::QueryMode::None); - ASSERT(regs.exec.query_intr == Regs::QueryIntr::None); - ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2); - ASSERT(regs.dst_params.pos_x == 0); - ASSERT(regs.dst_params.pos_y == 0); - - if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) { + const LaunchDMA& launch = regs.launch_dma; + ASSERT(launch.remap_enable == 0); + ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE); + ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); + ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); + ASSERT(regs.dst_params.origin.x == 0); + ASSERT(regs.dst_params.origin.y == 0); + + const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; + const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; + + if (!is_src_pitch && !is_dst_pitch) { // If both the source and the destination are in block layout, assert. UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented"); return; @@ -59,99 +61,154 @@ void MaxwellDMA::HandleCopy() { // All copies here update the main memory, so mark all rasterizer states as invalid. 
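
The rewritten MaxwellDMA kicks off a copy when the launch_dma register is written; the trigger index is derived from the field offset as offsetof(Regs, launch_dma) / sizeof(u32), word 0xC0 per the ASSERT_REG_POSITION further below. A reduced sketch of that trigger, with the register block trimmed down to just the relevant field:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Simplified register block: only enough padding to place launch_dma at word 0xC0.
struct Regs {
    std::uint32_t reserved[0xC0];
    std::uint32_t launch_dma;
};

// Register methods are word indices into the register block, so the trigger
// index follows from the field offset, as in MaxwellDMA::CallMethod.
constexpr std::uint32_t LAUNCH_DMA_INDEX = offsetof(Regs, launch_dma) / sizeof(std::uint32_t);

void CallMethod(std::uint32_t method, std::uint32_t argument) {
    // The real engine first stores the argument into regs.reg_array[method].
    if (method == LAUNCH_DMA_INDEX) {
        std::printf("launch_dma written with 0x%x -> start the copy\n",
                    static_cast<unsigned>(argument));
    }
}

int main() {
    CallMethod(LAUNCH_DMA_INDEX, 0x1);
}
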
system.GPU().Maxwell3D().OnMemoryWrite(); - if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { - // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D - // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count, - // y_count). - if (!regs.exec.enable_2d) { - memory_manager.CopyBlock(dest, source, regs.x_count); - return; - } + if (is_src_pitch && is_dst_pitch) { + CopyPitchToPitch(); + } else { + ASSERT(launch.multi_line_enable == 1); - // If both the source and the destination are in linear layout, perform a line-by-line - // copy. We're going to take a subrect of size (x_count, y_count) from the source - // rectangle. There is no need to manually flush/invalidate the regions because - // CopyBlock does that for us. - for (u32 line = 0; line < regs.y_count; ++line) { - const GPUVAddr source_line = source + line * regs.src_pitch; - const GPUVAddr dest_line = dest + line * regs.dst_pitch; - memory_manager.CopyBlock(dest_line, source_line, regs.x_count); + if (!is_src_pitch && is_dst_pitch) { + CopyBlockLinearToPitch(); + } else { + CopyPitchToBlockLinear(); } + } +} + +void MaxwellDMA::CopyPitchToPitch() { + // When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D + // buffer of length `line_length_in`. + // Otherwise we copy a 2D image of dimensions (line_length_in, line_count). + if (!regs.launch_dma.multi_line_enable) { + memory_manager.CopyBlock(regs.offset_out, regs.offset_in, regs.line_length_in); return; } - ASSERT(regs.exec.enable_2d == 1); + // Perform a line-by-line copy. + // We're going to take a subrect of size (line_length_in, line_count) from the source rectangle. + // There is no need to manually flush/invalidate the regions because CopyBlock does that for us. + for (u32 line = 0; line < regs.line_count; ++line) { + const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in; + const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out; + memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in); + } +} - if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { - ASSERT(regs.src_params.BlockDepth() == 0); - // If the input is tiled and the output is linear, deswizzle the input and copy it over. - const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count; - const std::size_t src_size = Texture::CalculateSize( - true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, - regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); +void MaxwellDMA::CopyBlockLinearToPitch() { + UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); + UNIMPLEMENTED_IF(regs.src_params.layer != 0); - const std::size_t src_layer_size = Texture::CalculateSize( - true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1, - regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); + // Optimized path for micro copies. + const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; + if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X) { + FastCopyBlockLinearToPitch(); + return; + } - const std::size_t dst_size = regs.dst_pitch * regs.y_count; + // Deswizzle the input and copy it over. 
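
Launch then selects one of four copy routines from the source and destination memory layouts, and only the tiled-to-tiled combination remains unimplemented. A condensed sketch of that dispatch with the copy bodies stubbed out:

#include <cstdio>

enum class MemoryLayout { BLOCKLINEAR, PITCH };

void CopyPitchToPitch()       { std::puts("pitch -> pitch"); }
void CopyBlockLinearToPitch() { std::puts("block linear -> pitch"); }
void CopyPitchToBlockLinear() { std::puts("pitch -> block linear"); }

// Mirrors the layout dispatch in MaxwellDMA::Launch (copy bodies stubbed out here).
void Dispatch(MemoryLayout src, MemoryLayout dst) {
    const bool is_src_pitch = src == MemoryLayout::PITCH;
    const bool is_dst_pitch = dst == MemoryLayout::PITCH;
    if (!is_src_pitch && !is_dst_pitch) {
        std::puts("tiled -> tiled: not implemented"); // UNREACHABLE_MSG in the real engine.
        return;
    }
    if (is_src_pitch && is_dst_pitch) {
        CopyPitchToPitch();
    } else if (!is_src_pitch && is_dst_pitch) {
        CopyBlockLinearToPitch();
    } else {
        CopyPitchToBlockLinear();
    }
}

int main() {
    Dispatch(MemoryLayout::PITCH, MemoryLayout::PITCH);
    Dispatch(MemoryLayout::BLOCKLINEAR, MemoryLayout::PITCH);
    Dispatch(MemoryLayout::PITCH, MemoryLayout::BLOCKLINEAR);
}
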
+ const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in; + const Parameters& src_params = regs.src_params; + const u32 width = src_params.width; + const u32 height = src_params.height; + const u32 depth = src_params.depth; + const u32 block_height = src_params.block_size.height; + const u32 block_depth = src_params.block_size.depth; + const size_t src_size = + CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); + + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } - if (read_buffer.size() < src_size) { - read_buffer.resize(src_size); - } + memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); + memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); - if (write_buffer.size() < dst_size) { - write_buffer.resize(dst_size); - } + UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel, + block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(), + read_buffer.data()); - memory_manager.ReadBlock(source, read_buffer.data(), src_size); - memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); + memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +} - Texture::UnswizzleSubrect( - regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel, - read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(), - regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y); +void MaxwellDMA::CopyPitchToBlockLinear() { + const auto& dst_params = regs.dst_params; + const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in; + const u32 width = dst_params.width; + const u32 height = dst_params.height; + const u32 depth = dst_params.depth; + const u32 block_height = dst_params.block_size.height; + const u32 block_depth = dst_params.block_size.depth; + const size_t dst_size = + CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); + const size_t dst_layer_size = + CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); + + const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count; + + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } - memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); + if (Settings::IsGPULevelExtreme()) { + memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); + memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); } else { - ASSERT(regs.dst_params.BlockDepth() == 0); - - const u32 bytes_per_pixel = regs.src_pitch / regs.x_count; - - const std::size_t dst_size = Texture::CalculateSize( - true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, - regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); - - const std::size_t dst_layer_size = Texture::CalculateSize( - true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1, - regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); + memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size); + memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); + } - const std::size_t src_size = regs.src_pitch * regs.y_count; + // If the input is linear and the output is tiled, swizzle the input and 
copy it over. + if (regs.dst_params.block_size.depth > 0) { + ASSERT(dst_params.layer == 0); + SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height, + bytes_per_pixel, block_height, block_depth, dst_params.origin.x, + dst_params.origin.y, write_buffer.data(), read_buffer.data()); + } else { + SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel, + write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(), + block_height, dst_params.origin.x, dst_params.origin.y); + } - if (read_buffer.size() < src_size) { - read_buffer.resize(src_size); - } + memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +} - if (write_buffer.size() < dst_size) { - write_buffer.resize(dst_size); - } +void MaxwellDMA::FastCopyBlockLinearToPitch() { + const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in; + const size_t src_size = GOB_SIZE; + const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; + u32 pos_x = regs.src_params.origin.x; + u32 pos_y = regs.src_params.origin.y; + const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y, + regs.src_params.block_size.height, bytes_per_pixel); + const u32 x_in_gob = 64 / bytes_per_pixel; + pos_x = pos_x % x_in_gob; + pos_y = pos_y % 8; + + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } - if (Settings::values.use_accurate_gpu_emulation) { - memory_manager.ReadBlock(source, read_buffer.data(), src_size); - memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); - } else { - memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size); - memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size); - } + if (Settings::IsGPULevelExtreme()) { + memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size); + memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); + } else { + memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size); + memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); + } - // If the input is linear and the output is tiled, swizzle the input and copy it over. - Texture::SwizzleSubrect( - regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel, - write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(), - regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y); + UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width, + bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y, + write_buffer.data(), read_buffer.data()); - memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); - } + memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); } } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 4f40d1d1f..50f445efc 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -10,6 +10,7 @@ #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/engines/engine_interface.h" #include "video_core/gpu.h" namespace Core { @@ -23,156 +24,190 @@ class MemoryManager; namespace Tegra::Engines { /** - * This Engine is known as GK104_Copy. 
Documentation can be found in: + * This engine is known as gk104_copy. Documentation can be found in: + * https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml */ -class MaxwellDMA final { +class MaxwellDMA final : public EngineInterface { public: - explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager); - ~MaxwellDMA() = default; - - /// Write the value to the register identified by method. - void CallMethod(const GPU::MethodCall& method_call); + struct PackedGPUVAddr { + u32 upper; + u32 lower; + + constexpr operator GPUVAddr() const noexcept { + return (static_cast<GPUVAddr>(upper & 0xff) << 32) | lower; + } + }; + + union BlockSize { + BitField<0, 4, u32> width; + BitField<4, 4, u32> height; + BitField<8, 4, u32> depth; + BitField<12, 4, u32> gob_height; + }; + static_assert(sizeof(BlockSize) == 4); + + union Origin { + BitField<0, 16, u32> x; + BitField<16, 16, u32> y; + }; + static_assert(sizeof(Origin) == 4); + + struct Parameters { + BlockSize block_size; + u32 width; + u32 height; + u32 depth; + u32 layer; + Origin origin; + }; + static_assert(sizeof(Parameters) == 24); + + struct Semaphore { + PackedGPUVAddr address; + u32 payload; + }; + static_assert(sizeof(Semaphore) == 12); + + struct RenderEnable { + enum class Mode : u32 { + FALSE = 0, + TRUE = 1, + CONDITIONAL = 2, + RENDER_IF_EQUAL = 3, + RENDER_IF_NOT_EQUAL = 4, + }; - struct Regs { - static constexpr std::size_t NUM_REGS = 0x1D6; + PackedGPUVAddr address; + BitField<0, 3, Mode> mode; + }; + static_assert(sizeof(RenderEnable) == 12); + + enum class PhysModeTarget : u32 { + LOCAL_FB = 0, + COHERENT_SYSMEM = 1, + NONCOHERENT_SYSMEM = 2, + }; + using PhysMode = BitField<0, 2, PhysModeTarget>; + + union LaunchDMA { + enum class DataTransferType : u32 { + NONE = 0, + PIPELINED = 1, + NON_PIPELINED = 2, + }; - struct Parameters { - union { - BitField<0, 4, u32> block_depth; - BitField<4, 4, u32> block_height; - BitField<8, 4, u32> block_width; - }; - u32 size_x; - u32 size_y; - u32 size_z; - u32 pos_z; - union { - BitField<0, 16, u32> pos_x; - BitField<16, 16, u32> pos_y; - }; + enum class SemaphoreType : u32 { + NONE = 0, + RELEASE_ONE_WORD_SEMAPHORE = 1, + RELEASE_FOUR_WORD_SEMAPHORE = 2, + }; - u32 BlockHeight() const { - return block_height.Value(); - } + enum class InterruptType : u32 { + NONE = 0, + BLOCKING = 1, + NON_BLOCKING = 2, + }; - u32 BlockDepth() const { - return block_depth.Value(); - } + enum class MemoryLayout : u32 { + BLOCKLINEAR = 0, + PITCH = 1, }; - static_assert(sizeof(Parameters) == 24, "Parameters has wrong size"); + enum class Type : u32 { + VIRTUAL = 0, + PHYSICAL = 1, + }; - enum class ComponentMode : u32 { - Src0 = 0, - Src1 = 1, - Src2 = 2, - Src3 = 3, - Const0 = 4, - Const1 = 5, - Zero = 6, + enum class SemaphoreReduction : u32 { + IMIN = 0, + IMAX = 1, + IXOR = 2, + IAND = 3, + IOR = 4, + IADD = 5, + INC = 6, + DEC = 7, + FADD = 0xA, }; - enum class CopyMode : u32 { - None = 0, - Unk1 = 1, - Unk2 = 2, + enum class SemaphoreReductionSign : u32 { + SIGNED = 0, + UNSIGNED = 1, }; - enum class QueryMode : u32 { - None = 0, - Short = 1, - Long = 2, + enum class BypassL2 : u32 { + USE_PTE_SETTING = 0, + FORCE_VOLATILE = 1, }; - enum class QueryIntr : u32 { - None = 0, - Block = 1, - NonBlock = 2, + BitField<0, 2, DataTransferType> data_transfer_type; + BitField<2, 1, u32> flush_enable; + BitField<3, 2, SemaphoreType> semaphore_type; + BitField<5, 2, InterruptType> interrupt_type; + 
BitField<7, 1, MemoryLayout> src_memory_layout; + BitField<8, 1, MemoryLayout> dst_memory_layout; + BitField<9, 1, u32> multi_line_enable; + BitField<10, 1, u32> remap_enable; + BitField<11, 1, u32> rmwdisable; + BitField<12, 1, Type> src_type; + BitField<13, 1, Type> dst_type; + BitField<14, 4, SemaphoreReduction> semaphore_reduction; + BitField<18, 1, SemaphoreReductionSign> semaphore_reduction_sign; + BitField<19, 1, u32> reduction_enable; + BitField<20, 1, BypassL2> bypass_l2; + }; + static_assert(sizeof(LaunchDMA) == 4); + + struct RemapConst { + enum Swizzle : u32 { + SRC_X = 0, + SRC_Y = 1, + SRC_Z = 2, + SRC_W = 3, + CONST_A = 4, + CONST_B = 5, + NO_WRITE = 6, }; + PackedGPUVAddr address; + union { - struct { - INSERT_UNION_PADDING_WORDS(0xC0); - - struct { - union { - BitField<0, 2, CopyMode> copy_mode; - BitField<2, 1, u32> flush; - - BitField<3, 2, QueryMode> query_mode; - BitField<5, 2, QueryIntr> query_intr; - - BitField<7, 1, u32> is_src_linear; - BitField<8, 1, u32> is_dst_linear; - - BitField<9, 1, u32> enable_2d; - BitField<10, 1, u32> enable_swizzle; - }; - } exec; - - INSERT_UNION_PADDING_WORDS(0x3F); - - struct { - u32 address_high; - u32 address_low; - - GPUVAddr Address() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - } src_address; - - struct { - u32 address_high; - u32 address_low; - - GPUVAddr Address() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - } dst_address; - - u32 src_pitch; - u32 dst_pitch; - u32 x_count; - u32 y_count; - - INSERT_UNION_PADDING_WORDS(0xB8); - - u32 const0; - u32 const1; - union { - BitField<0, 4, ComponentMode> component0; - BitField<4, 4, ComponentMode> component1; - BitField<8, 4, ComponentMode> component2; - BitField<12, 4, ComponentMode> component3; - BitField<16, 2, u32> component_size; - BitField<20, 3, u32> src_num_components; - BitField<24, 3, u32> dst_num_components; - - u32 SrcBytePerPixel() const { - return src_num_components.Value() * component_size.Value(); - } - u32 DstBytePerPixel() const { - return dst_num_components.Value() * component_size.Value(); - } - } swizzle_config; + BitField<0, 3, Swizzle> dst_x; + BitField<4, 3, Swizzle> dst_y; + BitField<8, 3, Swizzle> dst_z; + BitField<12, 3, Swizzle> dst_w; + BitField<16, 2, u32> component_size_minus_one; + BitField<20, 2, u32> num_src_components_minus_one; + BitField<24, 2, u32> num_dst_components_minus_one; + }; + }; + static_assert(sizeof(RemapConst) == 12); - Parameters dst_params; + explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager); + ~MaxwellDMA() = default; - INSERT_UNION_PADDING_WORDS(1); + /// Write the value to the register identified by method. + void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; - Parameters src_params; - - INSERT_UNION_PADDING_WORDS(0x13); - }; - std::array<u32, NUM_REGS> reg_array; - }; - } regs{}; + /// Write multiple values to the register identified by method. + void CallMultiMethod(u32 method, const u32* base_start, u32 amount, + u32 methods_pending) override; private: + /// Performs the copy from the source buffer to the destination buffer as configured in the + /// registers. 
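
PackedGPUVAddr above splits a GPU virtual address across two 32-bit registers; the conversion keeps the low 8 bits of the upper word and splices them above the lower word, yielding a 40-bit address. A small worked example of that conversion, reusing the same operator in a standalone translation unit:

#include <cstdint>
#include <cstdio>

using GPUVAddr = std::uint64_t;

struct PackedGPUVAddr {
    std::uint32_t upper;
    std::uint32_t lower;

    // Same combination as the engine's conversion operator: 8 upper bits + 32 lower bits.
    constexpr operator GPUVAddr() const noexcept {
        return (static_cast<GPUVAddr>(upper & 0xff) << 32) | lower;
    }
};

int main() {
    constexpr PackedGPUVAddr addr{0x12, 0x3456'7890};
    constexpr GPUVAddr flat = addr; // 0x12'3456'7890
    static_assert(flat == 0x12'3456'7890ULL);
    std::printf("0x%llx\n", static_cast<unsigned long long>(flat));
}
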
+ void Launch(); + + void CopyPitchToPitch(); + + void CopyBlockLinearToPitch(); + + void CopyPitchToBlockLinear(); + + void FastCopyBlockLinearToPitch(); + Core::System& system; MemoryManager& memory_manager; @@ -180,28 +215,58 @@ private: std::vector<u8> read_buffer; std::vector<u8> write_buffer; - /// Performs the copy from the source buffer to the destination buffer as configured in the - /// registers. - void HandleCopy(); -}; + static constexpr std::size_t NUM_REGS = 0x800; + struct Regs { + union { + struct { + u32 reserved[0x40]; + u32 nop; + u32 reserved01[0xf]; + u32 pm_trigger; + u32 reserved02[0x3f]; + Semaphore semaphore; + u32 reserved03[0x2]; + RenderEnable render_enable; + PhysMode src_phys_mode; + PhysMode dst_phys_mode; + u32 reserved04[0x26]; + LaunchDMA launch_dma; + u32 reserved05[0x3f]; + PackedGPUVAddr offset_in; + PackedGPUVAddr offset_out; + u32 pitch_in; + u32 pitch_out; + u32 line_length_in; + u32 line_count; + u32 reserved06[0xb8]; + RemapConst remap_const; + Parameters dst_params; + u32 reserved07[0x1]; + Parameters src_params; + u32 reserved08[0x275]; + u32 pm_trigger_end; + u32 reserved09[0x3ba]; + }; + std::array<u32, NUM_REGS> reg_array; + }; + } regs{}; #define ASSERT_REG_POSITION(field_name, position) \ static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") -ASSERT_REG_POSITION(exec, 0xC0); -ASSERT_REG_POSITION(src_address, 0x100); -ASSERT_REG_POSITION(dst_address, 0x102); -ASSERT_REG_POSITION(src_pitch, 0x104); -ASSERT_REG_POSITION(dst_pitch, 0x105); -ASSERT_REG_POSITION(x_count, 0x106); -ASSERT_REG_POSITION(y_count, 0x107); -ASSERT_REG_POSITION(const0, 0x1C0); -ASSERT_REG_POSITION(const1, 0x1C1); -ASSERT_REG_POSITION(swizzle_config, 0x1C2); -ASSERT_REG_POSITION(dst_params, 0x1C3); -ASSERT_REG_POSITION(src_params, 0x1CA); + ASSERT_REG_POSITION(launch_dma, 0xC0); + ASSERT_REG_POSITION(offset_in, 0x100); + ASSERT_REG_POSITION(offset_out, 0x102); + ASSERT_REG_POSITION(pitch_in, 0x104); + ASSERT_REG_POSITION(pitch_out, 0x105); + ASSERT_REG_POSITION(line_length_in, 0x106); + ASSERT_REG_POSITION(line_count, 0x107); + ASSERT_REG_POSITION(remap_const, 0x1C0); + ASSERT_REG_POSITION(dst_params, 0x1C3); + ASSERT_REG_POSITION(src_params, 0x1CA); #undef ASSERT_REG_POSITION +}; } // namespace Tegra::Engines diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 5e9cfba22..37d17efdc 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -32,31 +32,31 @@ struct Register { constexpr Register() = default; - constexpr Register(u64 value) : value(value) {} + constexpr Register(u64 value_) : value(value_) {} - constexpr operator u64() const { + [[nodiscard]] constexpr operator u64() const { return value; } template <typename T> - constexpr u64 operator-(const T& oth) const { + [[nodiscard]] constexpr u64 operator-(const T& oth) const { return value - oth; } template <typename T> - constexpr u64 operator&(const T& oth) const { + [[nodiscard]] constexpr u64 operator&(const T& oth) const { return value & oth; } - constexpr u64 operator&(const Register& oth) const { + [[nodiscard]] constexpr u64 operator&(const Register& oth) const { return value & oth.value; } - constexpr u64 operator~() const { + [[nodiscard]] constexpr u64 operator~() const { return ~value; } - u64 GetSwizzledIndex(u64 elem) const { + [[nodiscard]] u64 GetSwizzledIndex(u64 elem) const { elem = (value + elem) & 3; return (value & ~3) + elem; } @@ -75,7 
+75,7 @@ enum class AttributeSize : u64 { union Attribute { Attribute() = default; - constexpr explicit Attribute(u64 value) : value(value) {} + constexpr explicit Attribute(u64 value_) : value(value_) {} enum class Index : u64 { LayerViewportPointSize = 6, @@ -107,7 +107,7 @@ union Attribute { BitField<31, 1, u64> patch; BitField<47, 3, AttributeSize> size; - bool IsPhysical() const { + [[nodiscard]] bool IsPhysical() const { return patch == 0 && element == 0 && static_cast<u64>(index.Value()) == 0; } } fmt20; @@ -124,7 +124,7 @@ union Attribute { union Sampler { Sampler() = default; - constexpr explicit Sampler(u64 value) : value(value) {} + constexpr explicit Sampler(u64 value_) : value(value_) {} enum class Index : u64 { Sampler_0 = 8, @@ -137,7 +137,7 @@ union Sampler { union Image { Image() = default; - constexpr explicit Image(u64 value) : value{value} {} + constexpr explicit Image(u64 value_) : value{value_} {} BitField<36, 13, u64> index; u64 value; @@ -168,18 +168,22 @@ enum class Pred : u64 { }; enum class PredCondition : u64 { - LessThan = 1, - Equal = 2, - LessEqual = 3, - GreaterThan = 4, - NotEqual = 5, - GreaterEqual = 6, - LessThanWithNan = 9, - LessEqualWithNan = 11, - GreaterThanWithNan = 12, - NotEqualWithNan = 13, - GreaterEqualWithNan = 14, - // TODO(Subv): Other condition types + F = 0, // Always false + LT = 1, // Ordered less than + EQ = 2, // Ordered equal + LE = 3, // Ordered less than or equal + GT = 4, // Ordered greater than + NE = 5, // Ordered not equal + GE = 6, // Ordered greater than or equal + NUM = 7, // Ordered + NAN_ = 8, // Unordered + LTU = 9, // Unordered less than + EQU = 10, // Unordered equal + LEU = 11, // Unordered less than or equal + GTU = 12, // Unordered greater than + NEU = 13, // Unordered not equal + GEU = 14, // Unordered greater than or equal + T = 15, // Always true }; enum class PredOperation : u64 { @@ -501,14 +505,14 @@ struct IpaMode { IpaInterpMode interpolation_mode; IpaSampleMode sampling_mode; - bool operator==(const IpaMode& a) const { + [[nodiscard]] bool operator==(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) == std::tie(a.interpolation_mode, a.sampling_mode); } - bool operator!=(const IpaMode& a) const { + [[nodiscard]] bool operator!=(const IpaMode& a) const { return !operator==(a); } - bool operator<(const IpaMode& a) const { + [[nodiscard]] bool operator<(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) < std::tie(a.interpolation_mode, a.sampling_mode); } @@ -654,7 +658,12 @@ union Instruction { return *this; } - constexpr Instruction(u64 value) : value{value} {} + constexpr Instruction(u64 value_) : value{value_} {} + constexpr Instruction(const Instruction& instr) : value(instr.value) {} + + [[nodiscard]] constexpr bool Bit(u64 offset) const { + return ((value >> offset) & 1) != 0; + } BitField<0, 8, Register> gpr0; BitField<8, 8, Register> gpr8; @@ -737,34 +746,34 @@ union Instruction { BitField<28, 8, u64> imm_lut28; BitField<48, 8, u64> imm_lut48; - u32 GetImmLut28() const { + [[nodiscard]] u32 GetImmLut28() const { return static_cast<u32>(imm_lut28); } - u32 GetImmLut48() const { + [[nodiscard]] u32 GetImmLut48() const { return static_cast<u32>(imm_lut48); } } lop3; - u16 GetImm20_16() const { + [[nodiscard]] u16 GetImm20_16() const { return static_cast<u16>(imm20_16); } - u32 GetImm20_19() const { + [[nodiscard]] u32 GetImm20_19() const { u32 imm{static_cast<u32>(imm20_19)}; imm <<= 12; imm |= negate_imm ? 
0x80000000 : 0; return imm; } - u32 GetImm20_32() const { + [[nodiscard]] u32 GetImm20_32() const { return static_cast<u32>(imm20_32); } - s32 GetSignedImm20_20() const { - u32 immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); + [[nodiscard]] s32 GetSignedImm20_20() const { + const auto immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); // Sign extend the 20-bit value. - u32 mask = 1U << (20 - 1); + const auto mask = 1U << (20 - 1); return static_cast<s32>((immediate ^ mask) - mask); } } alu; @@ -813,15 +822,17 @@ union Instruction { } alu_integer; union { + BitField<43, 1, u64> x; + } iadd; + + union { BitField<39, 1, u64> ftz; BitField<32, 1, u64> saturate; BitField<49, 2, HalfMerge> merge; - BitField<43, 1, u64> negate_a; BitField<44, 1, u64> abs_a; BitField<47, 2, HalfType> type_a; - BitField<31, 1, u64> negate_b; BitField<30, 1, u64> abs_b; BitField<28, 2, HalfType> type_b; @@ -846,7 +857,7 @@ union Instruction { BitField<56, 1, u64> second_negate; BitField<30, 9, u64> second; - u32 PackImmediates() const { + [[nodiscard]] u32 PackImmediates() const { // Immediates are half floats shifted. constexpr u32 imm_shift = 6; return static_cast<u32>((first << imm_shift) | (second << (16 + imm_shift))); @@ -1022,7 +1033,7 @@ union Instruction { BitField<28, 2, AtomicType> type; BitField<30, 22, s64> offset; - s32 GetImmediateOffset() const { + [[nodiscard]] s32 GetImmediateOffset() const { return static_cast<s32>(offset << 2); } } atoms; @@ -1204,7 +1215,7 @@ union Instruction { BitField<39, 4, u64> rounding; // H0, H1 extract for F16 missing BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value - F2fRoundingOp GetRoundingMode() const { + [[nodiscard]] F2fRoundingOp GetRoundingMode() const { constexpr u64 rounding_mask = 0x0B; return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask); } @@ -1228,15 +1239,15 @@ union Instruction { BitField<54, 1, u64> aoffi_flag; BitField<55, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1260,15 +1271,15 @@ union Instruction { BitField<36, 1, u64> aoffi_flag; BitField<37, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1288,7 +1299,7 @@ union Instruction { BitField<31, 4, u64> component_mask; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NODEP: return nodep_flag != 0; @@ -1298,7 +1309,7 @@ union Instruction { return false; } - bool 
IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } } txq; @@ -1310,11 +1321,11 @@ union Instruction { BitField<35, 1, u64> ndv_flag; BitField<49, 1, u64> nodep_flag; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return (ndv_flag != 0); @@ -1336,7 +1347,7 @@ union Instruction { BitField<54, 2, u64> offset_mode; BitField<56, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1362,7 +1373,7 @@ union Instruction { BitField<33, 2, u64> offset_mode; BitField<37, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1388,7 +1399,7 @@ union Instruction { BitField<52, 2, u64> component; BitField<55, 1, u64> fp16_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1411,16 +1422,20 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TEXS instruction has a weird encoding for the texture type. - if (texture_info == 0) + if (texture_info == 0) { return TextureType::Texture1D; - if (texture_info >= 1 && texture_info <= 9) + } + if (texture_info >= 1 && texture_info <= 9) { return TextureType::Texture2D; - if (texture_info >= 10 && texture_info <= 11) + } + if (texture_info >= 10 && texture_info <= 11) { return TextureType::Texture3D; - if (texture_info >= 12 && texture_info <= 13) + } + if (texture_info >= 12 && texture_info <= 13) { return TextureType::TextureCube; + } LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", static_cast<u32>(texture_info.Value())); @@ -1428,7 +1443,7 @@ union Instruction { return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { switch (texture_info) { case 0: case 2: @@ -1447,7 +1462,7 @@ union Instruction { return TextureProcessMode::None; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return (texture_info >= 4 && texture_info <= 6) || texture_info == 9; @@ -1459,16 +1474,16 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. 
return texture_info >= 7 && texture_info <= 9; } - bool HasTwoDestinations() const { + [[nodiscard]] bool HasTwoDestinations() const { return gpr28.Value() != Register::ZeroIndex; } - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{ {}, {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc}, @@ -1495,7 +1510,7 @@ union Instruction { BitField<54, 1, u64> cl; BitField<55, 1, u64> process_mode; - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL; } } tld; @@ -1505,9 +1520,9 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TLDS instruction has a weird encoding for the texture type. - if (texture_info >= 0 && texture_info <= 1) { + if (texture_info <= 1) { return TextureType::Texture1D; } if (texture_info == 2 || texture_info == 8 || texture_info == 12 || @@ -1524,13 +1539,14 @@ union Instruction { return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { - if (texture_info == 1 || texture_info == 5 || texture_info == 12) + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { + if (texture_info == 1 || texture_info == 5 || texture_info == 12) { return TextureProcessMode::LL; + } return TextureProcessMode::LZ; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return texture_info == 12 || texture_info == 4; @@ -1544,7 +1560,7 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. return texture_info == 8; } @@ -1556,7 +1572,7 @@ union Instruction { BitField<35, 1, u64> aoffi_flag; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return aoffi_flag != 0; @@ -1580,7 +1596,7 @@ union Instruction { BitField<20, 3, StoreType> store_data_layout; BitField<20, 4, u64> component_mask_selector; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { ASSERT(mode == SurfaceDataMode::P); constexpr u8 R = 0b0001; constexpr u8 G = 0b0010; @@ -1593,7 +1609,7 @@ union Instruction { return std::bitset<4>{mask.at(component_mask_selector)}.test(component); } - StoreType GetStoreDataLayout() const { + [[nodiscard]] StoreType GetStoreDataLayout() const { ASSERT(mode == SurfaceDataMode::D_BA); return store_data_layout; } @@ -1611,14 +1627,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchTarget() const { + [[nodiscard]] s32 GetBranchTarget() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. 
- return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } bra; @@ -1626,14 +1643,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchExtend() const { + [[nodiscard]] s32 GetBranchExtend() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. - return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } brx; @@ -1686,7 +1704,7 @@ union Instruction { BitField<50, 1, u64> is_op_b_register; BitField<51, 3, VmnmxOperation> operation; - VmnmxType SourceFormatA() const { + [[nodiscard]] VmnmxType SourceFormatA() const { switch (src_format_a) { case 0b11: return VmnmxType::Bits32; @@ -1697,7 +1715,7 @@ union Instruction { } } - VmnmxType SourceFormatB() const { + [[nodiscard]] VmnmxType SourceFormatB() const { switch (src_format_b) { case 0b11: return VmnmxType::Bits32; @@ -1728,7 +1746,7 @@ union Instruction { BitField<20, 14, u64> shifted_offset; BitField<34, 5, u64> index; - u64 GetOffset() const { + [[nodiscard]] u64 GetOffset() const { return shifted_offset * 4; } } cbuf34; @@ -1737,7 +1755,7 @@ union Instruction { BitField<20, 16, s64> offset; BitField<36, 5, u64> index; - s64 GetOffset() const { + [[nodiscard]] s64 GetOffset() const { return offset; } } cbuf36; @@ -1867,7 +1885,9 @@ public: HSETP2_C, HSETP2_R, HSETP2_IMM, + HSET2_C, HSET2_R, + HSET2_IMM, POPC_C, POPC_R, POPC_IMM, @@ -1880,6 +1900,7 @@ public: ICMP_IMM, FCMP_RR, FCMP_RC, + FCMP_IMMR, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -1983,29 +2004,29 @@ public: /// Returns whether an opcode has an execution predicate field or not (ie, whether it can be /// conditionally executed). - static bool IsPredicatedInstruction(Id opcode) { + [[nodiscard]] static bool IsPredicatedInstruction(Id opcode) { // TODO(Subv): Add the rest of unpredicated instructions. return opcode != Id::SSY && opcode != Id::PBK; } class Matcher { public: - constexpr Matcher(const char* const name, u16 mask, u16 expected, Id id, Type type) - : name{name}, mask{mask}, expected{expected}, id{id}, type{type} {} + constexpr Matcher(const char* const name_, u16 mask_, u16 expected_, Id id_, Type type_) + : name{name_}, mask{mask_}, expected{expected_}, id{id_}, type{type_} {} - constexpr const char* GetName() const { + [[nodiscard]] constexpr const char* GetName() const { return name; } - constexpr u16 GetMask() const { + [[nodiscard]] constexpr u16 GetMask() const { return mask; } - constexpr Id GetId() const { + [[nodiscard]] constexpr Id GetId() const { return id; } - constexpr Type GetType() const { + [[nodiscard]] constexpr Type GetType() const { return type; } @@ -2014,7 +2035,7 @@ public: * @param instruction The instruction to test * @returns true if the given instruction matches. 
*/ - constexpr bool Matches(u16 instruction) const { + [[nodiscard]] constexpr bool Matches(u16 instruction) const { return (instruction & mask) == expected; } @@ -2026,7 +2047,8 @@ public: Type type; }; - static std::optional<std::reference_wrapper<const Matcher>> Decode(Instruction instr) { + using DecodeResult = std::optional<std::reference_wrapper<const Matcher>>; + [[nodiscard]] static DecodeResult Decode(Instruction instr) { static const auto table{GetDecodeTable()}; const auto matches_instruction = [instr](const auto& matcher) { @@ -2048,7 +2070,7 @@ private: * A '0' in a bitstring indicates that a zero must be present at that bit position. * A '1' in a bitstring indicates that a one must be present at that bit position. */ - static constexpr auto GetMaskAndExpect(const char* const bitstring) { + [[nodiscard]] static constexpr auto GetMaskAndExpect(const char* const bitstring) { u16 mask = 0, expect = 0; for (std::size_t i = 0; i < opcode_bitsize; i++) { const std::size_t bit_position = opcode_bitsize - i - 1; @@ -2070,14 +2092,14 @@ private: public: /// Creates a matcher that can match and parse instructions based on bitstring. - static constexpr auto GetMatcher(const char* const bitstring, Id op, Type type, - const char* const name) { + [[nodiscard]] static constexpr auto GetMatcher(const char* const bitstring, Id op, + Type type, const char* const name) { const auto [mask, expected] = GetMaskAndExpect(bitstring); return Matcher(name, mask, expected, op, type); } }; - static std::vector<Matcher> GetDecodeTable() { + [[nodiscard]] static std::vector<Matcher> GetDecodeTable() { std::vector<Matcher> table = { #define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name) INST("111000110011----", Id::KIL, Type::Flow, "KIL"), @@ -2187,9 +2209,12 @@ private: INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"), INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"), INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"), + INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"), INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), + INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"), INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"), INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"), + INST("0011011-1010----", Id::FCMP_IMMR, Type::Arithmetic, "FCMP_IMMR"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h index 72e2a33d5..ceec05459 100644 --- a/src/video_core/engines/shader_header.h +++ b/src/video_core/engines/shader_header.h @@ -41,30 +41,30 @@ struct Header { BitField<26, 1, u32> does_load_or_store; BitField<27, 1, u32> does_fp64; BitField<28, 4, u32> stream_out_mask; - } common0{}; + } common0; union { BitField<0, 24, u32> shader_local_memory_low_size; BitField<24, 8, u32> per_patch_attribute_count; - } common1{}; + } common1; union { BitField<0, 24, u32> shader_local_memory_high_size; BitField<24, 8, u32> threads_per_input_primitive; - } common2{}; + } common2; union { BitField<0, 24, u32> shader_local_memory_crs_size; BitField<24, 4, OutputTopology> output_topology; BitField<28, 4, u32> reserved; - } common3{}; + } common3; union { BitField<0, 12, u32> max_output_vertices; BitField<12, 8, u32> 
store_req_start; // NOTE: not used by geometry shaders. BitField<20, 4, u32> reserved; BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders. - } common4{}; + } common4; union { struct { @@ -145,7 +145,7 @@ struct Header { } } ps; - std::array<u32, 0xF> raw{}; + std::array<u32, 0xF> raw; }; u64 GetLocalMemorySize() const { @@ -153,7 +153,6 @@ struct Header { (common2.shader_local_memory_high_size << 24)); } }; - static_assert(sizeof(Header) == 0x50, "Incorrect structure size"); } // namespace Tegra::Shader diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h new file mode 100644 index 000000000..de6991ef6 --- /dev/null +++ b/src/video_core/fence_manager.h @@ -0,0 +1,164 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <queue> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +class FenceBase { +public: + FenceBase(u32 payload, bool is_stubbed) + : address{}, payload{payload}, is_semaphore{false}, is_stubbed{is_stubbed} {} + + FenceBase(GPUVAddr address, u32 payload, bool is_stubbed) + : address{address}, payload{payload}, is_semaphore{true}, is_stubbed{is_stubbed} {} + + GPUVAddr GetAddress() const { + return address; + } + + u32 GetPayload() const { + return payload; + } + + bool IsSemaphore() const { + return is_semaphore; + } + +private: + GPUVAddr address; + u32 payload; + bool is_semaphore; + +protected: + bool is_stubbed; +}; + +template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache> +class FenceManager { +public: + void SignalSemaphore(GPUVAddr addr, u32 value) { + TryReleasePendingFences(); + const bool should_flush = ShouldFlush(); + CommitAsyncFlushes(); + TFence new_fence = CreateFence(addr, value, !should_flush); + fences.push(new_fence); + QueueFence(new_fence); + if (should_flush) { + rasterizer.FlushCommands(); + } + rasterizer.SyncGuestHost(); + } + + void SignalSyncPoint(u32 value) { + TryReleasePendingFences(); + const bool should_flush = ShouldFlush(); + CommitAsyncFlushes(); + TFence new_fence = CreateFence(value, !should_flush); + fences.push(new_fence); + QueueFence(new_fence); + if (should_flush) { + rasterizer.FlushCommands(); + } + rasterizer.SyncGuestHost(); + } + + void WaitPendingFences() { + while (!fences.empty()) { + TFence& current_fence = fences.front(); + if (ShouldWait()) { + WaitFence(current_fence); + } + PopAsyncFlushes(); + if (current_fence->IsSemaphore()) { + gpu_memory.template Write<u32>(current_fence->GetAddress(), + current_fence->GetPayload()); + } else { + gpu.IncrementSyncPoint(current_fence->GetPayload()); + } + fences.pop(); + } + } + +protected: + explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, + TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, + TQueryCache& query_cache_) + : rasterizer{rasterizer_}, gpu{gpu_}, gpu_memory{gpu.MemoryManager()}, + texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {} + + virtual ~FenceManager() = default; + + /// Creates a Sync Point Fence Interface, does not create a backend fence if 'is_stubbed' is + /// true + virtual TFence CreateFence(u32 value, bool is_stubbed) = 0; + /// Creates a Semaphore Fence Interface, does not create a backend fence if 
'is_stubbed' is true + virtual TFence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) = 0; + /// Queues a fence into the backend if the fence isn't stubbed. + virtual void QueueFence(TFence& fence) = 0; + /// Notifies that the backend fence has been signaled/reached in host GPU. + virtual bool IsFenceSignaled(TFence& fence) const = 0; + /// Waits until a fence has been signalled by the host GPU. + virtual void WaitFence(TFence& fence) = 0; + + VideoCore::RasterizerInterface& rasterizer; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + TTextureCache& texture_cache; + TTBufferCache& buffer_cache; + TQueryCache& query_cache; + +private: + void TryReleasePendingFences() { + while (!fences.empty()) { + TFence& current_fence = fences.front(); + if (ShouldWait() && !IsFenceSignaled(current_fence)) { + return; + } + PopAsyncFlushes(); + if (current_fence->IsSemaphore()) { + gpu_memory.template Write<u32>(current_fence->GetAddress(), + current_fence->GetPayload()); + } else { + gpu.IncrementSyncPoint(current_fence->GetPayload()); + } + fences.pop(); + } + } + + bool ShouldWait() const { + return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() || + query_cache.ShouldWaitAsyncFlushes(); + } + + bool ShouldFlush() const { + return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() || + query_cache.HasUncommittedFlushes(); + } + + void PopAsyncFlushes() { + texture_cache.PopAsyncFlushes(); + buffer_cache.PopAsyncFlushes(); + query_cache.PopAsyncFlushes(); + } + + void CommitAsyncFlushes() { + texture_cache.CommitAsyncFlushes(); + buffer_cache.CommitAsyncFlushes(); + query_cache.CommitAsyncFlushes(); + } + + std::queue<TFence> fences; +}; + +} // namespace VideoCommon diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 8acf2eda2..ebd149c3a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
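To make the contract of the new VideoCommon::FenceManager above concrete: a backend derives from the template, picks a pointer-like fence type, and supplies the five pure-virtual hooks (both CreateFence overloads, QueueFence, IsFenceSignaled, WaitFence). The sketch below is a minimal, hypothetical backend; ExampleFence, ExampleFenceManager and the TextureCache/BufferCache/QueryCache parameters are illustrative assumptions only, not the gl_fence_manager/vk_fence_manager code this change actually adds.

#include <memory>

#include "video_core/fence_manager.h"

// Hypothetical fence type; a real one would also hold a host sync handle (GLsync, VkFence, ...).
class ExampleFence : public VideoCommon::FenceBase {
public:
    using VideoCommon::FenceBase::FenceBase; // reuse the syncpoint and semaphore constructors
};
using ExampleFencePtr = std::shared_ptr<ExampleFence>;

// TextureCache, BufferCache and QueryCache stand in for backend caches that provide the
// ShouldWaitAsyncFlushes/HasUncommittedFlushes/CommitAsyncFlushes/PopAsyncFlushes hooks
// the template calls.
using ExampleFenceManagerBase =
    VideoCommon::FenceManager<ExampleFencePtr, TextureCache, BufferCache, QueryCache>;

class ExampleFenceManager final : public ExampleFenceManagerBase {
public:
    explicit ExampleFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
                                 TextureCache& texture_cache, BufferCache& buffer_cache,
                                 QueryCache& query_cache)
        : ExampleFenceManagerBase{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {}

protected:
    ExampleFencePtr CreateFence(u32 value, bool is_stubbed) override {
        return std::make_shared<ExampleFence>(value, is_stubbed);
    }
    ExampleFencePtr CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) override {
        return std::make_shared<ExampleFence>(addr, value, is_stubbed);
    }
    void QueueFence(ExampleFencePtr& fence) override {
        // Insert a fence into the host command stream here unless the fence is stubbed.
    }
    bool IsFenceSignaled(ExampleFencePtr& fence) const override {
        return true; // query the host sync handle here
    }
    void WaitFence(ExampleFencePtr& fence) override {
        // Block until the host sync handle signals.
    }
};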
+#include <chrono> + #include "common/assert.h" #include "common/microprofile.h" #include "core/core.h" @@ -9,6 +11,7 @@ #include "core/core_timing_util.h" #include "core/frontend/emu_window.h" #include "core/memory.h" +#include "core/settings.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_compute.h" #include "video_core/engines/kepler_memory.h" @@ -17,26 +20,36 @@ #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/renderer_base.h" +#include "video_core/shader_notify.h" #include "video_core/video_core.h" namespace Tegra { MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer_, bool is_async) - : system{system}, renderer{std::move(renderer_)}, is_async{is_async} { - auto& rasterizer{renderer->Rasterizer()}; - memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer); - dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); - maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); - fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer); - kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager); - maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager); - kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager); -} +GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) + : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, + dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, + cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_}, + maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, + fermi_2d{std::make_unique<Engines::Fermi2D>()}, + kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, + maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)}, + kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)}, + shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_} {} GPU::~GPU() = default; +void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) { + renderer = std::move(renderer_); + + VideoCore::RasterizerInterface& rasterizer = renderer->Rasterizer(); + memory_manager->BindRasterizer(rasterizer); + maxwell_3d->BindRasterizer(rasterizer); + fermi_2d->BindRasterizer(rasterizer); + kepler_compute->BindRasterizer(rasterizer); +} + Engines::Maxwell3D& GPU::Maxwell3D() { return *maxwell_3d; } @@ -65,10 +78,18 @@ DmaPusher& GPU::DmaPusher() { return *dma_pusher; } +Tegra::CDmaPusher& GPU::CDmaPusher() { + return *cdma_pusher; +} + const DmaPusher& GPU::DmaPusher() const { return *dma_pusher; } +const Tegra::CDmaPusher& GPU::CDmaPusher() const { + return *cdma_pusher; +} + void GPU::WaitFence(u32 syncpoint_id, u32 value) { // Synced GPU, is always in sync if (!is_async) { @@ -76,7 +97,7 @@ void GPU::WaitFence(u32 syncpoint_id, u32 value) { } MICROPROFILE_SCOPE(GPU_wait); std::unique_lock lock{sync_mutex}; - sync_cv.wait(lock, [=]() { return syncpoints[syncpoint_id].load() >= value; }); + sync_cv.wait(lock, [=, this] { return syncpoints[syncpoint_id].load() >= value; }); } void GPU::IncrementSyncPoint(const u32 syncpoint_id) { @@ -125,14 +146,38 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { return true; } +u64 GPU::RequestFlush(VAddr addr, 
std::size_t size) { + std::unique_lock lck{flush_request_mutex}; + const u64 fence = ++last_flush_fence; + flush_requests.emplace_back(fence, addr, size); + return fence; +} + +void GPU::TickWork() { + std::unique_lock lck{flush_request_mutex}; + while (!flush_requests.empty()) { + auto& request = flush_requests.front(); + const u64 fence = request.fence; + const VAddr addr = request.addr; + const std::size_t size = request.size; + flush_requests.pop_front(); + flush_request_mutex.unlock(); + renderer->Rasterizer().FlushRegion(addr, size); + current_flush_fence.store(fence); + flush_request_mutex.lock(); + } +} + u64 GPU::GetTicks() const { // This values were reversed engineered by fincs from NVN // The gpu clock is reported in units of 385/625 nanoseconds constexpr u64 gpu_ticks_num = 384; constexpr u64 gpu_ticks_den = 625; - const u64 cpu_ticks = system.CoreTiming().GetTicks(); - const u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count(); + u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); + if (Settings::values.use_fast_gpu_time.GetValue()) { + nanoseconds /= 256; + } const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; @@ -142,30 +187,13 @@ void GPU::FlushCommands() { renderer->Rasterizer().FlushCommands(); } -// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence -// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. -// So the values you see in docs might be multiplied by 4. -enum class BufferMethods { - BindObject = 0x0, - Nop = 0x2, - SemaphoreAddressHigh = 0x4, - SemaphoreAddressLow = 0x5, - SemaphoreSequence = 0x6, - SemaphoreTrigger = 0x7, - NotifyIntr = 0x8, - WrcacheFlush = 0x9, - Unk28 = 0xA, - UnkCacheFlush = 0xB, - RefCnt = 0x14, - SemaphoreAcquire = 0x1A, - SemaphoreRelease = 0x1B, - FenceValue = 0x1C, - FenceAction = 0x1D, - Unk78 = 0x1E, - Unk7c = 0x1F, - Yield = 0x20, - NonPullerMethods = 0x40, -}; +void GPU::SyncGuestHost() { + renderer->Rasterizer().SyncGuestHost(); +} + +void GPU::OnCommandListEnd() { + renderer->Rasterizer().ReleaseFences(); +} enum class GpuSemaphoreOperation { AcquireEqual = 0x1, @@ -180,16 +208,32 @@ void GPU::CallMethod(const MethodCall& method_call) { ASSERT(method_call.subchannel < bound_engines.size()); - if (ExecuteMethodOnEngine(method_call)) { + if (ExecuteMethodOnEngine(method_call.method)) { CallEngineMethod(method_call); } else { CallPullerMethod(method_call); } } -bool GPU::ExecuteMethodOnEngine(const MethodCall& method_call) { - const auto method = static_cast<BufferMethods>(method_call.method); - return method >= BufferMethods::NonPullerMethods; +void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); + + ASSERT(subchannel < bound_engines.size()); + + if (ExecuteMethodOnEngine(method)) { + CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); + } else { + for (std::size_t i = 0; i < amount; i++) { + CallPullerMethod( + {method, base_start[i], subchannel, methods_pending - static_cast<u32>(i)}); + } + } +} + +bool GPU::ExecuteMethodOnEngine(u32 method) { + const auto buffer_method = static_cast<BufferMethods>(method); + return buffer_method >= BufferMethods::NonPullerMethods; } void GPU::CallPullerMethod(const MethodCall& 
method_call) { @@ -209,7 +253,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::UnkCacheFlush: case BufferMethods::WrcacheFlush: case BufferMethods::FenceValue: + break; case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -250,19 +299,46 @@ void GPU::CallEngineMethod(const MethodCall& method_call) { switch (engine) { case EngineID::FERMI_TWOD_A: - fermi_2d->CallMethod(method_call); + fermi_2d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); + break; + case EngineID::MAXWELL_B: + maxwell_3d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); + break; + case EngineID::KEPLER_COMPUTE_B: + kepler_compute->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } +} + +void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + const EngineID engine = bound_engines[subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); break; case EngineID::MAXWELL_B: - maxwell_3d->CallMethod(method_call); + maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); break; case EngineID::KEPLER_COMPUTE_B: - kepler_compute->CallMethod(method_call); + kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); break; case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->CallMethod(method_call); + maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); break; case EngineID::KEPLER_INLINE_TO_MEMORY_B: - kepler_memory->CallMethod(method_call); + kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); break; default: UNIMPLEMENTED_MSG("Unimplemented engine"); @@ -273,7 +349,46 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) { // Bind the current subchannel to the desired engine id. 
LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, method_call.argument); - bound_engines[method_call.subchannel] = static_cast<EngineID>(method_call.argument); + const auto engine_id = static_cast<EngineID>(method_call.argument); + bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id); + switch (engine_id) { + case EngineID::FERMI_TWOD_A: + dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_B: + dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel); + break; + case EngineID::KEPLER_COMPUTE_B: + dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_DMA_COPY_A: + dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", static_cast<u32>(engine_id)); + } +} + +void GPU::ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", + static_cast<u32>(regs.fence_action.op.Value())); + } +} + +void GPU::ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); } void GPU::ProcessSemaphoreTriggerMethod() { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 1a2d747be..21410e125 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -13,14 +13,15 @@ #include "common/common_types.h" #include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvflinger/buffer_queue.h" +#include "video_core/cdma_pusher.h" #include "video_core/dma_pusher.h" using CacheAddr = std::uintptr_t; -inline CacheAddr ToCacheAddr(const void* host_ptr) { +[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) { return reinterpret_cast<CacheAddr>(host_ptr); } -inline u8* FromCacheAddr(CacheAddr cache_addr) { +[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) { return reinterpret_cast<u8*>(cache_addr); } @@ -33,58 +34,68 @@ class System; namespace VideoCore { class RendererBase; +class ShaderNotify; } // namespace VideoCore namespace Tegra { enum class RenderTargetFormat : u32 { NONE = 0x0, - RGBA32_FLOAT = 0xC0, - RGBA32_UINT = 0xC2, - RGBA16_UNORM = 0xC6, - RGBA16_SNORM = 0xC7, - RGBA16_UINT = 0xC9, - RGBA16_FLOAT = 0xCA, - RG32_FLOAT = 0xCB, - RG32_UINT = 0xCD, - RGBX16_FLOAT = 0xCE, - BGRA8_UNORM = 0xCF, - BGRA8_SRGB = 0xD0, - RGB10_A2_UNORM = 0xD1, - RGBA8_UNORM = 0xD5, - RGBA8_SRGB = 0xD6, - RGBA8_SNORM = 0xD7, - RGBA8_UINT = 0xD9, - RG16_UNORM = 0xDA, - RG16_SNORM = 0xDB, - RG16_SINT = 0xDC, - RG16_UINT = 0xDD, - RG16_FLOAT = 0xDE, - R11G11B10_FLOAT = 0xE0, + R32B32G32A32_FLOAT = 0xC0, + R32G32B32A32_SINT = 0xC1, + R32G32B32A32_UINT = 0xC2, + R16G16B16A16_UNORM = 0xC6, + R16G16B16A16_SNORM = 0xC7, + R16G16B16A16_SINT = 0xC8, + R16G16B16A16_UINT = 0xC9, + R16G16B16A16_FLOAT = 0xCA, + R32G32_FLOAT = 0xCB, + R32G32_SINT = 0xCC, + R32G32_UINT = 0xCD, + R16G16B16X16_FLOAT = 0xCE, + B8G8R8A8_UNORM = 0xCF, + B8G8R8A8_SRGB = 0xD0, + A2B10G10R10_UNORM = 0xD1, + A2B10G10R10_UINT = 0xD2, + A8B8G8R8_UNORM = 0xD5, + A8B8G8R8_SRGB = 0xD6, + A8B8G8R8_SNORM = 0xD7, + A8B8G8R8_SINT = 0xD8, + A8B8G8R8_UINT = 
0xD9, + R16G16_UNORM = 0xDA, + R16G16_SNORM = 0xDB, + R16G16_SINT = 0xDC, + R16G16_UINT = 0xDD, + R16G16_FLOAT = 0xDE, + B10G11R11_FLOAT = 0xE0, R32_SINT = 0xE3, R32_UINT = 0xE4, R32_FLOAT = 0xE5, - B5G6R5_UNORM = 0xE8, - BGR5A1_UNORM = 0xE9, - RG8_UNORM = 0xEA, - RG8_SNORM = 0xEB, + R5G6B5_UNORM = 0xE8, + A1R5G5B5_UNORM = 0xE9, + R8G8_UNORM = 0xEA, + R8G8_SNORM = 0xEB, + R8G8_SINT = 0xEC, + R8G8_UINT = 0xED, R16_UNORM = 0xEE, R16_SNORM = 0xEF, R16_SINT = 0xF0, R16_UINT = 0xF1, R16_FLOAT = 0xF2, R8_UNORM = 0xF3, + R8_SNORM = 0xF4, + R8_SINT = 0xF5, R8_UINT = 0xF6, }; enum class DepthFormat : u32 { - Z32_FLOAT = 0xA, - Z16_UNORM = 0x13, - S8_Z24_UNORM = 0x14, - Z24_X8_UNORM = 0x15, - Z24_S8_UNORM = 0x16, - Z24_C8_UNORM = 0x18, - Z32_S8_X24_FLOAT = 0x19, + D32_FLOAT = 0xA, + D16_UNORM = 0x13, + S8_UINT_Z24_UNORM = 0x14, + D24X8_UNORM = 0x15, + D24S8_UNORM = 0x16, + D24C8_UNORM = 0x18, + D32_FLOAT_S8X24_UINT = 0x19, }; struct CommandListHeader; @@ -95,9 +106,9 @@ class DebugContext; */ struct FramebufferConfig { enum class PixelFormat : u32 { - ABGR8 = 1, - RGB565 = 4, - BGRA8 = 5, + A8B8G8R8_UNORM = 1, + RGB565_UNORM = 4, + B8G8R8A8_UNORM = 5, }; VAddr address; @@ -132,60 +143,102 @@ class MemoryManager; class GPU { public: - explicit GPU(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer, - bool is_async); - - virtual ~GPU(); - struct MethodCall { u32 method{}; u32 argument{}; u32 subchannel{}; u32 method_count{}; - bool IsLastCall() const { - return method_count <= 1; - } - MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0) : method(method), argument(argument), subchannel(subchannel), method_count(method_count) {} + + [[nodiscard]] bool IsLastCall() const { + return method_count <= 1; + } }; + explicit GPU(Core::System& system, bool is_async, bool use_nvdec); + virtual ~GPU(); + + /// Binds a renderer to the GPU. + void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer); + /// Calls a GPU method. void CallMethod(const MethodCall& method_call); + /// Calls a GPU multivalue method. + void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending); + + /// Flush all current written commands into the host GPU for execution. void FlushCommands(); + /// Synchronizes CPU writes with Host GPU memory. + void SyncGuestHost(); + /// Signal the ending of command list. + virtual void OnCommandListEnd(); + + /// Request a host GPU memory flush from the CPU. + [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size); + + /// Obtains current flush request fence id. + [[nodiscard]] u64 CurrentFlushRequestFence() const { + return current_flush_fence.load(std::memory_order_relaxed); + } + + /// Tick pending requests within the GPU. + void TickWork(); /// Returns a reference to the Maxwell3D GPU engine. - Engines::Maxwell3D& Maxwell3D(); + [[nodiscard]] Engines::Maxwell3D& Maxwell3D(); /// Returns a const reference to the Maxwell3D GPU engine. - const Engines::Maxwell3D& Maxwell3D() const; + [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const; /// Returns a reference to the KeplerCompute GPU engine. - Engines::KeplerCompute& KeplerCompute(); + [[nodiscard]] Engines::KeplerCompute& KeplerCompute(); /// Returns a reference to the KeplerCompute GPU engine. - const Engines::KeplerCompute& KeplerCompute() const; + [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const; /// Returns a reference to the GPU memory manager. 
- Tegra::MemoryManager& MemoryManager(); + [[nodiscard]] Tegra::MemoryManager& MemoryManager(); /// Returns a const reference to the GPU memory manager. - const Tegra::MemoryManager& MemoryManager() const; + [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const; /// Returns a reference to the GPU DMA pusher. - Tegra::DmaPusher& DmaPusher(); + [[nodiscard]] Tegra::DmaPusher& DmaPusher(); - VideoCore::RendererBase& Renderer() { + /// Returns a const reference to the GPU DMA pusher. + [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const; + + /// Returns a reference to the GPU CDMA pusher. + [[nodiscard]] Tegra::CDmaPusher& CDmaPusher(); + + /// Returns a const reference to the GPU CDMA pusher. + [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const; + + /// Returns a reference to the underlying renderer. + [[nodiscard]] VideoCore::RendererBase& Renderer() { return *renderer; } - const VideoCore::RendererBase& Renderer() const { + /// Returns a const reference to the underlying renderer. + [[nodiscard]] const VideoCore::RendererBase& Renderer() const { return *renderer; } + /// Returns a reference to the shader notifier. + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { + return *shader_notify; + } + + /// Returns a const reference to the shader notifier. + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { + return *shader_notify; + } + // Waits for the GPU to finish working virtual void WaitIdle() const = 0; @@ -194,27 +247,46 @@ public: void IncrementSyncPoint(u32 syncpoint_id); - u32 GetSyncpointValue(u32 syncpoint_id) const; + [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const; void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value); - bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); - u64 GetTicks() const; + [[nodiscard]] u64 GetTicks() const; - std::unique_lock<std::mutex> LockSync() { + [[nodiscard]] std::unique_lock<std::mutex> LockSync() { return std::unique_lock{sync_mutex}; } - bool IsAsync() const { + [[nodiscard]] bool IsAsync() const { return is_async; } - /// Returns a const reference to the GPU DMA pusher. 
- const Tegra::DmaPusher& DmaPusher() const; + [[nodiscard]] bool UseNvdec() const { + return use_nvdec; + } + + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + + [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) { + FenceAction result{}; + result.op.Assign(op); + result.syncpoint_id.Assign(syncpoint_id); + return {result.raw}; + } + }; struct Regs { - static constexpr size_t NUM_REGS = 0x100; + static constexpr size_t NUM_REGS = 0x40; union { struct { @@ -223,7 +295,7 @@ public: u32 address_high; u32 address_low; - GPUVAddr SemaphoreAddress() const { + [[nodiscard]] GPUVAddr SemaphoreAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } @@ -233,7 +305,7 @@ public: u32 semaphore_trigger; INSERT_UNION_PADDING_WORDS(0xC); - // The puser and the puller share the reference counter, the pusher only has read + // The pusher and the puller share the reference counter, the pusher only has read // access u32 reference_count; INSERT_UNION_PADDING_WORDS(0x5); @@ -241,10 +313,7 @@ public: u32 semaphore_acquire; u32 semaphore_release; u32 fence_value; - union { - BitField<4, 4, u32> operation; - BitField<8, 8, u32> id; - } fence_action; + FenceAction fence_action; INSERT_UNION_PADDING_WORDS(0xE2); // Puller state @@ -263,9 +332,18 @@ public: /// core timing events. virtual void Start() = 0; + /// Obtain the CPU Context + virtual void ObtainContext() = 0; + + /// Release the CPU Context + virtual void ReleaseContext() = 0; + /// Push GPU command entries to be processed virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; + /// Push GPU command buffer entries to be processed + virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0; + /// Swap buffers (render frame) virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; @@ -283,6 +361,8 @@ protected: private: void ProcessBindMethod(const MethodCall& method_call); + void ProcessFenceActionMethod(); + void ProcessWaitForInterruptMethod(); void ProcessSemaphoreTriggerMethod(); void ProcessSemaphoreRelease(); void ProcessSemaphoreAcquire(); @@ -293,17 +373,22 @@ private: /// Calls a GPU engine method. void CallEngineMethod(const MethodCall& method_call); + /// Calls a GPU engine multivalue method. + void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending); + /// Determines where the method should be executed. 
- bool ExecuteMethodOnEngine(const MethodCall& method_call); + [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); protected: - std::unique_ptr<Tegra::DmaPusher> dma_pusher; Core::System& system; + std::unique_ptr<Tegra::MemoryManager> memory_manager; + std::unique_ptr<Tegra::DmaPusher> dma_pusher; + std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; std::unique_ptr<VideoCore::RendererBase> renderer; + const bool use_nvdec; private: - std::unique_ptr<Tegra::MemoryManager> memory_manager; - /// Mapping of command subchannels to their bound engine ids std::array<EngineID, 8> bound_engines = {}; /// 3D engine @@ -316,15 +401,31 @@ private: std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; /// Inline memory engine std::unique_ptr<Engines::KeplerMemory> kepler_memory; + /// Shader build notifier + std::unique_ptr<VideoCore::ShaderNotify> shader_notify; std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{}; std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; std::mutex sync_mutex; + std::mutex device_mutex; std::condition_variable sync_cv; + struct FlushRequest { + FlushRequest(u64 fence, VAddr addr, std::size_t size) + : fence{fence}, addr{addr}, size{size} {} + u64 fence; + VAddr addr; + std::size_t size; + }; + + std::list<FlushRequest> flush_requests; + std::atomic<u64> current_flush_fence{}; + u64 last_flush_fence{}; + std::mutex flush_request_mutex; + const bool is_async; }; diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index 20e73a37e..a9baaf7ef 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -10,23 +10,50 @@ namespace VideoCommon { -GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer_, - std::unique_ptr<Core::Frontend::GraphicsContext>&& context) - : GPU(system, std::move(renderer_), true), gpu_thread{system}, - cpu_context(renderer->GetRenderWindow().CreateSharedContext()), - gpu_context(std::move(context)) {} +GPUAsynch::GPUAsynch(Core::System& system, bool use_nvdec) + : GPU{system, true, use_nvdec}, gpu_thread{system} {} GPUAsynch::~GPUAsynch() = default; void GPUAsynch::Start() { + gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher); + cpu_context = renderer->GetRenderWindow().CreateSharedContext(); cpu_context->MakeCurrent(); - gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher); +} + +void GPUAsynch::ObtainContext() { + cpu_context->MakeCurrent(); +} + +void GPUAsynch::ReleaseContext() { + cpu_context->DoneCurrent(); } void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { gpu_thread.SubmitList(std::move(entries)); } +void GPUAsynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clear all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { gpu_thread.SwapBuffers(framebuffer); } @@ -52,4 +79,8 @@ void GPUAsynch::WaitIdle() const { gpu_thread.WaitIdle(); } +void GPUAsynch::OnCommandListEnd() { + 
gpu_thread.OnCommandListEnd(); +} + } // namespace VideoCommon diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index 03fd0eef0..0c0872e73 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -20,25 +20,28 @@ namespace VideoCommon { /// Implementation of GPU interface that runs the GPU asynchronously class GPUAsynch final : public Tegra::GPU { public: - explicit GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer, - std::unique_ptr<Core::Frontend::GraphicsContext>&& context); + explicit GPUAsynch(Core::System& system, bool use_nvdec); ~GPUAsynch() override; void Start() override; + void ObtainContext() override; + void ReleaseContext() override; void PushGPUEntries(Tegra::CommandList&& entries) override; + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void WaitIdle() const override; + void OnCommandListEnd() override; + protected: void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override; private: GPUThread::ThreadManager gpu_thread; std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; - std::unique_ptr<Core::Frontend::GraphicsContext> gpu_context; }; } // namespace VideoCommon diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 6f38a672a..ecf7bbdf3 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -7,14 +7,18 @@ namespace VideoCommon { -GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer, - std::unique_ptr<Core::Frontend::GraphicsContext>&& context) - : GPU(system, std::move(renderer), false), context{std::move(context)} {} +GPUSynch::GPUSynch(Core::System& system, bool use_nvdec) : GPU{system, false, use_nvdec} {} GPUSynch::~GPUSynch() = default; -void GPUSynch::Start() { - context->MakeCurrent(); +void GPUSynch::Start() {} + +void GPUSynch::ObtainContext() { + renderer->Context().MakeCurrent(); +} + +void GPUSynch::ReleaseContext() { + renderer->Context().DoneCurrent(); } void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { @@ -22,6 +26,22 @@ void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { dma_pusher->DispatchCalls(); } +void GPUSynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clears all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); + } + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { renderer->SwapBuffers(framebuffer); } diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 4a6e9a01d..9d778c71a 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -19,12 +19,14 @@ namespace VideoCommon { /// Implementation of GPU interface that runs the GPU synchronously class GPUSynch final : public Tegra::GPU { public: - explicit GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer, - std::unique_ptr<Core::Frontend::GraphicsContext>&& context); + explicit GPUSynch(Core::System& system, bool use_nvdec); ~GPUSynch() override; void 
Start() override; + void ObtainContext() override; + void ReleaseContext() override; void PushGPUEntries(Tegra::CommandList&& entries) override; + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; @@ -34,9 +36,6 @@ public: protected: void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id, [[maybe_unused]] u32 value) const override {} - -private: - std::unique_ptr<Core::Frontend::GraphicsContext> context; }; } // namespace VideoCommon diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 10cda686b..4b8f58283 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -4,8 +4,10 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "common/thread.h" #include "core/core.h" #include "core/frontend/emu_window.h" +#include "core/settings.h" #include "video_core/dma_pusher.h" #include "video_core/gpu.h" #include "video_core/gpu_thread.h" @@ -14,9 +16,14 @@ namespace VideoCommon::GPUThread { /// Runs the GPU thread -static void RunThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher, SynchState& state) { - MicroProfileOnThreadCreate("GpuThread"); +static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, + Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, + SynchState& state, Tegra::CDmaPusher& cdma_pusher) { + std::string name = "yuzu:GPU"; + MicroProfileOnThreadCreate(name.c_str()); + Common::SetCurrentThreadName(name.c_str()); + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + system.RegisterHostThread(); // Wait for first GPU command before acquiring the window context while (state.queue.Empty()) @@ -35,12 +42,20 @@ static void RunThread(VideoCore::RendererBase& renderer, Core::Frontend::Graphic if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { dma_pusher.Push(std::move(submit_list->entries)); dma_pusher.DispatchCalls(); + } else if (const auto command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { + // NVDEC + cdma_pusher.Push(std::move(command_list->entries)); + cdma_pusher.DispatchCalls(); } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) { renderer.SwapBuffers(data->framebuffer ? 
&*data->framebuffer : nullptr); + } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { + renderer.Rasterizer().ReleaseFences(); + } else if (std::holds_alternative<GPUTickCommand>(next.data)) { + system.GPU().TickWork(); } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) { renderer.Rasterizer().FlushRegion(data->addr, data->size); } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) { - renderer.Rasterizer().InvalidateRegion(data->addr, data->size); + renderer.Rasterizer().OnCPUWrite(data->addr, data->size); } else if (std::holds_alternative<EndProcessingCommand>(next.data)) { return; } else { @@ -64,30 +79,47 @@ ThreadManager::~ThreadManager() { void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher) { - thread = std::thread{RunThread, std::ref(renderer), std::ref(context), std::ref(dma_pusher), - std::ref(state)}; + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) { + thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), + std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher)); } void ThreadManager::SubmitList(Tegra::CommandList&& entries) { PushCommand(SubmitListCommand(std::move(entries))); } +void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) { + PushCommand(SubmitChCommandEntries(std::move(entries))); +} + void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt)); } void ThreadManager::FlushRegion(VAddr addr, u64 size) { - PushCommand(FlushRegionCommand(addr, size)); + if (!Settings::IsGPULevelHigh()) { + PushCommand(FlushRegionCommand(addr, size)); + return; + } + if (!Settings::IsGPULevelExtreme()) { + return; + } + if (system.Renderer().Rasterizer().MustFlushRegion(addr, size)) { + auto& gpu = system.GPU(); + u64 fence = gpu.RequestFlush(addr, size); + PushCommand(GPUTickCommand()); + while (fence > gpu.CurrentFlushRequestFence()) { + } + } } void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { - system.Renderer().Rasterizer().InvalidateRegion(addr, size); + system.Renderer().Rasterizer().OnCPUWrite(addr, size); } void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important - InvalidateRegion(addr, size); + system.Renderer().Rasterizer().OnCPUWrite(addr, size); } void ThreadManager::WaitIdle() const { @@ -95,6 +127,10 @@ void ThreadManager::WaitIdle() const { } } +void ThreadManager::OnCommandListEnd() { + PushCommand(OnCommandListEndCommand()); +} + u64 ThreadManager::PushCommand(CommandData&& command_data) { const u64 fence{++state.last_fence}; state.queue.Push(CommandDataContainer(std::move(command_data), fence)); diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index cd74ad330..32a34e3a7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -37,6 +37,14 @@ struct SubmitListCommand final { Tegra::CommandList entries; }; +/// Command to signal to the GPU thread that a cdma command list is ready for processing +struct SubmitChCommandEntries final { + explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries) + : entries{std::move(entries)} {} + + Tegra::ChCommandHeaderList entries; +}; + /// Command to signal to the GPU thread that a swap buffers is pending struct 
SwapBuffersCommand final { explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer) @@ -70,9 +78,16 @@ struct FlushAndInvalidateRegionCommand final { u64 size; }; +/// Command called within the gpu, to schedule actions after a command list end +struct OnCommandListEndCommand final {}; + +/// Command to make the gpu look into pending requests +struct GPUTickCommand final {}; + using CommandData = - std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, - InvalidateRegionCommand, FlushAndInvalidateRegionCommand>; + std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries, + SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand, + FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>; struct CommandDataContainer { CommandDataContainer() = default; @@ -102,11 +117,14 @@ public: /// Creates and starts the GPU thread. void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher); + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher); /// Push GPU command entries to be processed void SubmitList(Tegra::CommandList&& entries); + /// Push GPU CDMA command buffer entries to be processed + void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries); + /// Swap buffers (render frame) void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); @@ -122,6 +140,8 @@ public: // Wait until the gpu thread is idle. void WaitIdle() const; + void OnCommandListEnd(); + private: /// Pushes a command to be executed by the GPU thread u64 PushCommand(CommandData&& command_data); diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt new file mode 100644 index 000000000..c157724a9 --- /dev/null +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -0,0 +1,36 @@ +set(SHADER_SOURCES + opengl_present.frag + opengl_present.vert +) + +set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) +set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders) +set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE) + +set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in) +set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake) + +foreach(FILENAME IN ITEMS ${SHADER_SOURCES}) + string(REPLACE "." "_" SHADER_NAME ${FILENAME}) + set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}) + set(HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h) + add_custom_command( + OUTPUT + ${HEADER_FILE} + COMMAND + ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${HEADER_FILE} ${INPUT_FILE} + MAIN_DEPENDENCY + ${SOURCE_FILE} + DEPENDS + ${INPUT_FILE} + # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified + ) + set(SHADER_HEADERS ${SHADER_HEADERS} ${HEADER_FILE}) +endforeach() + +add_custom_target(host_shaders + DEPENDS + ${SHADER_HEADERS} + SOURCES + ${SHADER_SOURCES} +) diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake new file mode 100644 index 000000000..c0fc49768 --- /dev/null +++ b/src/video_core/host_shaders/StringShaderHeader.cmake @@ -0,0 +1,13 @@ +set(SOURCE_FILE ${CMAKE_ARGV3}) +set(HEADER_FILE ${CMAKE_ARGV4}) +set(INPUT_FILE ${CMAKE_ARGV5}) + +get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME) +string(REPLACE "." 
"_" CONTENTS_NAME ${CONTENTS_NAME}) +string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) + +file(READ ${SOURCE_FILE} CONTENTS) + +get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) +make_directory(${OUTPUT_DIR}) +configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY) diff --git a/src/video_core/host_shaders/opengl_present.frag b/src/video_core/host_shaders/opengl_present.frag new file mode 100644 index 000000000..8a4cb024b --- /dev/null +++ b/src/video_core/host_shaders/opengl_present.frag @@ -0,0 +1,10 @@ +#version 430 core + +layout (location = 0) in vec2 frag_tex_coord; +layout (location = 0) out vec4 color; + +layout (binding = 0) uniform sampler2D color_texture; + +void main() { + color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); +} diff --git a/src/video_core/host_shaders/opengl_present.vert b/src/video_core/host_shaders/opengl_present.vert new file mode 100644 index 000000000..2235d31a4 --- /dev/null +++ b/src/video_core/host_shaders/opengl_present.vert @@ -0,0 +1,24 @@ +#version 430 core + +out gl_PerVertex { + vec4 gl_Position; +}; + +layout (location = 0) in vec2 vert_position; +layout (location = 1) in vec2 vert_tex_coord; +layout (location = 0) out vec2 frag_tex_coord; + +// This is a truncated 3x3 matrix for 2D transformations: +// The upper-left 2x2 submatrix performs scaling/rotation/mirroring. +// The third column performs translation. +// The third row could be used for projection, which we don't need in 2D. It hence is assumed to +// implicitly be [0, 0, 1] +layout (location = 0) uniform mat3x2 modelview_matrix; + +void main() { + // Multiply input position by the rotscale part of the matrix and then manually translate by + // the last column. This is equivalent to using a full 3x3 matrix and expanding the vector + // to `vec3(vert_position.xy, 1.0)` + gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0); + frag_tex_coord = vert_tex_coord; +} diff --git a/src/video_core/host_shaders/source_shader.h.in b/src/video_core/host_shaders/source_shader.h.in new file mode 100644 index 000000000..ccdb0d2a9 --- /dev/null +++ b/src/video_core/host_shaders/source_shader.h.in @@ -0,0 +1,9 @@ +#pragma once + +#include <string_view> + +namespace HostShaders { + +constexpr std::string_view @CONTENTS_NAME@ = R"(@CONTENTS@)"; + +} // namespace HostShaders diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp new file mode 100644 index 000000000..cd21a2112 --- /dev/null +++ b/src/video_core/macro/macro.cpp @@ -0,0 +1,91 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <optional> +#include <boost/container_hash/hash.hpp> +#include "common/assert.h" +#include "common/logging/log.h" +#include "core/settings.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro.h" +#include "video_core/macro/macro_hle.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +namespace Tegra { + +MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d) + : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {} + +MacroEngine::~MacroEngine() = default; + +void MacroEngine::AddCode(u32 method, u32 data) { + uploaded_macro_code[method].push_back(data); +} + +void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method, + const std::vector<u32>& parameters) { + auto compiled_macro = macro_cache.find(method); + if (compiled_macro != macro_cache.end()) { + const auto& cache_info = compiled_macro->second; + if (cache_info.has_hle_program) { + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } + } else { + // Macro not compiled, check if it's uploaded and if so, compile it + std::optional<u32> mid_method; + const auto macro_code = uploaded_macro_code.find(method); + if (macro_code == uploaded_macro_code.end()) { + for (const auto& [method_base, code] : uploaded_macro_code) { + if (method >= method_base && (method - method_base) < code.size()) { + mid_method = method_base; + break; + } + } + if (!mid_method.has_value()) { + UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method); + return; + } + } + auto& cache_info = macro_cache[method]; + + if (!mid_method.has_value()) { + cache_info.lle_program = Compile(macro_code->second); + cache_info.hash = boost::hash_value(macro_code->second); + } else { + const auto& macro_cached = uploaded_macro_code[mid_method.value()]; + const auto rebased_method = method - mid_method.value(); + auto& code = uploaded_macro_code[method]; + code.resize(macro_cached.size() - rebased_method); + std::memcpy(code.data(), macro_cached.data() + rebased_method, + code.size() * sizeof(u32)); + cache_info.hash = boost::hash_value(code); + cache_info.lle_program = Compile(code); + } + + auto hle_program = hle_macros->GetHLEProgram(cache_info.hash); + if (hle_program.has_value()) { + cache_info.has_hle_program = true; + cache_info.hle_program = std::move(hle_program.value()); + cache_info.hle_program->Execute(parameters, method); + } else { + cache_info.lle_program->Execute(parameters, method); + } + } +} + +std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) { + if (Settings::values.disable_macro_jit) { + return std::make_unique<MacroInterpreter>(maxwell3d); + } +#ifdef ARCHITECTURE_x86_64 + return std::make_unique<MacroJITx64>(maxwell3d); +#else + return std::make_unique<MacroInterpreter>(maxwell3d); +#endif +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h new file mode 100644 index 000000000..31ee3440a --- /dev/null +++ b/src/video_core/macro/macro.h @@ -0,0 +1,142 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
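MacroEngine (macro.cpp above) compiles uploaded macro code lazily on first execution, caches the result per method, and transparently substitutes an HLE replacement when the code hash matches a known macro. A hypothetical call-site sketch; the engine reference, macro position and parameter values are placeholders:

// `maxwell3d`, `macro_position` and `uploaded_words` are illustrative placeholders.
std::unique_ptr<Tegra::MacroEngine> macro_engine = Tegra::GetMacroEngine(maxwell3d);

for (const u32 word : uploaded_words) {
    macro_engine->AddCode(macro_position, word); // store raw macro code, compiled on demand
}

const std::vector<u32> parameters{1, 0x10, 0};
macro_engine->Execute(maxwell3d, macro_position, parameters); // compiles and caches on first call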
+ +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +namespace Macro { +constexpr std::size_t NUM_MACRO_REGISTERS = 8; +enum class Operation : u32 { + ALU = 0, + AddImmediate = 1, + ExtractInsert = 2, + ExtractShiftLeftImmediate = 3, + ExtractShiftLeftRegister = 4, + Read = 5, + Unused = 6, // This operation doesn't seem to be a valid encoding. + Branch = 7, +}; + +enum class ALUOperation : u32 { + Add = 0, + AddWithCarry = 1, + Subtract = 2, + SubtractWithBorrow = 3, + // Operations 4-7 don't seem to be valid encodings. + Xor = 8, + Or = 9, + And = 10, + AndNot = 11, + Nand = 12 +}; + +enum class ResultOperation : u32 { + IgnoreAndFetch = 0, + Move = 1, + MoveAndSetMethod = 2, + FetchAndSend = 3, + MoveAndSend = 4, + FetchAndSetMethod = 5, + MoveAndSetMethodFetchAndSend = 6, + MoveAndSetMethodSend = 7 +}; + +enum class BranchCondition : u32 { + Zero = 0, + NotZero = 1, +}; + +union Opcode { + u32 raw; + BitField<0, 3, Operation> operation; + BitField<4, 3, ResultOperation> result_operation; + BitField<4, 1, BranchCondition> branch_condition; + // If set on a branch, then the branch doesn't have a delay slot. + BitField<5, 1, u32> branch_annul; + BitField<7, 1, u32> is_exit; + BitField<8, 3, u32> dst; + BitField<11, 3, u32> src_a; + BitField<14, 3, u32> src_b; + // The signed immediate overlaps the second source operand and the alu operation. + BitField<14, 18, s32> immediate; + + BitField<17, 5, ALUOperation> alu_operation; + + // Bitfield instructions data + BitField<17, 5, u32> bf_src_bit; + BitField<22, 5, u32> bf_size; + BitField<27, 5, u32> bf_dst_bit; + + u32 GetBitfieldMask() const { + return (1 << bf_size) - 1; + } + + s32 GetBranchTarget() const { + return static_cast<s32>(immediate * sizeof(u32)); + } +}; + +union MethodAddress { + u32 raw; + BitField<0, 12, u32> address; + BitField<12, 6, u32> increment; +}; + +} // namespace Macro + +class HLEMacro; + +class CachedMacro { +public: + virtual ~CachedMacro() = default; + /** + * Executes the macro code with the specified input parameters. + * + * @param parameters The parameters of the macro + * @param method The method to execute + */ + virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0; +}; + +class MacroEngine { +public: + explicit MacroEngine(Engines::Maxwell3D& maxwell3d); + virtual ~MacroEngine(); + + // Store the uploaded macro code to compile them when they're called. 
+ void AddCode(u32 method, u32 data); + + // Compiles the macro if its not in the cache, and executes the compiled macro + void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters); + +protected: + virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0; + +private: + struct CacheInfo { + std::unique_ptr<CachedMacro> lle_program{}; + std::unique_ptr<CachedMacro> hle_program{}; + u64 hash{}; + bool has_hle_program{}; + }; + + std::unordered_map<u32, CacheInfo> macro_cache; + std::unordered_map<u32, std::vector<u32>> uploaded_macro_code; + std::unique_ptr<HLEMacro> hle_macros; +}; + +std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d); + +} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp new file mode 100644 index 000000000..df00b57df --- /dev/null +++ b/src/video_core/macro/macro_hle.cpp @@ -0,0 +1,109 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include <vector> +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_hle.h" +#include "video_core/rasterizer_interface.h" + +namespace Tegra { + +namespace { +// HLE'd functions +void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) { + const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B); + + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0x3ffffff)); + maxwell3d.regs.vb_base_instance = parameters[5]; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.regs.vb_element_base = parameters[3]; + maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.index_array.first = parameters[4]; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.Rasterizer().Draw(true, true); + } + maxwell3d.regs.index_array.count = 0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} + +void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) { + const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + + maxwell3d.regs.vertex_buffer.first = parameters[3]; + maxwell3d.regs.vertex_buffer.count = parameters[1]; + maxwell3d.regs.vb_base_instance = parameters[4]; + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0])); + maxwell3d.mme_draw.instance_count = count; + + if (maxwell3d.ShouldExecute()) { + maxwell3d.Rasterizer().Draw(false, true); + } + maxwell3d.regs.vertex_buffer.count = 0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} + +void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) { + const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); + const u32 element_base = parameters[4]; + const u32 base_instance = parameters[5]; + maxwell3d.regs.index_array.first = parameters[3]; + maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base? 
+ maxwell3d.regs.index_array.count = parameters[1]; + maxwell3d.regs.vb_element_base = element_base; + maxwell3d.regs.vb_base_instance = base_instance; + maxwell3d.mme_draw.instance_count = instance_count; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, element_base); + maxwell3d.CallMethodFromMME(0x8e5, base_instance); + maxwell3d.regs.draw.topology.Assign( + static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0])); + if (maxwell3d.ShouldExecute()) { + maxwell3d.Rasterizer().Draw(true, true); + } + maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base? + maxwell3d.regs.index_array.count = 0; + maxwell3d.regs.vb_element_base = 0x0; + maxwell3d.regs.vb_base_instance = 0x0; + maxwell3d.mme_draw.instance_count = 0; + maxwell3d.CallMethodFromMME(0x8e3, 0x640); + maxwell3d.CallMethodFromMME(0x8e4, 0x0); + maxwell3d.CallMethodFromMME(0x8e5, 0x0); + maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined; +} +} // Anonymous namespace + +constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{ + {0x771BB18C62444DA0, &HLE_771BB18C62444DA0}, + {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD}, + {0x0217920100488FF7, &HLE_0217920100488FF7}, +}}; + +HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +HLEMacro::~HLEMacro() = default; + +std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const { + const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(), + [hash](const auto& pair) { return pair.first == hash; }); + if (it == hle_funcs.end()) { + return std::nullopt; + } + return std::make_unique<HLEMacroImpl>(maxwell3d, it->second); +} + +HLEMacroImpl::~HLEMacroImpl() = default; + +HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func) + : maxwell3d(maxwell3d), func(func) {} + +void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) { + func(maxwell3d, parameters); +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h new file mode 100644 index 000000000..37af875a0 --- /dev/null +++ b/src/video_core/macro/macro_hle.h @@ -0,0 +1,44 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
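The table above keys HLE replacements on the boost hash of a macro's uploaded code words; MacroEngine::Execute computes that hash once per macro and stores it in its cache entry. A small lookup sketch under the same assumptions (`code`, `parameters` and `method` stand in for real macro data):

// Mirrors the lookup path in MacroEngine::Execute above.
const u64 hash = boost::hash_value(code); // `code` is the uploaded macro word vector
Tegra::HLEMacro hle_macros{maxwell3d};
if (auto hle_program = hle_macros.GetHLEProgram(hash)) {
    (*hle_program)->Execute(parameters, method); // run the high-level replacement
} else {
    // No known replacement for this hash: fall back to the interpreted/JIT-compiled macro.
}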
+ +#pragma once + +#include <memory> +#include <optional> +#include <vector> +#include "common/common_types.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters); + +class HLEMacro { +public: + explicit HLEMacro(Engines::Maxwell3D& maxwell3d); + ~HLEMacro(); + + std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class HLEMacroImpl : public CachedMacro { +public: + explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func); + ~HLEMacroImpl(); + + void Execute(const std::vector<u32>& parameters, u32 method) override; + +private: + Engines::Maxwell3D& maxwell3d; + HLEFunction func; +}; + +} // namespace Tegra diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index 42031d80a..bd01fd1f2 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -1,4 +1,4 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. @@ -6,109 +6,46 @@ #include "common/logging/log.h" #include "common/microprofile.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/macro_interpreter.h" +#include "video_core/macro/macro_interpreter.h" MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -namespace { -enum class Operation : u32 { - ALU = 0, - AddImmediate = 1, - ExtractInsert = 2, - ExtractShiftLeftImmediate = 3, - ExtractShiftLeftRegister = 4, - Read = 5, - Unused = 6, // This operation doesn't seem to be a valid encoding. - Branch = 7, -}; -} // Anonymous namespace - -enum class MacroInterpreter::ALUOperation : u32 { - Add = 0, - AddWithCarry = 1, - Subtract = 2, - SubtractWithBorrow = 3, - // Operations 4-7 don't seem to be valid encodings. - Xor = 8, - Or = 9, - And = 10, - AndNot = 11, - Nand = 12 -}; - -enum class MacroInterpreter::ResultOperation : u32 { - IgnoreAndFetch = 0, - Move = 1, - MoveAndSetMethod = 2, - FetchAndSend = 3, - MoveAndSend = 4, - FetchAndSetMethod = 5, - MoveAndSetMethodFetchAndSend = 6, - MoveAndSetMethodSend = 7 -}; - -enum class MacroInterpreter::BranchCondition : u32 { - Zero = 0, - NotZero = 1, -}; - -union MacroInterpreter::Opcode { - u32 raw; - BitField<0, 3, Operation> operation; - BitField<4, 3, ResultOperation> result_operation; - BitField<4, 1, BranchCondition> branch_condition; - // If set on a branch, then the branch doesn't have a delay slot. - BitField<5, 1, u32> branch_annul; - BitField<7, 1, u32> is_exit; - BitField<8, 3, u32> dst; - BitField<11, 3, u32> src_a; - BitField<14, 3, u32> src_b; - // The signed immediate overlaps the second source operand and the alu operation. 
- BitField<14, 18, s32> immediate; - - BitField<17, 5, ALUOperation> alu_operation; - - // Bitfield instructions data - BitField<17, 5, u32> bf_src_bit; - BitField<22, 5, u32> bf_size; - BitField<27, 5, u32> bf_dst_bit; - - u32 GetBitfieldMask() const { - return (1 << bf_size) - 1; - } +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} - s32 GetBranchTarget() const { - return static_cast<s32>(immediate * sizeof(u32)); - } -}; +std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) { + return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); +} -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, + const std::vector<u32>& code) + : maxwell3d(maxwell3d), code(code) {} -void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) { +void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) { MICROPROFILE_SCOPE(MacroInterp); Reset(); registers[1] = parameters[0]; + num_parameters = parameters.size(); if (num_parameters > parameters_capacity) { parameters_capacity = num_parameters; this->parameters = std::make_unique<u32[]>(num_parameters); } - std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32)); - this->num_parameters = num_parameters; + std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32)); // Execute the code until we hit an exit condition. bool keep_executing = true; while (keep_executing) { - keep_executing = Step(offset, false); + keep_executing = Step(false); } // Assert the the macro used all the input parameters ASSERT(next_parameter_index == num_parameters); } -void MacroInterpreter::Reset() { +void MacroInterpreterImpl::Reset() { registers = {}; pc = 0; delayed_pc = {}; @@ -120,10 +57,10 @@ void MacroInterpreter::Reset() { carry_flag = false; } -bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { +bool MacroInterpreterImpl::Step(bool is_delay_slot) { u32 base_address = pc; - Opcode opcode = GetOpcode(offset); + Macro::Opcode opcode = GetOpcode(); pc += 4; // Update the program counter if we were delayed @@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { } switch (opcode.operation) { - case Operation::ALU: { + case Macro::Operation::ALU: { u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a), GetRegister(opcode.src_b)); ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::AddImmediate: { + case Macro::Operation::AddImmediate: { ProcessResult(opcode.result_operation, opcode.dst, GetRegister(opcode.src_a) + opcode.immediate); break; } - case Operation::ExtractInsert: { + case Macro::Operation::ExtractInsert: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, dst); break; } - case Operation::ExtractShiftLeftImmediate: { + case Macro::Operation::ExtractShiftLeftImmediate: { u32 dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::ExtractShiftLeftRegister: { + case Macro::Operation::ExtractShiftLeftRegister: { u32 
dst = GetRegister(opcode.src_a); u32 src = GetRegister(opcode.src_b); @@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::Read: { + case Macro::Operation::Read: { u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate); ProcessResult(opcode.result_operation, opcode.dst, result); break; } - case Operation::Branch: { + case Macro::Operation::Branch: { ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); u32 value = GetRegister(opcode.src_a); bool taken = EvaluateBranchCondition(opcode.branch_condition, value); @@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { delayed_pc = base_address + opcode.GetBranchTarget(); // Execute one more instruction due to the delay slot. - return Step(offset, true); + return Step(true); } break; } @@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { // cause an exit if it's executed inside a delay slot. if (opcode.is_exit && !is_delay_slot) { // Exit has a delay slot, execute the next instruction - Step(offset, true); + Step(true); return false; } return true; } -MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const { - const auto& macro_memory{maxwell3d.GetMacroMemory()}; - ASSERT((pc % sizeof(u32)) == 0); - ASSERT((pc + offset) < macro_memory.size() * sizeof(u32)); - return {macro_memory[offset + pc / sizeof(u32)]}; -} - -u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) { +u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) { switch (operation) { - case ALUOperation::Add: { + case Macro::ALUOperation::Add: { const u64 result{static_cast<u64>(src_a) + src_b}; carry_flag = result > 0xffffffff; return static_cast<u32>(result); } - case ALUOperation::AddWithCarry: { + case Macro::ALUOperation::AddWithCarry: { const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)}; carry_flag = result > 0xffffffff; return static_cast<u32>(result); } - case ALUOperation::Subtract: { + case Macro::ALUOperation::Subtract: { const u64 result{static_cast<u64>(src_a) - src_b}; carry_flag = result < 0x100000000; return static_cast<u32>(result); } - case ALUOperation::SubtractWithBorrow: { + case Macro::ALUOperation::SubtractWithBorrow: { const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)}; carry_flag = result < 0x100000000; return static_cast<u32>(result); } - case ALUOperation::Xor: + case Macro::ALUOperation::Xor: return src_a ^ src_b; - case ALUOperation::Or: + case Macro::ALUOperation::Or: return src_a | src_b; - case ALUOperation::And: + case Macro::ALUOperation::And: return src_a & src_b; - case ALUOperation::AndNot: + case Macro::ALUOperation::AndNot: return src_a & ~src_b; - case ALUOperation::Nand: + case Macro::ALUOperation::Nand: return ~(src_a & src_b); default: @@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) } } -void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) { +void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) { switch (operation) { - case ResultOperation::IgnoreAndFetch: + case Macro::ResultOperation::IgnoreAndFetch: // Fetch parameter and ignore result. SetRegister(reg, FetchParameter()); break; - case ResultOperation::Move: + case Macro::ResultOperation::Move: // Move result. 
SetRegister(reg, result); break; - case ResultOperation::MoveAndSetMethod: + case Macro::ResultOperation::MoveAndSetMethod: // Move result and use as Method Address. SetRegister(reg, result); SetMethodAddress(result); break; - case ResultOperation::FetchAndSend: + case Macro::ResultOperation::FetchAndSend: // Fetch parameter and send result. SetRegister(reg, FetchParameter()); Send(result); break; - case ResultOperation::MoveAndSend: + case Macro::ResultOperation::MoveAndSend: // Move and send result. SetRegister(reg, result); Send(result); break; - case ResultOperation::FetchAndSetMethod: + case Macro::ResultOperation::FetchAndSetMethod: // Fetch parameter and use result as Method Address. SetRegister(reg, FetchParameter()); SetMethodAddress(result); break; - case ResultOperation::MoveAndSetMethodFetchAndSend: + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: // Move result and use as Method Address, then fetch and send parameter. SetRegister(reg, result); SetMethodAddress(result); Send(FetchParameter()); break; - case ResultOperation::MoveAndSetMethodSend: + case Macro::ResultOperation::MoveAndSetMethodSend: // Move result and use as Method Address, then send bits 12:17 of result. SetRegister(reg, result); SetMethodAddress(result); @@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res } } -u32 MacroInterpreter::FetchParameter() { - ASSERT(next_parameter_index < num_parameters); - return parameters[next_parameter_index++]; +bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const { + switch (cond) { + case Macro::BranchCondition::Zero: + return value == 0; + case Macro::BranchCondition::NotZero: + return value != 0; + } + UNREACHABLE(); + return true; +} + +Macro::Opcode MacroInterpreterImpl::GetOpcode() const { + ASSERT((pc % sizeof(u32)) == 0); + ASSERT(pc < code.size() * sizeof(u32)); + return {code[pc / sizeof(u32)]}; } -u32 MacroInterpreter::GetRegister(u32 register_id) const { +u32 MacroInterpreterImpl::GetRegister(u32 register_id) const { return registers.at(register_id); } -void MacroInterpreter::SetRegister(u32 register_id, u32 value) { +void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) { // Register 0 is hardwired as the zero register. // Ensure no writes to it actually occur. if (register_id == 0) { @@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) { registers.at(register_id) = value; } -void MacroInterpreter::SetMethodAddress(u32 address) { +void MacroInterpreterImpl::SetMethodAddress(u32 address) { method_address.raw = address; } -void MacroInterpreter::Send(u32 value) { - maxwell3d.CallMethodFromMME({method_address.address, value}); +void MacroInterpreterImpl::Send(u32 value) { + maxwell3d.CallMethodFromMME(method_address.address, value); // Increment the method address by the method increment. 
method_address.address.Assign(method_address.address.Value() + method_address.increment.Value()); } -u32 MacroInterpreter::Read(u32 method) const { +u32 MacroInterpreterImpl::Read(u32 method) const { return maxwell3d.GetRegisterValue(method); } -bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const { - switch (cond) { - case BranchCondition::Zero: - return value == 0; - case BranchCondition::NotZero: - return value != 0; - } - UNREACHABLE(); - return true; +u32 MacroInterpreterImpl::FetchParameter() { + ASSERT(next_parameter_index < num_parameters); + return parameters[next_parameter_index++]; } } // namespace Tegra diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h index 631146d89..90217fc89 100644 --- a/src/video_core/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -1,44 +1,37 @@ -// Copyright 2018 yuzu Emulator Project +// Copyright 2020 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once - #include <array> #include <optional> - +#include <vector> #include "common/bit_field.h" #include "common/common_types.h" +#include "video_core/macro/macro.h" namespace Tegra { namespace Engines { class Maxwell3D; } -class MacroInterpreter final { +class MacroInterpreter final : public MacroEngine { public: explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); - /** - * Executes the macro code with the specified input parameters. - * @param offset Offset to start execution at. - * @param parameters The parameters of the macro. - */ - void Execute(u32 offset, std::size_t num_parameters, const u32* parameters); +protected: + std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; private: - enum class ALUOperation : u32; - enum class BranchCondition : u32; - enum class ResultOperation : u32; - - union Opcode; + Engines::Maxwell3D& maxwell3d; +}; - union MethodAddress { - u32 raw; - BitField<0, 12, u32> address; - BitField<12, 6, u32> increment; - }; +class MacroInterpreterImpl : public CachedMacro { +public: + MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); + void Execute(const std::vector<u32>& parameters, u32 method) override; +private: /// Resets the execution engine state, zeroing registers, etc. void Reset(); @@ -49,20 +42,20 @@ private: * @param is_delay_slot Whether the current step is being executed due to a delay slot in a * previous instruction. */ - bool Step(u32 offset, bool is_delay_slot); + bool Step(bool is_delay_slot); /// Calculates the result of an ALU operation. src_a OP src_b; - u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b); + u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b); /// Performs the result operation on the input result and stores it in the specified register /// (if necessary). - void ProcessResult(ResultOperation operation, u32 reg, u32 result); + void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result); /// Evaluates the branch condition and returns whether the branch should be taken or not. - bool EvaluateBranchCondition(BranchCondition cond, u32 value) const; + bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const; /// Reads an opcode at the current program counter location. - Opcode GetOpcode(u32 offset) const; + Macro::Opcode GetOpcode() const; /// Returns the specified register's value. Register 0 is hardcoded to always return 0. 
u32 GetRegister(u32 register_id) const; @@ -89,13 +82,11 @@ private: /// Program counter to execute at after the delay slot is executed. std::optional<u32> delayed_pc; - static constexpr std::size_t NumMacroRegisters = 8; - /// General purpose macro registers. - std::array<u32, NumMacroRegisters> registers = {}; + std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {}; /// Method address to use for the next Send instruction. - MethodAddress method_address = {}; + Macro::MethodAddress method_address = {}; /// Input parameters of the current macro. std::unique_ptr<u32[]> parameters; @@ -105,5 +96,7 @@ private: u32 next_parameter_index = 0; bool carry_flag = false; + const std::vector<u32>& code; }; + } // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp new file mode 100644 index 000000000..954b87515 --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -0,0 +1,620 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/microprofile.h" +#include "common/x64/xbyak_util.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/macro/macro_interpreter.h" +#include "video_core/macro/macro_jit_x64.h" + +MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47)); +MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); + +namespace Tegra { +constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx; +constexpr Xbyak::Reg32 RESULT = Xbyak::util::ebp; +constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; +constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; +constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; + +static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ + STATE, + RESULT, + PARAMETERS, + METHOD_ADDRESS, + BRANCH_HOLDER, +}); + +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) + : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} + +std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) { + return std::make_unique<MacroJITx64Impl>(maxwell3d, code); +} + +MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code) + : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { + Compile(); +} + +MacroJITx64Impl::~MacroJITx64Impl() = default; + +void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) { + MICROPROFILE_SCOPE(MacroJitExecute); + ASSERT_OR_EXECUTE(program != nullptr, { return; }); + JITState state{}; + state.maxwell3d = &maxwell3d; + state.registers = {}; + program(&state, parameters.data()); +} + +void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { + const bool is_a_zero = opcode.src_a == 0; + const bool is_b_zero = opcode.src_b == 0; + const bool valid_operation = !is_a_zero && !is_b_zero; + [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero; + const bool has_zero_register = is_a_zero || is_b_zero; + const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry || + opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow; + + Xbyak::Reg32 src_a; + Xbyak::Reg32 src_b; + + if (!optimizer.zero_reg_skip || no_zero_reg_skip) { + src_a = Compile_GetRegister(opcode.src_a, RESULT); + src_b = Compile_GetRegister(opcode.src_b, eax); + } else { + if (!is_a_zero) { + src_a = 
Compile_GetRegister(opcode.src_a, RESULT); + } + if (!is_b_zero) { + src_b = Compile_GetRegister(opcode.src_b, eax); + } + } + + bool has_emitted = false; + + switch (opcode.alu_operation) { + case Macro::ALUOperation::Add: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + add(src_a, src_b); + } + } else { + add(src_a, src_b); + } + + if (!optimizer.can_skip_carry) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::AddWithCarry: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + adc(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Subtract: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + sub(src_a, src_b); + has_emitted = true; + } + } else { + sub(src_a, src_b); + has_emitted = true; + } + if (!optimizer.can_skip_carry && has_emitted) { + setc(byte[STATE + offsetof(JITState, carry_flag)]); + } + break; + case Macro::ALUOperation::SubtractWithBorrow: + bt(dword[STATE + offsetof(JITState, carry_flag)], 0); + sbb(src_a, src_b); + setc(byte[STATE + offsetof(JITState, carry_flag)]); + break; + case Macro::ALUOperation::Xor: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + xor_(src_a, src_b); + } + } else { + xor_(src_a, src_b); + } + break; + case Macro::ALUOperation::Or: + if (optimizer.zero_reg_skip) { + if (valid_operation) { + or_(src_a, src_b); + } + } else { + or_(src_a, src_b); + } + break; + case Macro::ALUOperation::And: + if (optimizer.zero_reg_skip) { + if (!has_zero_register) { + and_(src_a, src_b); + } + } else { + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::AndNot: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + not_(src_b); + and_(src_a, src_b); + } + } else { + not_(src_b); + and_(src_a, src_b); + } + break; + case Macro::ALUOperation::Nand: + if (optimizer.zero_reg_skip) { + if (!is_a_zero) { + and_(src_a, src_b); + not_(src_a); + } + } else { + and_(src_a, src_b); + not_(src_a); + } + break; + default: + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", + static_cast<std::size_t>(opcode.alu_operation.Value())); + break; + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { + if (optimizer.skip_dummy_addimmediate) { + // Games tend to use this as an exit instruction placeholder. It's to encode an instruction + // without doing anything. In our case we can just not emit anything. 
+ if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { + return; + } + } + // Check for redundant moves + if (optimizer.optimize_for_method_move && + opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { + if (next_opcode.has_value()) { + const auto next = *next_opcode; + if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod && + opcode.dst == next.dst) { + return; + } + } + } + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { + auto dst = Compile_GetRegister(opcode.src_a, RESULT); + auto src = Compile_GetRegister(opcode.src_b, eax); + + if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) { + shr(src, opcode.bf_src_bit); + } else if (opcode.bf_src_bit == 31) { + xor_(src, src); + } + // Don't bother masking the whole register since we're using a 32 bit register + if (opcode.bf_size != 31 && opcode.bf_size != 0) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + + const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); + if (mask != 0xffffffff) { + and_(dst, mask); + } + or_(dst, src); + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { + const auto dst = Compile_GetRegister(opcode.src_a, ecx); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); + + shr(src, dst.cvt8()); + if (opcode.bf_size != 0 && opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } else if (opcode.bf_size == 0) { + xor_(src, src); + } + + if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) { + shl(src, opcode.bf_dst_bit); + } else if (opcode.bf_dst_bit == 31) { + xor_(src, src); + } + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { + const auto dst = Compile_GetRegister(opcode.src_a, ecx); + const auto src = Compile_GetRegister(opcode.src_b, RESULT); + + if (opcode.bf_src_bit != 0) { + shr(src, opcode.bf_src_bit); + } + + if (opcode.bf_size != 31) { + and_(src, opcode.GetBitfieldMask()); + } + shl(src, dst.cvt8()); + + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { + if (optimizer.zero_reg_skip && opcode.src_a == 0) { + if (opcode.immediate == 0) { + xor_(RESULT, RESULT); + } else { + mov(RESULT, opcode.immediate); + } + } else { + auto result = Compile_GetRegister(opcode.src_a, RESULT); + if (opcode.immediate > 2) { + add(result, opcode.immediate); + } else if (opcode.immediate == 1) { + inc(result); + } else if (opcode.immediate < 0) { + sub(result, opcode.immediate * -1); + } + } + + // Equivalent to Engines::Maxwell3D::GetRegisterValue: + if (optimizer.enable_asserts) { + Xbyak::Label pass_range_check; + cmp(RESULT, 
static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS)); + jb(pass_range_check); + int3(); + L(pass_range_check); + } + mov(rax, qword[STATE]); + mov(RESULT, + dword[rax + offsetof(Engines::Maxwell3D, regs) + + offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]); + + Compile_ProcessResult(opcode.result_operation, opcode.dst); +} + +static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { + maxwell3d->CallMethodFromMME(method_address.address, value); +} + +void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { + Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + mov(Common::X64::ABI_PARAM1, qword[STATE]); + mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); + mov(Common::X64::ABI_PARAM3, value); + Common::X64::CallFarFunction(*this, &Send); + Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); + + Xbyak::Label dont_process{}; + // Get increment + test(METHOD_ADDRESS, 0x3f000); + // If zero, method address doesn't update + je(dont_process); + + mov(ecx, METHOD_ADDRESS); + and_(METHOD_ADDRESS, 0xfff); + shr(ecx, 12); + and_(ecx, 0x3f); + lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); + sal(ecx, 12); + or_(eax, ecx); + + mov(METHOD_ADDRESS, eax); + + L(dont_process); +} + +void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { + ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); + const s32 jump_address = + static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32)); + + Xbyak::Label end; + auto value = Compile_GetRegister(opcode.src_a, eax); + test(value, value); + if (optimizer.has_delayed_pc) { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + jne(end, T_NEAR); + break; + case Macro::BranchCondition::NotZero: + je(end, T_NEAR); + break; + } + + if (opcode.branch_annul) { + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } else { + Xbyak::Label handle_post_exit{}; + Xbyak::Label skip{}; + jmp(skip, T_NEAR); + if (opcode.is_exit) { + L(handle_post_exit); + // Execute 1 instruction + mov(BRANCH_HOLDER, end_of_code); + // Jump to next instruction to skip delay slot check + jmp(labels[jump_address], T_NEAR); + } else { + L(handle_post_exit); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(labels[jump_address], T_NEAR); + } + L(skip); + mov(BRANCH_HOLDER, handle_post_exit); + jmp(delay_skip[pc], T_NEAR); + } + } else { + switch (opcode.branch_condition) { + case Macro::BranchCondition::Zero: + je(labels[jump_address], T_NEAR); + break; + case Macro::BranchCondition::NotZero: + jne(labels[jump_address], T_NEAR); + break; + } + } + + L(end); +} + +void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() { + optimizer.can_skip_carry = true; + optimizer.has_delayed_pc = false; + for (auto raw_op : code) { + Macro::Opcode op{}; + op.raw = raw_op; + + if (op.operation == Macro::Operation::ALU) { + // Scan for any ALU operations which actually use the carry flag, if they don't exist in + // our current code we can skip emitting the carry flag handling operations + if (op.alu_operation == Macro::ALUOperation::AddWithCarry || + op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { + optimizer.can_skip_carry = false; + } + } + + if (op.operation == Macro::Operation::Branch) { + if (!op.branch_annul) { + optimizer.has_delayed_pc = true; + } + } + } +} + +void MacroJITx64Impl::Compile() { + MICROPROFILE_SCOPE(MacroJitCompile); + labels.fill(Xbyak::Label()); + + 
Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + // JIT state + mov(STATE, Common::X64::ABI_PARAM1); + mov(PARAMETERS, Common::X64::ABI_PARAM2); + xor_(RESULT, RESULT); + xor_(METHOD_ADDRESS, METHOD_ADDRESS); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + + mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); + + // Track get register for zero registers and mark it as no-op + optimizer.zero_reg_skip = true; + + // AddImmediate tends to be used as a NOP instruction, if we detect this we can + // completely skip the entire code path and no emit anything + optimizer.skip_dummy_addimmediate = true; + + // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting + // one if our register isn't "dirty" + optimizer.optimize_for_method_move = true; + + // Enable run-time assertions in JITted code + optimizer.enable_asserts = false; + + // Check to see if we can skip emitting certain instructions + Optimizer_ScanFlags(); + + const u32 op_count = static_cast<u32>(code.size()); + for (u32 i = 0; i < op_count; i++) { + if (i < op_count - 1) { + pc = i + 1; + next_opcode = GetOpCode(); + } else { + next_opcode = {}; + } + pc = i; + Compile_NextInstruction(); + } + + L(end_of_code); + + Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); + ret(); + ready(); + program = getCode<ProgramType>(); +} + +bool MacroJITx64Impl::Compile_NextInstruction() { + const auto opcode = GetOpCode(); + if (labels[pc].getAddress()) { + return false; + } + + L(labels[pc]); + + switch (opcode.operation) { + case Macro::Operation::ALU: + Compile_ALU(opcode); + break; + case Macro::Operation::AddImmediate: + Compile_AddImmediate(opcode); + break; + case Macro::Operation::ExtractInsert: + Compile_ExtractInsert(opcode); + break; + case Macro::Operation::ExtractShiftLeftImmediate: + Compile_ExtractShiftLeftImmediate(opcode); + break; + case Macro::Operation::ExtractShiftLeftRegister: + Compile_ExtractShiftLeftRegister(opcode); + break; + case Macro::Operation::Read: + Compile_Read(opcode); + break; + case Macro::Operation::Branch: + Compile_Branch(opcode); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value()); + break; + } + + if (optimizer.has_delayed_pc) { + if (opcode.is_exit) { + mov(rax, end_of_code); + test(BRANCH_HOLDER, BRANCH_HOLDER); + cmove(BRANCH_HOLDER, rax); + // Jump to next instruction to skip delay slot check + je(labels[pc + 1], T_NEAR); + } else { + // TODO(ogniK): Optimize delay slot branching + Xbyak::Label no_delay_slot{}; + test(BRANCH_HOLDER, BRANCH_HOLDER); + je(no_delay_slot, T_NEAR); + mov(rax, BRANCH_HOLDER); + xor_(BRANCH_HOLDER, BRANCH_HOLDER); + jmp(rax); + L(no_delay_slot); + } + L(delay_skip[pc]); + if (opcode.is_exit) { + return false; + } + } else { + test(BRANCH_HOLDER, BRANCH_HOLDER); + jne(end_of_code, T_NEAR); + if (opcode.is_exit) { + inc(BRANCH_HOLDER); + return false; + } + } + return true; +} + +Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { + mov(eax, dword[PARAMETERS]); + add(PARAMETERS, sizeof(u32)); + return eax; +} + +Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { + if (index == 0) { + // Register 0 is always zero + xor_(dst, dst); + } else { + mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]); + } + + return dst; +} + +void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { + const auto SetRegister = 
[this](u32 reg, const Xbyak::Reg32& result) { + // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero + // register. + if (reg == 0) { + return; + } + mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result); + }; + const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); }; + + switch (operation) { + case Macro::ResultOperation::IgnoreAndFetch: + SetRegister(reg, Compile_FetchParameter()); + break; + case Macro::ResultOperation::Move: + SetRegister(reg, RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethod: + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::FetchAndSend: + // Fetch parameter and send result. + SetRegister(reg, Compile_FetchParameter()); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::MoveAndSend: + // Move and send result. + SetRegister(reg, RESULT); + Compile_Send(RESULT); + break; + case Macro::ResultOperation::FetchAndSetMethod: + // Fetch parameter and use result as Method Address. + SetRegister(reg, Compile_FetchParameter()); + SetMethodAddress(RESULT); + break; + case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: + // Move result and use as Method Address, then fetch and send parameter. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + Compile_Send(Compile_FetchParameter()); + break; + case Macro::ResultOperation::MoveAndSetMethodSend: + // Move result and use as Method Address, then send bits 12:17 of result. + SetRegister(reg, RESULT); + SetMethodAddress(RESULT); + shr(RESULT, 12); + and_(RESULT, 0b111111); + Compile_Send(RESULT); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation)); + } +} + +Macro::Opcode MacroJITx64Impl::GetOpCode() const { + ASSERT(pc < code.size()); + return {code[pc]}; +} + +std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const { + return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; +} + +} // namespace Tegra diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h new file mode 100644 index 000000000..a180e7428 --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.h @@ -0,0 +1,98 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
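Both the interpreter and the JIT decode macro instructions through the Macro::Opcode bitfields declared in macro.h above. A hand-assembled example word (illustrative only, not taken from a real game macro) and its decode:

#include "video_core/macro/macro.h"

// 0x00010A11 encodes: operation = AddImmediate, result_operation = Move,
// dst = r2, src_a = r1, immediate = 4 -- i.e. "r2 = r1 + 4".
Tegra::Macro::Opcode op;
op.raw = 0x00010A11;
// op.operation        == Tegra::Macro::Operation::AddImmediate
// op.result_operation == Tegra::Macro::ResultOperation::Move
// op.dst == 2, op.src_a == 1, op.immediate == 4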
+ +#pragma once + +#include <array> +#include <bitset> +#include <xbyak.h> +#include "common/bit_field.h" +#include "common/common_types.h" +#include "common/x64/xbyak_abi.h" +#include "video_core/macro/macro.h" + +namespace Tegra { + +namespace Engines { +class Maxwell3D; +} + +/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games +constexpr size_t MAX_CODE_SIZE = 0x10000; + +class MacroJITx64 final : public MacroEngine { +public: + explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); + +protected: + std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; + +private: + Engines::Maxwell3D& maxwell3d; +}; + +class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { +public: + MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); + ~MacroJITx64Impl(); + + void Execute(const std::vector<u32>& parameters, u32 method) override; + + void Compile_ALU(Macro::Opcode opcode); + void Compile_AddImmediate(Macro::Opcode opcode); + void Compile_ExtractInsert(Macro::Opcode opcode); + void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); + void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); + void Compile_Read(Macro::Opcode opcode); + void Compile_Branch(Macro::Opcode opcode); + +private: + void Optimizer_ScanFlags(); + + void Compile(); + bool Compile_NextInstruction(); + + Xbyak::Reg32 Compile_FetchParameter(); + Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); + + void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); + void Compile_Send(Xbyak::Reg32 value); + + Macro::Opcode GetOpCode() const; + std::bitset<32> PersistentCallerSavedRegs() const; + + struct JITState { + Engines::Maxwell3D* maxwell3d{}; + std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{}; + u32 carry_flag{}; + }; + static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); + using ProgramType = void (*)(JITState*, const u32*); + + struct OptimizerState { + bool can_skip_carry{}; + bool has_delayed_pc{}; + bool zero_reg_skip{}; + bool skip_dummy_addimmediate{}; + bool optimize_for_method_move{}; + bool enable_asserts{}; + }; + OptimizerState optimizer{}; + + std::optional<Macro::Opcode> next_opcode{}; + ProgramType program{nullptr}; + + std::array<Xbyak::Label, MAX_CODE_SIZE> labels; + std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip; + Xbyak::Label end_of_code{}; + + bool is_delay_slot{}; + u32 pc{}; + std::optional<u32> delayed_pc; + + const std::vector<u32>& code; + Engines::Maxwell3D& maxwell3d; +}; + +} // namespace Tegra diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index a3389d0d2..6e70bd362 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -4,186 +4,180 @@ #include "common/alignment.h" #include "common/assert.h" -#include "common/logging/log.h" #include "core/core.h" +#include "core/hle/kernel/memory/page_table.h" #include "core/hle/kernel/process.h" -#include "core/hle/kernel/vm_manager.h" #include "core/memory.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h" namespace Tegra { -MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer) - : rasterizer{rasterizer}, system{system} { - std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr); - std::fill(page_table.attributes.begin(), page_table.attributes.end(), - Common::PageType::Unmapped); - 
page_table.Resize(address_space_width); +MemoryManager::MemoryManager(Core::System& system_) + : system{system_}, page_table(page_table_size) {} - // Initialize the map with a single free region covering the entire managed space. - VirtualMemoryArea initial_vma; - initial_vma.size = address_space_end; - vma_map.emplace(initial_vma.base, initial_vma); +MemoryManager::~MemoryManager() = default; - UpdatePageTableForVMA(initial_vma); +void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { + rasterizer = &rasterizer_; } -MemoryManager::~MemoryManager() = default; +GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) { + u64 remaining_size{size}; + for (u64 offset{}; offset < size; offset += page_size) { + if (remaining_size < page_size) { + SetPageEntry(gpu_addr + offset, page_entry + offset, remaining_size); + } else { + SetPageEntry(gpu_addr + offset, page_entry + offset); + } + remaining_size -= page_size; + } + return gpu_addr; +} -GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) { - const u64 aligned_size{Common::AlignUp(size, page_size)}; - const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)}; +GPUVAddr MemoryManager::Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size) { + return UpdateRange(gpu_addr, cpu_addr, size); +} - AllocateMemory(gpu_addr, 0, aligned_size); +GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align) { + return Map(cpu_addr, *FindFreeRange(size, align), size); +} - return gpu_addr; +GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) { + const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true); + ASSERT(gpu_addr); + return Map(cpu_addr, *gpu_addr, size); } -GPUVAddr MemoryManager::AllocateSpace(GPUVAddr gpu_addr, u64 size, u64 align) { - const u64 aligned_size{Common::AlignUp(size, page_size)}; +void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { + if (!size) { + return; + } - AllocateMemory(gpu_addr, 0, aligned_size); + // Flush and invalidate through the GPU interface, to be asynchronous if possible. 
+ system.GPU().FlushAndInvalidateRegion(*GpuToCpuAddress(gpu_addr), size); - return gpu_addr; + UpdateRange(gpu_addr, PageEntry::State::Unmapped, size); } -GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) { - const u64 aligned_size{Common::AlignUp(size, page_size)}; - const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)}; - - MapBackingMemory(gpu_addr, system.Memory().GetPointer(cpu_addr), aligned_size, cpu_addr); - ASSERT(system.CurrentProcess() - ->VMManager() - .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped, - Kernel::MemoryAttribute::DeviceMapped) - .IsSuccess()); +std::optional<GPUVAddr> MemoryManager::AllocateFixed(GPUVAddr gpu_addr, std::size_t size) { + for (u64 offset{}; offset < size; offset += page_size) { + if (!GetPageEntry(gpu_addr + offset).IsUnmapped()) { + return std::nullopt; + } + } - return gpu_addr; + return UpdateRange(gpu_addr, PageEntry::State::Allocated, size); } -GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size) { - ASSERT((gpu_addr & page_mask) == 0); +GPUVAddr MemoryManager::Allocate(std::size_t size, std::size_t align) { + return *AllocateFixed(*FindFreeRange(size, align), size); +} - const u64 aligned_size{Common::AlignUp(size, page_size)}; +void MemoryManager::TryLockPage(PageEntry page_entry, std::size_t size) { + if (!page_entry.IsValid()) { + return; + } - MapBackingMemory(gpu_addr, system.Memory().GetPointer(cpu_addr), aligned_size, cpu_addr); ASSERT(system.CurrentProcess() - ->VMManager() - .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped, - Kernel::MemoryAttribute::DeviceMapped) + ->PageTable() + .LockForDeviceAddressSpace(page_entry.ToAddress(), size) .IsSuccess()); - return gpu_addr; } -GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { - ASSERT((gpu_addr & page_mask) == 0); - - const u64 aligned_size{Common::AlignUp(size, page_size)}; - const auto cpu_addr = GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr); - - // Flush and invalidate through the GPU interface, to be asynchronous if possible. - system.GPU().FlushAndInvalidateRegion(*cpu_addr, aligned_size); +void MemoryManager::TryUnlockPage(PageEntry page_entry, std::size_t size) { + if (!page_entry.IsValid()) { + return; + } - UnmapRange(gpu_addr, aligned_size); ASSERT(system.CurrentProcess() - ->VMManager() - .SetMemoryAttribute(cpu_addr.value(), size, Kernel::MemoryAttribute::DeviceMapped, - Kernel::MemoryAttribute::None) + ->PageTable() + .UnlockForDeviceAddressSpace(page_entry.ToAddress(), size) .IsSuccess()); +} - return gpu_addr; +PageEntry MemoryManager::GetPageEntry(GPUVAddr gpu_addr) const { + return page_table[PageEntryIndex(gpu_addr)]; } -GPUVAddr MemoryManager::FindFreeRegion(GPUVAddr region_start, u64 size) const { - // Find the first Free VMA. - const VMAHandle vma_handle{ - std::find_if(vma_map.begin(), vma_map.end(), [region_start, size](const auto& vma) { - if (vma.second.type != VirtualMemoryArea::Type::Unmapped) { - return false; - } +void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) { + // TODO(bunnei): We should lock/unlock device regions. This currently causes issues due to + // improper tracking, but should be fixed in the future. 
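// Illustrative sketch (not part of this diff) of the page-table math behind
// GetPageEntry/SetPageEntry above, using the constants from memory_manager.h below:
// 64 KiB GPU pages (page_bits = 16), a 2^24-entry flat table, and a PageEntry that
// packs cpu_addr >> 12 into 32 bits, reserving all-ones/all-but-one as sentinels.
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

namespace sketch {
constexpr std::uint64_t page_bits = 16;
constexpr std::uint64_t page_mask = (1ULL << page_bits) - 1;
constexpr std::uint64_t page_table_size = 1ULL << 24;
constexpr std::uint32_t unmapped = 0xFFFFFFFF;   // PageEntry::State::Unmapped
constexpr std::uint32_t allocated = 0xFFFFFFFE;  // PageEntry::State::Allocated

constexpr std::size_t PageEntryIndex(std::uint64_t gpu_addr) {
    return (gpu_addr >> page_bits) & (page_table_size - 1);
}

// SetPageEntry equivalent: map one GPU page by storing cpu_addr >> 12 in the table.
void SetPage(std::vector<std::uint32_t>& table, std::uint64_t gpu_addr, std::uint64_t cpu_addr) {
    table[PageEntryIndex(gpu_addr)] = static_cast<std::uint32_t>(cpu_addr >> 12);
}

// GpuToCpuAddress equivalent: rebuild the CPU address and add the in-page offset.
std::optional<std::uint64_t> GpuToCpu(const std::vector<std::uint32_t>& table,
                                      std::uint64_t gpu_addr) {
    const std::uint32_t entry = table[PageEntryIndex(gpu_addr)];
    if (entry == unmapped || entry == allocated) {
        return std::nullopt;
    }
    return (static_cast<std::uint64_t>(entry) << 12) + (gpu_addr & page_mask);
}
} // namespace sketch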
- const VAddr vma_end{vma.second.base + vma.second.size}; - return vma_end > region_start && vma_end >= region_start + size; - })}; + //// Unlock the old page + // TryUnlockPage(page_table[PageEntryIndex(gpu_addr)], size); - if (vma_handle == vma_map.end()) { - return {}; - } + //// Lock the new page + // TryLockPage(page_entry, size); - return std::max(region_start, vma_handle->second.base); + page_table[PageEntryIndex(gpu_addr)] = page_entry; } -bool MemoryManager::IsAddressValid(GPUVAddr addr) const { - return (addr >> page_bits) < page_table.pointers.size(); -} +std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address) const { + if (!align) { + align = page_size; + } else { + align = Common::AlignUp(align, page_size); + } -std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr) const { - if (!IsAddressValid(addr)) { - return {}; + u64 available_size{}; + GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start}; + while (gpu_addr + available_size < address_space_size) { + if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) { + available_size += page_size; + + if (available_size >= size) { + return gpu_addr; + } + } else { + gpu_addr += available_size + page_size; + available_size = 0; + + const auto remainder{gpu_addr % align}; + if (remainder) { + gpu_addr = (gpu_addr - remainder) + align; + } + } } - const VAddr cpu_addr{page_table.backing_addr[addr >> page_bits]}; - if (cpu_addr) { - return cpu_addr + (addr & page_mask); + return std::nullopt; +} + +std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const { + const auto page_entry{GetPageEntry(gpu_addr)}; + if (!page_entry.IsValid()) { + return std::nullopt; } - return {}; + return page_entry.ToAddress() + (gpu_addr & page_mask); } template <typename T> T MemoryManager::Read(GPUVAddr addr) const { - if (!IsAddressValid(addr)) { - return {}; - } - - const u8* page_pointer{GetPointer(addr)}; - if (page_pointer) { + if (auto page_pointer{GetPointer(addr)}; page_pointer) { // NOTE: Avoid adding any extra logic to this fast-path block T value; std::memcpy(&value, page_pointer, sizeof(T)); return value; } - switch (page_table.attributes[addr >> page_bits]) { - case Common::PageType::Unmapped: - LOG_ERROR(HW_GPU, "Unmapped Read{} @ 0x{:08X}", sizeof(T) * 8, addr); - return 0; - case Common::PageType::Memory: - ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", addr); - break; - default: - UNREACHABLE(); - } + UNREACHABLE(); + return {}; } template <typename T> void MemoryManager::Write(GPUVAddr addr, T data) { - if (!IsAddressValid(addr)) { - return; - } - - u8* page_pointer{GetPointer(addr)}; - if (page_pointer) { + if (auto page_pointer{GetPointer(addr)}; page_pointer) { // NOTE: Avoid adding any extra logic to this fast-path block std::memcpy(page_pointer, &data, sizeof(T)); return; } - switch (page_table.attributes[addr >> page_bits]) { - case Common::PageType::Unmapped: - LOG_ERROR(HW_GPU, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8, - static_cast<u32>(data), addr); - return; - case Common::PageType::Memory: - ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", addr); - break; - default: - UNREACHABLE(); - } + UNREACHABLE(); } template u8 MemoryManager::Read<u8>(GPUVAddr addr) const; @@ -195,71 +189,48 @@ template void MemoryManager::Write<u16>(GPUVAddr addr, u16 data); template void MemoryManager::Write<u32>(GPUVAddr addr, u32 data); template void 
MemoryManager::Write<u64>(GPUVAddr addr, u64 data); -u8* MemoryManager::GetPointer(GPUVAddr addr) { - if (!IsAddressValid(addr)) { +u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) { + if (!GetPageEntry(gpu_addr).IsValid()) { return {}; } - auto& memory = system.Memory(); - - const VAddr page_addr{page_table.backing_addr[addr >> page_bits]}; - - if (page_addr != 0) { - return memory.GetPointer(page_addr + (addr & page_mask)); + const auto address{GpuToCpuAddress(gpu_addr)}; + if (!address) { + return {}; } - LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr); - return {}; + return system.Memory().GetPointer(*address); } -const u8* MemoryManager::GetPointer(GPUVAddr addr) const { - if (!IsAddressValid(addr)) { +const u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) const { + if (!GetPageEntry(gpu_addr).IsValid()) { return {}; } - const auto& memory = system.Memory(); - - const VAddr page_addr{page_table.backing_addr[addr >> page_bits]}; - - if (page_addr != 0) { - return memory.GetPointer(page_addr + (addr & page_mask)); + const auto address{GpuToCpuAddress(gpu_addr)}; + if (!address) { + return {}; } - LOG_ERROR(HW_GPU, "Unknown GetPointer @ 0x{:016X}", addr); - return {}; -} - -bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t size) const { - const std::size_t inner_size = size - 1; - const GPUVAddr end = start + inner_size; - const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start)); - const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end)); - const auto range = static_cast<std::size_t>(host_ptr_end - host_ptr_start); - return range == inner_size; + return system.Memory().GetPointer(*address); } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const { +void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const { std::size_t remaining_size{size}; - std::size_t page_index{src_addr >> page_bits}; - std::size_t page_offset{src_addr & page_mask}; - - auto& memory = system.Memory(); + std::size_t page_index{gpu_src_addr >> page_bits}; + std::size_t page_offset{gpu_src_addr & page_mask}; while (remaining_size > 0) { const std::size_t copy_amount{ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; - switch (page_table.attributes[page_index]) { - case Common::PageType::Memory: { - const VAddr src_addr{page_table.backing_addr[page_index] + page_offset}; + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + const auto src_addr{*page_addr + page_offset}; + // Flush must happen on the rasterizer interface, such that memory is always synchronous // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu. 
- rasterizer.FlushRegion(src_addr, copy_amount); - memory.ReadBlockUnsafe(src_addr, dest_buffer, copy_amount); - break; - } - default: - UNREACHABLE(); + rasterizer->FlushRegion(src_addr, copy_amount); + system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount); } page_index++; @@ -269,24 +240,23 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s } } -void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, +void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, const std::size_t size) const { std::size_t remaining_size{size}; - std::size_t page_index{src_addr >> page_bits}; - std::size_t page_offset{src_addr & page_mask}; - - auto& memory = system.Memory(); + std::size_t page_index{gpu_src_addr >> page_bits}; + std::size_t page_offset{gpu_src_addr & page_mask}; while (remaining_size > 0) { const std::size_t copy_amount{ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; - const u8* page_pointer = page_table.pointers[page_index]; - if (page_pointer) { - const VAddr src_addr{page_table.backing_addr[page_index] + page_offset}; - memory.ReadBlockUnsafe(src_addr, dest_buffer, copy_amount); + + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + const auto src_addr{*page_addr + page_offset}; + system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount); } else { std::memset(dest_buffer, 0, copy_amount); } + page_index++; page_offset = 0; dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount; @@ -294,28 +264,22 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, } } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) { +void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size) { std::size_t remaining_size{size}; - std::size_t page_index{dest_addr >> page_bits}; - std::size_t page_offset{dest_addr & page_mask}; - - auto& memory = system.Memory(); + std::size_t page_index{gpu_dest_addr >> page_bits}; + std::size_t page_offset{gpu_dest_addr & page_mask}; while (remaining_size > 0) { const std::size_t copy_amount{ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; - switch (page_table.attributes[page_index]) { - case Common::PageType::Memory: { - const VAddr dest_addr{page_table.backing_addr[page_index] + page_offset}; + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + const auto dest_addr{*page_addr + page_offset}; + // Invalidate must happen on the rasterizer interface, such that memory is always // synchronous when it is written (even when in asynchronous GPU mode). 
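// Illustrative sketch (not part of this diff) of the per-page walk shared by
// ReadBlock/WriteBlock above: each iteration copies at most up to the next 64 KiB
// page boundary, translates that page through GpuToCpuAddress, and only the first
// iteration can start at a non-zero offset within a page.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

namespace sketch {
constexpr std::size_t page_bits = 16;
constexpr std::size_t page_size = std::size_t{1} << page_bits;
constexpr std::size_t page_mask = page_size - 1;

// Returns the chunk sizes a block copy of [gpu_addr, gpu_addr + size) would use.
std::vector<std::size_t> CopyAmounts(std::uint64_t gpu_addr, std::size_t size) {
    std::vector<std::size_t> amounts;
    std::size_t remaining_size = size;
    std::size_t page_offset = gpu_addr & page_mask;
    while (remaining_size > 0) {
        const std::size_t copy_amount = std::min(page_size - page_offset, remaining_size);
        amounts.push_back(copy_amount);
        page_offset = 0;              // only the first chunk can be misaligned
        remaining_size -= copy_amount;
    }
    return amounts;
}
// Example: CopyAmounts(0x1'FF00, 0x2'0000) yields {0x100, 0x1'0000, 0xFF00}.
} // namespace sketch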
- rasterizer.InvalidateRegion(dest_addr, copy_amount); - memory.WriteBlockUnsafe(dest_addr, src_buffer, copy_amount); - break; - } - default: - UNREACHABLE(); + rasterizer->InvalidateRegion(dest_addr, copy_amount); + system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount); } page_index++; @@ -325,22 +289,21 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const } } -void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, - const std::size_t size) { +void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, + std::size_t size) { std::size_t remaining_size{size}; - std::size_t page_index{dest_addr >> page_bits}; - std::size_t page_offset{dest_addr & page_mask}; - - auto& memory = system.Memory(); + std::size_t page_index{gpu_dest_addr >> page_bits}; + std::size_t page_offset{gpu_dest_addr & page_mask}; while (remaining_size > 0) { const std::size_t copy_amount{ std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; - u8* page_pointer = page_table.pointers[page_index]; - if (page_pointer) { - const VAddr dest_addr{page_table.backing_addr[page_index] + page_offset}; - memory.WriteBlockUnsafe(dest_addr, src_buffer, copy_amount); + + if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) { + const auto dest_addr{*page_addr + page_offset}; + system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount); } + page_index++; page_offset = 0; src_buffer = static_cast<const u8*>(src_buffer) + copy_amount; @@ -348,270 +311,26 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, } } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) { std::vector<u8> tmp_buffer(size); - ReadBlock(src_addr, tmp_buffer.data(), size); - WriteBlock(dest_addr, tmp_buffer.data(), size); + ReadBlock(gpu_src_addr, tmp_buffer.data(), size); + WriteBlock(gpu_dest_addr, tmp_buffer.data(), size); } -void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { +void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, + std::size_t size) { std::vector<u8> tmp_buffer(size); - ReadBlockUnsafe(src_addr, tmp_buffer.data(), size); - WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size); -} - -bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) { - const VAddr addr = page_table.backing_addr[gpu_addr >> page_bits]; - const std::size_t page = (addr & Memory::PAGE_MASK) + size; - return page <= Memory::PAGE_SIZE; -} - -void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, - VAddr backing_addr) { - LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size, - (base + size) * page_size); - - const VAddr end{base + size}; - ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}", - base + page_table.pointers.size()); - - std::fill(page_table.attributes.begin() + base, page_table.attributes.begin() + end, type); - - if (memory == nullptr) { - std::fill(page_table.pointers.begin() + base, page_table.pointers.begin() + end, memory); - std::fill(page_table.backing_addr.begin() + base, page_table.backing_addr.begin() + end, - backing_addr); - } else { - while (base != end) { - page_table.pointers[base] = memory; - page_table.backing_addr[base] = backing_addr; - - 
base += 1; - memory += page_size; - backing_addr += page_size; - } - } -} - -void MemoryManager::MapMemoryRegion(GPUVAddr base, u64 size, u8* target, VAddr backing_addr) { - ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: {:016X}", size); - ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: {:016X}", base); - MapPages(base / page_size, size / page_size, target, Common::PageType::Memory, backing_addr); -} - -void MemoryManager::UnmapRegion(GPUVAddr base, u64 size) { - ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: {:016X}", size); - ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: {:016X}", base); - MapPages(base / page_size, size / page_size, nullptr, Common::PageType::Unmapped); -} - -bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const { - ASSERT(base + size == next.base); - if (type != next.type) { - return {}; - } - if (type == VirtualMemoryArea::Type::Allocated && (offset + size != next.offset)) { - return {}; - } - if (type == VirtualMemoryArea::Type::Mapped && backing_memory + size != next.backing_memory) { - return {}; - } - return true; -} - -MemoryManager::VMAHandle MemoryManager::FindVMA(GPUVAddr target) const { - if (target >= address_space_end) { - return vma_map.end(); - } else { - return std::prev(vma_map.upper_bound(target)); - } -} - -MemoryManager::VMAIter MemoryManager::Allocate(VMAIter vma_handle) { - VirtualMemoryArea& vma{vma_handle->second}; - - vma.type = VirtualMemoryArea::Type::Allocated; - vma.backing_addr = 0; - vma.backing_memory = {}; - UpdatePageTableForVMA(vma); - - return MergeAdjacent(vma_handle); -} - -MemoryManager::VMAHandle MemoryManager::AllocateMemory(GPUVAddr target, std::size_t offset, - u64 size) { - - // This is the appropriately sized VMA that will turn into our allocation. - VMAIter vma_handle{CarveVMA(target, size)}; - VirtualMemoryArea& vma{vma_handle->second}; - - ASSERT(vma.size == size); - - vma.offset = offset; - - return Allocate(vma_handle); -} - -MemoryManager::VMAHandle MemoryManager::MapBackingMemory(GPUVAddr target, u8* memory, u64 size, - VAddr backing_addr) { - // This is the appropriately sized VMA that will turn into our allocation. - VMAIter vma_handle{CarveVMA(target, size)}; - VirtualMemoryArea& vma{vma_handle->second}; - - ASSERT(vma.size == size); - - vma.type = VirtualMemoryArea::Type::Mapped; - vma.backing_memory = memory; - vma.backing_addr = backing_addr; - UpdatePageTableForVMA(vma); - - return MergeAdjacent(vma_handle); -} - -void MemoryManager::UnmapRange(GPUVAddr target, u64 size) { - VMAIter vma{CarveVMARange(target, size)}; - const VAddr target_end{target + size}; - const VMAIter end{vma_map.end()}; - - // The comparison against the end of the range must be done using addresses since VMAs can be - // merged during this process, causing invalidation of the iterators. - while (vma != end && vma->second.base < target_end) { - // Unmapped ranges return to allocated state and can be reused - // This behavior is used by Super Mario Odyssey, Sonic Forces, and likely other games - vma = std::next(Allocate(vma)); - } - - ASSERT(FindVMA(target)->second.size >= size); -} - -MemoryManager::VMAIter MemoryManager::StripIterConstness(const VMAHandle& iter) { - // This uses a neat C++ trick to convert a const_iterator to a regular iterator, given - // non-const access to its container. 
- return vma_map.erase(iter, iter); // Erases an empty range of elements -} - -MemoryManager::VMAIter MemoryManager::CarveVMA(GPUVAddr base, u64 size) { - ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: 0x{:016X}", size); - ASSERT_MSG((base & page_mask) == 0, "non-page aligned base: 0x{:016X}", base); - - VMAIter vma_handle{StripIterConstness(FindVMA(base))}; - if (vma_handle == vma_map.end()) { - // Target address is outside the managed range - return {}; - } - - const VirtualMemoryArea& vma{vma_handle->second}; - if (vma.type == VirtualMemoryArea::Type::Mapped) { - // Region is already allocated - return vma_handle; - } - - const VAddr start_in_vma{base - vma.base}; - const VAddr end_in_vma{start_in_vma + size}; - - ASSERT_MSG(end_in_vma <= vma.size, "region size 0x{:016X} is less than required size 0x{:016X}", - vma.size, end_in_vma); - - if (end_in_vma < vma.size) { - // Split VMA at the end of the allocated region - SplitVMA(vma_handle, end_in_vma); - } - if (start_in_vma != 0) { - // Split VMA at the start of the allocated region - vma_handle = SplitVMA(vma_handle, start_in_vma); - } - - return vma_handle; -} - -MemoryManager::VMAIter MemoryManager::CarveVMARange(GPUVAddr target, u64 size) { - ASSERT_MSG((size & page_mask) == 0, "non-page aligned size: 0x{:016X}", size); - ASSERT_MSG((target & page_mask) == 0, "non-page aligned base: 0x{:016X}", target); - - const VAddr target_end{target + size}; - ASSERT(target_end >= target); - ASSERT(size > 0); - - VMAIter begin_vma{StripIterConstness(FindVMA(target))}; - const VMAIter i_end{vma_map.lower_bound(target_end)}; - if (std::any_of(begin_vma, i_end, [](const auto& entry) { - return entry.second.type == VirtualMemoryArea::Type::Unmapped; - })) { - return {}; - } - - if (target != begin_vma->second.base) { - begin_vma = SplitVMA(begin_vma, target - begin_vma->second.base); - } - - VMAIter end_vma{StripIterConstness(FindVMA(target_end))}; - if (end_vma != vma_map.end() && target_end != end_vma->second.base) { - end_vma = SplitVMA(end_vma, target_end - end_vma->second.base); - } - - return begin_vma; -} - -MemoryManager::VMAIter MemoryManager::SplitVMA(VMAIter vma_handle, u64 offset_in_vma) { - VirtualMemoryArea& old_vma{vma_handle->second}; - VirtualMemoryArea new_vma{old_vma}; // Make a copy of the VMA - - // For now, don't allow no-op VMA splits (trying to split at a boundary) because it's probably - // a bug. This restriction might be removed later. 
- ASSERT(offset_in_vma < old_vma.size); - ASSERT(offset_in_vma > 0); - - old_vma.size = offset_in_vma; - new_vma.base += offset_in_vma; - new_vma.size -= offset_in_vma; - - switch (new_vma.type) { - case VirtualMemoryArea::Type::Unmapped: - break; - case VirtualMemoryArea::Type::Allocated: - new_vma.offset += offset_in_vma; - break; - case VirtualMemoryArea::Type::Mapped: - new_vma.backing_memory += offset_in_vma; - break; - } - - ASSERT(old_vma.CanBeMergedWith(new_vma)); - - return vma_map.emplace_hint(std::next(vma_handle), new_vma.base, new_vma); -} - -MemoryManager::VMAIter MemoryManager::MergeAdjacent(VMAIter iter) { - const VMAIter next_vma{std::next(iter)}; - if (next_vma != vma_map.end() && iter->second.CanBeMergedWith(next_vma->second)) { - iter->second.size += next_vma->second.size; - vma_map.erase(next_vma); - } - - if (iter != vma_map.begin()) { - VMAIter prev_vma{std::prev(iter)}; - if (prev_vma->second.CanBeMergedWith(iter->second)) { - prev_vma->second.size += iter->second.size; - vma_map.erase(iter); - iter = prev_vma; - } - } - - return iter; + ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size); + WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size); } -void MemoryManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) { - switch (vma.type) { - case VirtualMemoryArea::Type::Unmapped: - UnmapRegion(vma.base, vma.size); - break; - case VirtualMemoryArea::Type::Allocated: - MapMemoryRegion(vma.base, vma.size, nullptr, vma.backing_addr); - break; - case VirtualMemoryArea::Type::Mapped: - MapMemoryRegion(vma.base, vma.size, vma.backing_memory, vma.backing_addr); - break; +bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const { + const auto cpu_addr{GpuToCpuAddress(gpu_addr)}; + if (!cpu_addr) { + return false; } + const std::size_t page{(*cpu_addr & Core::Memory::PAGE_MASK) + size}; + return page <= Core::Memory::PAGE_SIZE; } } // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 0d9468535..c078193d9 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -6,9 +6,9 @@ #include <map> #include <optional> +#include <vector> #include "common/common_types.h" -#include "common/page_table.h" namespace VideoCore { class RasterizerInterface; @@ -20,58 +20,70 @@ class System; namespace Tegra { -/** - * Represents a VMA in an address space. A VMA is a contiguous region of virtual addressing space - * with homogeneous attributes across its extents. In this particular implementation each VMA is - * also backed by a single host memory allocation. - */ -struct VirtualMemoryArea { - enum class Type : u8 { - Unmapped, - Allocated, - Mapped, +class PageEntry final { +public: + enum class State : u32 { + Unmapped = static_cast<u32>(-1), + Allocated = static_cast<u32>(-2), }; - /// Virtual base address of the region. - GPUVAddr base{}; - /// Size of the region. - u64 size{}; - /// Memory area mapping type. - Type type{Type::Unmapped}; - /// CPU memory mapped address corresponding to this memory area. - VAddr backing_addr{}; - /// Offset into the backing_memory the mapping starts from. - std::size_t offset{}; - /// Pointer backing this VMA. - u8* backing_memory{}; - - /// Tests if this area can be merged to the right with `next`. 
- bool CanBeMergedWith(const VirtualMemoryArea& next) const; + constexpr PageEntry() = default; + constexpr PageEntry(State state) : state{state} {} + constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {} + + [[nodiscard]] constexpr bool IsUnmapped() const { + return state == State::Unmapped; + } + + [[nodiscard]] constexpr bool IsAllocated() const { + return state == State::Allocated; + } + + [[nodiscard]] constexpr bool IsValid() const { + return !IsUnmapped() && !IsAllocated(); + } + + [[nodiscard]] constexpr VAddr ToAddress() const { + if (!IsValid()) { + return {}; + } + + return static_cast<VAddr>(state) << ShiftBits; + } + + [[nodiscard]] constexpr PageEntry operator+(u64 offset) const { + // If this is a reserved value, offsets do not apply + if (!IsValid()) { + return *this; + } + return PageEntry{(static_cast<VAddr>(state) << ShiftBits) + offset}; + } + +private: + static constexpr std::size_t ShiftBits{12}; + + State state{State::Unmapped}; }; +static_assert(sizeof(PageEntry) == 4, "PageEntry is too large"); class MemoryManager final { public: - explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer); + explicit MemoryManager(Core::System& system); ~MemoryManager(); - GPUVAddr AllocateSpace(u64 size, u64 align); - GPUVAddr AllocateSpace(GPUVAddr addr, u64 size, u64 align); - GPUVAddr MapBufferEx(VAddr cpu_addr, u64 size); - GPUVAddr MapBufferEx(VAddr cpu_addr, GPUVAddr addr, u64 size); - GPUVAddr UnmapBuffer(GPUVAddr addr, u64 size); - std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; + /// Binds a renderer to the memory manager. + void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + + [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; template <typename T> - T Read(GPUVAddr addr) const; + [[nodiscard]] T Read(GPUVAddr addr) const; template <typename T> void Write(GPUVAddr addr, T data); - u8* GetPointer(GPUVAddr addr); - const u8* GetPointer(GPUVAddr addr) const; - - /// Returns true if the block is continuous in host memory, false otherwise - bool IsBlockContinuous(GPUVAddr start, std::size_t size) const; + [[nodiscard]] u8* GetPointer(GPUVAddr addr); + [[nodiscard]] const u8* GetPointer(GPUVAddr addr) const; /** * ReadBlock and WriteBlock are full read and write operations over virtual @@ -79,9 +91,9 @@ public: * in the Host Memory counterpart. Note: This functions cause Host GPU Memory * Flushes and Invalidations, respectively to each operation. */ - void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and @@ -93,97 +105,51 @@ public: * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture * being flushed. 
*/ - void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); - - /** - * IsGranularRange checks if a gpu region can be simply read with a pointer - */ - bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size); - -private: - using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>; - using VMAHandle = VMAMap::const_iterator; - using VMAIter = VMAMap::iterator; - - bool IsAddressValid(GPUVAddr addr) const; - void MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, - VAddr backing_addr = 0); - void MapMemoryRegion(GPUVAddr base, u64 size, u8* target, VAddr backing_addr); - void UnmapRegion(GPUVAddr base, u64 size); - - /// Finds the VMA in which the given address is included in, or `vma_map.end()`. - VMAHandle FindVMA(GPUVAddr target) const; - - VMAHandle AllocateMemory(GPUVAddr target, std::size_t offset, u64 size); - - /** - * Maps an unmanaged host memory pointer at a given address. - * - * @param target The guest address to start the mapping at. - * @param memory The memory to be mapped. - * @param size Size of the mapping in bytes. - * @param backing_addr The base address of the range to back this mapping. - */ - VMAHandle MapBackingMemory(GPUVAddr target, u8* memory, u64 size, VAddr backing_addr); - - /// Unmaps a range of addresses, splitting VMAs as necessary. - void UnmapRange(GPUVAddr target, u64 size); - - /// Converts a VMAHandle to a mutable VMAIter. - VMAIter StripIterConstness(const VMAHandle& iter); - - /// Marks as the specified VMA as allocated. - VMAIter Allocate(VMAIter vma); + void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); + void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size); /** - * Carves a VMA of a specific size at the specified address by splitting Free VMAs while doing - * the appropriate error checking. + * IsGranularRange checks if a gpu region can be simply read with a pointer. */ - VMAIter CarveVMA(GPUVAddr base, u64 size); + [[nodiscard]] bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const; - /** - * Splits the edges of the given range of non-Free VMAs so that there is a VMA split at each - * end of the range. - */ - VMAIter CarveVMARange(GPUVAddr base, u64 size); - - /** - * Splits a VMA in two, at the specified offset. - * @returns the right side of the split, with the original iterator becoming the left side. - */ - VMAIter SplitVMA(VMAIter vma, u64 offset_in_vma); - - /** - * Checks for and merges the specified VMA with adjacent ones if possible. - * @returns the merged VMA or the original if no merging was possible. - */ - VMAIter MergeAdjacent(VMAIter vma); - - /// Updates the pages corresponding to this VMA so they match the VMA's attributes. - void UpdatePageTableForVMA(const VirtualMemoryArea& vma); - - /// Finds a free (unmapped region) of the specified size starting at the specified address. 
- GPUVAddr FindFreeRegion(GPUVAddr region_start, u64 size) const; + [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size); + [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); + [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size); + [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size); + [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align); + void Unmap(GPUVAddr gpu_addr, std::size_t size); private: + [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const; + void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size); + GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size); + [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address = false) const; + + void TryLockPage(PageEntry page_entry, std::size_t size); + void TryUnlockPage(PageEntry page_entry, std::size_t size); + + [[nodiscard]] static constexpr std::size_t PageEntryIndex(GPUVAddr gpu_addr) { + return (gpu_addr >> page_bits) & page_table_mask; + } + + static constexpr u64 address_space_size = 1ULL << 40; + static constexpr u64 address_space_start = 1ULL << 32; + static constexpr u64 address_space_start_low = 1ULL << 16; static constexpr u64 page_bits{16}; static constexpr u64 page_size{1 << page_bits}; static constexpr u64 page_mask{page_size - 1}; + static constexpr u64 page_table_bits{24}; + static constexpr u64 page_table_size{1 << page_table_bits}; + static constexpr u64 page_table_mask{page_table_size - 1}; - /// Address space in bits, according to Tegra X1 TRM - static constexpr u32 address_space_width{40}; - /// Start address for mapping, this is fairly arbitrary but must be non-zero. - static constexpr GPUVAddr address_space_base{0x100000}; - /// End of address space, based on address space in bits. 
- static constexpr GPUVAddr address_space_end{1ULL << address_space_width}; + Core::System& system; - Common::BackingPageTable page_table{page_bits}; - VMAMap vma_map; - VideoCore::RasterizerInterface& rasterizer; + VideoCore::RasterizerInterface* rasterizer = nullptr; - Core::System& system; + std::vector<PageEntry> page_table; }; } // namespace Tegra diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index 6d522c318..9da9fb4ff 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -41,144 +41,168 @@ static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth } static constexpr ConversionArray morton_to_linear_fns = { - MortonCopy<true, PixelFormat::ABGR8U>, - MortonCopy<true, PixelFormat::ABGR8S>, - MortonCopy<true, PixelFormat::ABGR8UI>, - MortonCopy<true, PixelFormat::B5G6R5U>, - MortonCopy<true, PixelFormat::A2B10G10R10U>, - MortonCopy<true, PixelFormat::A1B5G5R5U>, - MortonCopy<true, PixelFormat::R8U>, - MortonCopy<true, PixelFormat::R8UI>, - MortonCopy<true, PixelFormat::RGBA16F>, - MortonCopy<true, PixelFormat::RGBA16U>, - MortonCopy<true, PixelFormat::RGBA16S>, - MortonCopy<true, PixelFormat::RGBA16UI>, - MortonCopy<true, PixelFormat::R11FG11FB10F>, - MortonCopy<true, PixelFormat::RGBA32UI>, - MortonCopy<true, PixelFormat::DXT1>, - MortonCopy<true, PixelFormat::DXT23>, - MortonCopy<true, PixelFormat::DXT45>, - MortonCopy<true, PixelFormat::DXN1>, - MortonCopy<true, PixelFormat::DXN2UNORM>, - MortonCopy<true, PixelFormat::DXN2SNORM>, - MortonCopy<true, PixelFormat::BC7U>, - MortonCopy<true, PixelFormat::BC6H_UF16>, - MortonCopy<true, PixelFormat::BC6H_SF16>, - MortonCopy<true, PixelFormat::ASTC_2D_4X4>, - MortonCopy<true, PixelFormat::BGRA8>, - MortonCopy<true, PixelFormat::RGBA32F>, - MortonCopy<true, PixelFormat::RG32F>, - MortonCopy<true, PixelFormat::R32F>, - MortonCopy<true, PixelFormat::R16F>, - MortonCopy<true, PixelFormat::R16U>, - MortonCopy<true, PixelFormat::R16S>, - MortonCopy<true, PixelFormat::R16UI>, - MortonCopy<true, PixelFormat::R16I>, - MortonCopy<true, PixelFormat::RG16>, - MortonCopy<true, PixelFormat::RG16F>, - MortonCopy<true, PixelFormat::RG16UI>, - MortonCopy<true, PixelFormat::RG16I>, - MortonCopy<true, PixelFormat::RG16S>, - MortonCopy<true, PixelFormat::RGB32F>, - MortonCopy<true, PixelFormat::RGBA8_SRGB>, - MortonCopy<true, PixelFormat::RG8U>, - MortonCopy<true, PixelFormat::RG8S>, - MortonCopy<true, PixelFormat::RG32UI>, - MortonCopy<true, PixelFormat::RGBX16F>, - MortonCopy<true, PixelFormat::R32UI>, - MortonCopy<true, PixelFormat::R32I>, - MortonCopy<true, PixelFormat::ASTC_2D_8X8>, - MortonCopy<true, PixelFormat::ASTC_2D_8X5>, - MortonCopy<true, PixelFormat::ASTC_2D_5X4>, - MortonCopy<true, PixelFormat::BGRA8_SRGB>, - MortonCopy<true, PixelFormat::DXT1_SRGB>, - MortonCopy<true, PixelFormat::DXT23_SRGB>, - MortonCopy<true, PixelFormat::DXT45_SRGB>, - MortonCopy<true, PixelFormat::BC7U_SRGB>, - MortonCopy<true, PixelFormat::R4G4B4A4U>, + MortonCopy<true, PixelFormat::A8B8G8R8_UNORM>, + MortonCopy<true, PixelFormat::A8B8G8R8_SNORM>, + MortonCopy<true, PixelFormat::A8B8G8R8_SINT>, + MortonCopy<true, PixelFormat::A8B8G8R8_UINT>, + MortonCopy<true, PixelFormat::R5G6B5_UNORM>, + MortonCopy<true, PixelFormat::B5G6R5_UNORM>, + MortonCopy<true, PixelFormat::A1R5G5B5_UNORM>, + MortonCopy<true, PixelFormat::A2B10G10R10_UNORM>, + MortonCopy<true, PixelFormat::A2B10G10R10_UINT>, + MortonCopy<true, PixelFormat::A1B5G5R5_UNORM>, + MortonCopy<true, PixelFormat::R8_UNORM>, + MortonCopy<true, 
PixelFormat::R8_SNORM>, + MortonCopy<true, PixelFormat::R8_SINT>, + MortonCopy<true, PixelFormat::R8_UINT>, + MortonCopy<true, PixelFormat::R16G16B16A16_FLOAT>, + MortonCopy<true, PixelFormat::R16G16B16A16_UNORM>, + MortonCopy<true, PixelFormat::R16G16B16A16_SNORM>, + MortonCopy<true, PixelFormat::R16G16B16A16_SINT>, + MortonCopy<true, PixelFormat::R16G16B16A16_UINT>, + MortonCopy<true, PixelFormat::B10G11R11_FLOAT>, + MortonCopy<true, PixelFormat::R32G32B32A32_UINT>, + MortonCopy<true, PixelFormat::BC1_RGBA_UNORM>, + MortonCopy<true, PixelFormat::BC2_UNORM>, + MortonCopy<true, PixelFormat::BC3_UNORM>, + MortonCopy<true, PixelFormat::BC4_UNORM>, + MortonCopy<true, PixelFormat::BC4_SNORM>, + MortonCopy<true, PixelFormat::BC5_UNORM>, + MortonCopy<true, PixelFormat::BC5_SNORM>, + MortonCopy<true, PixelFormat::BC7_UNORM>, + MortonCopy<true, PixelFormat::BC6H_UFLOAT>, + MortonCopy<true, PixelFormat::BC6H_SFLOAT>, + MortonCopy<true, PixelFormat::ASTC_2D_4X4_UNORM>, + MortonCopy<true, PixelFormat::B8G8R8A8_UNORM>, + MortonCopy<true, PixelFormat::R32G32B32A32_FLOAT>, + MortonCopy<true, PixelFormat::R32G32B32A32_SINT>, + MortonCopy<true, PixelFormat::R32G32_FLOAT>, + MortonCopy<true, PixelFormat::R32G32_SINT>, + MortonCopy<true, PixelFormat::R32_FLOAT>, + MortonCopy<true, PixelFormat::R16_FLOAT>, + MortonCopy<true, PixelFormat::R16_UNORM>, + MortonCopy<true, PixelFormat::R16_SNORM>, + MortonCopy<true, PixelFormat::R16_UINT>, + MortonCopy<true, PixelFormat::R16_SINT>, + MortonCopy<true, PixelFormat::R16G16_UNORM>, + MortonCopy<true, PixelFormat::R16G16_FLOAT>, + MortonCopy<true, PixelFormat::R16G16_UINT>, + MortonCopy<true, PixelFormat::R16G16_SINT>, + MortonCopy<true, PixelFormat::R16G16_SNORM>, + MortonCopy<true, PixelFormat::R32G32B32_FLOAT>, + MortonCopy<true, PixelFormat::A8B8G8R8_SRGB>, + MortonCopy<true, PixelFormat::R8G8_UNORM>, + MortonCopy<true, PixelFormat::R8G8_SNORM>, + MortonCopy<true, PixelFormat::R8G8_SINT>, + MortonCopy<true, PixelFormat::R8G8_UINT>, + MortonCopy<true, PixelFormat::R32G32_UINT>, + MortonCopy<true, PixelFormat::R16G16B16X16_FLOAT>, + MortonCopy<true, PixelFormat::R32_UINT>, + MortonCopy<true, PixelFormat::R32_SINT>, + MortonCopy<true, PixelFormat::ASTC_2D_8X8_UNORM>, + MortonCopy<true, PixelFormat::ASTC_2D_8X5_UNORM>, + MortonCopy<true, PixelFormat::ASTC_2D_5X4_UNORM>, + MortonCopy<true, PixelFormat::B8G8R8A8_SRGB>, + MortonCopy<true, PixelFormat::BC1_RGBA_SRGB>, + MortonCopy<true, PixelFormat::BC2_SRGB>, + MortonCopy<true, PixelFormat::BC3_SRGB>, + MortonCopy<true, PixelFormat::BC7_SRGB>, + MortonCopy<true, PixelFormat::A4B4G4R4_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>, MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>, MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>, MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_5X5>, + MortonCopy<true, PixelFormat::ASTC_2D_5X5_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_10X8>, + MortonCopy<true, PixelFormat::ASTC_2D_10X8_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_6X6>, + MortonCopy<true, PixelFormat::ASTC_2D_6X6_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_6X6_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_10X10>, + MortonCopy<true, PixelFormat::ASTC_2D_10X10_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_10X10_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_12X12>, + MortonCopy<true, PixelFormat::ASTC_2D_12X12_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_12X12_SRGB>, - 
MortonCopy<true, PixelFormat::ASTC_2D_8X6>, + MortonCopy<true, PixelFormat::ASTC_2D_8X6_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_8X6_SRGB>, - MortonCopy<true, PixelFormat::ASTC_2D_6X5>, + MortonCopy<true, PixelFormat::ASTC_2D_6X5_UNORM>, MortonCopy<true, PixelFormat::ASTC_2D_6X5_SRGB>, - MortonCopy<true, PixelFormat::E5B9G9R9F>, - MortonCopy<true, PixelFormat::Z32F>, - MortonCopy<true, PixelFormat::Z16>, - MortonCopy<true, PixelFormat::Z24S8>, - MortonCopy<true, PixelFormat::S8Z24>, - MortonCopy<true, PixelFormat::Z32FS8>, + MortonCopy<true, PixelFormat::E5B9G9R9_FLOAT>, + MortonCopy<true, PixelFormat::D32_FLOAT>, + MortonCopy<true, PixelFormat::D16_UNORM>, + MortonCopy<true, PixelFormat::D24_UNORM_S8_UINT>, + MortonCopy<true, PixelFormat::S8_UINT_D24_UNORM>, + MortonCopy<true, PixelFormat::D32_FLOAT_S8_UINT>, }; static constexpr ConversionArray linear_to_morton_fns = { - MortonCopy<false, PixelFormat::ABGR8U>, - MortonCopy<false, PixelFormat::ABGR8S>, - MortonCopy<false, PixelFormat::ABGR8UI>, - MortonCopy<false, PixelFormat::B5G6R5U>, - MortonCopy<false, PixelFormat::A2B10G10R10U>, - MortonCopy<false, PixelFormat::A1B5G5R5U>, - MortonCopy<false, PixelFormat::R8U>, - MortonCopy<false, PixelFormat::R8UI>, - MortonCopy<false, PixelFormat::RGBA16F>, - MortonCopy<false, PixelFormat::RGBA16S>, - MortonCopy<false, PixelFormat::RGBA16U>, - MortonCopy<false, PixelFormat::RGBA16UI>, - MortonCopy<false, PixelFormat::R11FG11FB10F>, - MortonCopy<false, PixelFormat::RGBA32UI>, - MortonCopy<false, PixelFormat::DXT1>, - MortonCopy<false, PixelFormat::DXT23>, - MortonCopy<false, PixelFormat::DXT45>, - MortonCopy<false, PixelFormat::DXN1>, - MortonCopy<false, PixelFormat::DXN2UNORM>, - MortonCopy<false, PixelFormat::DXN2SNORM>, - MortonCopy<false, PixelFormat::BC7U>, - MortonCopy<false, PixelFormat::BC6H_UF16>, - MortonCopy<false, PixelFormat::BC6H_SF16>, + MortonCopy<false, PixelFormat::A8B8G8R8_UNORM>, + MortonCopy<false, PixelFormat::A8B8G8R8_SNORM>, + MortonCopy<false, PixelFormat::A8B8G8R8_SINT>, + MortonCopy<false, PixelFormat::A8B8G8R8_UINT>, + MortonCopy<false, PixelFormat::R5G6B5_UNORM>, + MortonCopy<false, PixelFormat::B5G6R5_UNORM>, + MortonCopy<false, PixelFormat::A1R5G5B5_UNORM>, + MortonCopy<false, PixelFormat::A2B10G10R10_UNORM>, + MortonCopy<false, PixelFormat::A2B10G10R10_UINT>, + MortonCopy<false, PixelFormat::A1B5G5R5_UNORM>, + MortonCopy<false, PixelFormat::R8_UNORM>, + MortonCopy<false, PixelFormat::R8_SNORM>, + MortonCopy<false, PixelFormat::R8_SINT>, + MortonCopy<false, PixelFormat::R8_UINT>, + MortonCopy<false, PixelFormat::R16G16B16A16_FLOAT>, + MortonCopy<false, PixelFormat::R16G16B16A16_SNORM>, + MortonCopy<false, PixelFormat::R16G16B16A16_SINT>, + MortonCopy<false, PixelFormat::R16G16B16A16_UNORM>, + MortonCopy<false, PixelFormat::R16G16B16A16_UINT>, + MortonCopy<false, PixelFormat::B10G11R11_FLOAT>, + MortonCopy<false, PixelFormat::R32G32B32A32_UINT>, + MortonCopy<false, PixelFormat::BC1_RGBA_UNORM>, + MortonCopy<false, PixelFormat::BC2_UNORM>, + MortonCopy<false, PixelFormat::BC3_UNORM>, + MortonCopy<false, PixelFormat::BC4_UNORM>, + MortonCopy<false, PixelFormat::BC4_SNORM>, + MortonCopy<false, PixelFormat::BC5_UNORM>, + MortonCopy<false, PixelFormat::BC5_SNORM>, + MortonCopy<false, PixelFormat::BC7_UNORM>, + MortonCopy<false, PixelFormat::BC6H_UFLOAT>, + MortonCopy<false, PixelFormat::BC6H_SFLOAT>, // TODO(Subv): Swizzling ASTC formats are not supported nullptr, - MortonCopy<false, PixelFormat::BGRA8>, - MortonCopy<false, PixelFormat::RGBA32F>, - MortonCopy<false, 
PixelFormat::RG32F>, - MortonCopy<false, PixelFormat::R32F>, - MortonCopy<false, PixelFormat::R16F>, - MortonCopy<false, PixelFormat::R16U>, - MortonCopy<false, PixelFormat::R16S>, - MortonCopy<false, PixelFormat::R16UI>, - MortonCopy<false, PixelFormat::R16I>, - MortonCopy<false, PixelFormat::RG16>, - MortonCopy<false, PixelFormat::RG16F>, - MortonCopy<false, PixelFormat::RG16UI>, - MortonCopy<false, PixelFormat::RG16I>, - MortonCopy<false, PixelFormat::RG16S>, - MortonCopy<false, PixelFormat::RGB32F>, - MortonCopy<false, PixelFormat::RGBA8_SRGB>, - MortonCopy<false, PixelFormat::RG8U>, - MortonCopy<false, PixelFormat::RG8S>, - MortonCopy<false, PixelFormat::RG32UI>, - MortonCopy<false, PixelFormat::RGBX16F>, - MortonCopy<false, PixelFormat::R32UI>, - MortonCopy<false, PixelFormat::R32I>, + MortonCopy<false, PixelFormat::B8G8R8A8_UNORM>, + MortonCopy<false, PixelFormat::R32G32B32A32_FLOAT>, + MortonCopy<false, PixelFormat::R32G32B32A32_SINT>, + MortonCopy<false, PixelFormat::R32G32_FLOAT>, + MortonCopy<false, PixelFormat::R32G32_SINT>, + MortonCopy<false, PixelFormat::R32_FLOAT>, + MortonCopy<false, PixelFormat::R16_FLOAT>, + MortonCopy<false, PixelFormat::R16_UNORM>, + MortonCopy<false, PixelFormat::R16_SNORM>, + MortonCopy<false, PixelFormat::R16_UINT>, + MortonCopy<false, PixelFormat::R16_SINT>, + MortonCopy<false, PixelFormat::R16G16_UNORM>, + MortonCopy<false, PixelFormat::R16G16_FLOAT>, + MortonCopy<false, PixelFormat::R16G16_UINT>, + MortonCopy<false, PixelFormat::R16G16_SINT>, + MortonCopy<false, PixelFormat::R16G16_SNORM>, + MortonCopy<false, PixelFormat::R32G32B32_FLOAT>, + MortonCopy<false, PixelFormat::A8B8G8R8_SRGB>, + MortonCopy<false, PixelFormat::R8G8_UNORM>, + MortonCopy<false, PixelFormat::R8G8_SNORM>, + MortonCopy<false, PixelFormat::R8G8_SINT>, + MortonCopy<false, PixelFormat::R8G8_UINT>, + MortonCopy<false, PixelFormat::R32G32_UINT>, + MortonCopy<false, PixelFormat::R16G16B16X16_FLOAT>, + MortonCopy<false, PixelFormat::R32_UINT>, + MortonCopy<false, PixelFormat::R32_SINT>, nullptr, nullptr, nullptr, - MortonCopy<false, PixelFormat::BGRA8_SRGB>, - MortonCopy<false, PixelFormat::DXT1_SRGB>, - MortonCopy<false, PixelFormat::DXT23_SRGB>, - MortonCopy<false, PixelFormat::DXT45_SRGB>, - MortonCopy<false, PixelFormat::BC7U_SRGB>, - MortonCopy<false, PixelFormat::R4G4B4A4U>, + MortonCopy<false, PixelFormat::B8G8R8A8_SRGB>, + MortonCopy<false, PixelFormat::BC1_RGBA_SRGB>, + MortonCopy<false, PixelFormat::BC2_SRGB>, + MortonCopy<false, PixelFormat::BC3_SRGB>, + MortonCopy<false, PixelFormat::BC7_SRGB>, + MortonCopy<false, PixelFormat::A4B4G4R4_UNORM>, nullptr, nullptr, nullptr, @@ -197,12 +221,12 @@ static constexpr ConversionArray linear_to_morton_fns = { nullptr, nullptr, nullptr, - MortonCopy<false, PixelFormat::E5B9G9R9F>, - MortonCopy<false, PixelFormat::Z32F>, - MortonCopy<false, PixelFormat::Z16>, - MortonCopy<false, PixelFormat::Z24S8>, - MortonCopy<false, PixelFormat::S8Z24>, - MortonCopy<false, PixelFormat::Z32FS8>, + MortonCopy<false, PixelFormat::E5B9G9R9_FLOAT>, + MortonCopy<false, PixelFormat::D32_FLOAT>, + MortonCopy<false, PixelFormat::D16_UNORM>, + MortonCopy<false, PixelFormat::D24_UNORM_S8_UINT>, + MortonCopy<false, PixelFormat::S8_UINT_D24_UNORM>, + MortonCopy<false, PixelFormat::D32_FLOAT_S8_UINT>, }; static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFormat format) { diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 5ea2b01f2..fc54ca0ef 100644 --- a/src/video_core/query_cache.h +++ 
b/src/video_core/query_cache.h @@ -12,10 +12,12 @@ #include <mutex> #include <optional> #include <unordered_map> +#include <unordered_set> #include <vector> #include "common/assert.h" #include "core/core.h" +#include "core/settings.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" @@ -89,14 +91,15 @@ private: std::shared_ptr<HostCounter> last; }; -template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter, - class QueryPool> +template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> class QueryCacheBase { public: - explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer) - : system{system}, rasterizer{rasterizer}, streams{{CounterStream{ - static_cast<QueryCache&>(*this), - VideoCore::QueryType::SamplesPassed}}} {} + explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::MemoryManager& gpu_memory_) + : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, + gpu_memory{gpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} {} void InvalidateRegion(VAddr addr, std::size_t size) { std::unique_lock lock{mutex}; @@ -116,26 +119,27 @@ public: */ void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) { std::unique_lock lock{mutex}; - auto& memory_manager = system.GPU().MemoryManager(); - const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr_opt); - VAddr cpu_addr = *cpu_addr_opt; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); - CachedQuery* query = TryGet(cpu_addr); + CachedQuery* query = TryGet(*cpu_addr); if (!query) { - ASSERT_OR_EXECUTE(cpu_addr_opt, return;); - const auto host_ptr = memory_manager.GetPointer(gpu_addr); + ASSERT_OR_EXECUTE(cpu_addr, return;); + u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); - query = Register(type, cpu_addr, host_ptr, timestamp.has_value()); + query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); } query->BindCounter(Stream(type).Current(), timestamp); + if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + AsyncFlushQuery(*cpu_addr); + } } /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. 
void UpdateCounters() { std::unique_lock lock{mutex}; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable); } @@ -170,8 +174,36 @@ public: return streams[static_cast<std::size_t>(type)]; } -protected: - std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; + void CommitAsyncFlushes() { + committed_flushes.push_back(uncommitted_flushes); + uncommitted_flushes.reset(); + } + + bool HasUncommittedFlushes() const { + return uncommitted_flushes != nullptr; + } + + bool ShouldWaitAsyncFlushes() const { + if (committed_flushes.empty()) { + return false; + } + return committed_flushes.front() != nullptr; + } + + void PopAsyncFlushes() { + if (committed_flushes.empty()) { + return; + } + auto& flush_list = committed_flushes.front(); + if (!flush_list) { + committed_flushes.pop_front(); + return; + } + for (VAddr query_address : *flush_list) { + FlushAndRemoveRegion(query_address, 4); + } + committed_flushes.pop_front(); + } private: /// Flushes a memory range to guest memory and removes it from the cache. @@ -184,8 +216,8 @@ private: return cache_begin < addr_end && addr_begin < cache_end; }; - const u64 page_end = addr_end >> PAGE_SHIFT; - for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { + const u64 page_end = addr_end >> PAGE_BITS; + for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) { const auto& it = cached_queries.find(page); if (it == std::end(cached_queries)) { continue; @@ -206,14 +238,14 @@ private: /// Registers the passed parameters as cached and returns a pointer to the stored cached query. CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) { rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1); - const u64 page = static_cast<u64>(cpu_addr) >> PAGE_SHIFT; + const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS; return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr, host_ptr); } /// Tries to a get a cached query. Returns nullptr on failure. CachedQuery* TryGet(VAddr addr) { - const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT; + const u64 page = static_cast<u64>(addr) >> PAGE_BITS; const auto it = cached_queries.find(page); if (it == std::end(cached_queries)) { return nullptr; @@ -224,17 +256,28 @@ private: return found != std::end(contents) ? 
&*found : nullptr; } + void AsyncFlushQuery(VAddr addr) { + if (!uncommitted_flushes) { + uncommitted_flushes = std::make_shared<std::unordered_set<VAddr>>(); + } + uncommitted_flushes->insert(addr); + } + static constexpr std::uintptr_t PAGE_SIZE = 4096; - static constexpr unsigned PAGE_SHIFT = 12; + static constexpr unsigned PAGE_BITS = 12; - Core::System& system; VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::MemoryManager& gpu_memory; std::recursive_mutex mutex; std::unordered_map<u64, std::vector<CachedQuery>> cached_queries; std::array<CounterStream, VideoCore::NumQueryTypes> streams; + + std::shared_ptr<std::unordered_set<VAddr>> uncommitted_flushes{}; + std::list<std::shared_ptr<std::unordered_set<VAddr>>> committed_flushes; }; template <class QueryCache, class HostCounter> diff --git a/src/video_core/rasterizer_accelerated.cpp b/src/video_core/rasterizer_accelerated.cpp index d01db97da..53622ca05 100644 --- a/src/video_core/rasterizer_accelerated.cpp +++ b/src/video_core/rasterizer_accelerated.cpp @@ -23,15 +23,15 @@ constexpr auto RangeFromInterval(Map& map, const Interval& interval) { } // Anonymous namespace -RasterizerAccelerated::RasterizerAccelerated(Memory::Memory& cpu_memory_) +RasterizerAccelerated::RasterizerAccelerated(Core::Memory::Memory& cpu_memory_) : cpu_memory{cpu_memory_} {} RasterizerAccelerated::~RasterizerAccelerated() = default; void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) { std::lock_guard lock{pages_mutex}; - const u64 page_start{addr >> Memory::PAGE_BITS}; - const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS}; + const u64 page_start{addr >> Core::Memory::PAGE_BITS}; + const u64 page_end{(addr + size + Core::Memory::PAGE_SIZE - 1) >> Core::Memory::PAGE_BITS}; // Interval maps will erase segments if count reaches 0, so if delta is negative we have to // subtract after iterating @@ -44,8 +44,8 @@ void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int del const auto interval = pair.first & pages_interval; const int count = pair.second; - const VAddr interval_start_addr = boost::icl::first(interval) << Memory::PAGE_BITS; - const VAddr interval_end_addr = boost::icl::last_next(interval) << Memory::PAGE_BITS; + const VAddr interval_start_addr = boost::icl::first(interval) << Core::Memory::PAGE_BITS; + const VAddr interval_end_addr = boost::icl::last_next(interval) << Core::Memory::PAGE_BITS; const u64 interval_size = interval_end_addr - interval_start_addr; if (delta > 0 && count == delta) { diff --git a/src/video_core/rasterizer_accelerated.h b/src/video_core/rasterizer_accelerated.h index 315798e7c..91866d7dd 100644 --- a/src/video_core/rasterizer_accelerated.h +++ b/src/video_core/rasterizer_accelerated.h @@ -11,7 +11,7 @@ #include "common/common_types.h" #include "video_core/rasterizer_interface.h" -namespace Memory { +namespace Core::Memory { class Memory; } @@ -20,7 +20,7 @@ namespace VideoCore { /// Implements the shared part in GPU accelerated rasterizers in RasterizerInterface. 
class RasterizerAccelerated : public RasterizerInterface { public: - explicit RasterizerAccelerated(Memory::Memory& cpu_memory_); + explicit RasterizerAccelerated(Core::Memory::Memory& cpu_memory_); ~RasterizerAccelerated() override; void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override; @@ -30,7 +30,7 @@ private: CachedPageMap cached_pages; std::mutex pages_mutex; - Memory::Memory& cpu_memory; + Core::Memory::Memory& cpu_memory; }; } // namespace VideoCore diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp deleted file mode 100644 index 093b2cdf4..000000000 --- a/src/video_core/rasterizer_cache.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/rasterizer_cache.h" - -RasterizerCacheObject::~RasterizerCacheObject() = default; diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h deleted file mode 100644 index 22987751e..000000000 --- a/src/video_core/rasterizer_cache.h +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <mutex> -#include <set> -#include <unordered_map> - -#include <boost/icl/interval_map.hpp> -#include <boost/range/iterator_range_core.hpp> - -#include "common/common_types.h" -#include "core/settings.h" -#include "video_core/gpu.h" -#include "video_core/rasterizer_interface.h" - -class RasterizerCacheObject { -public: - explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {} - - virtual ~RasterizerCacheObject(); - - VAddr GetCpuAddr() const { - return cpu_addr; - } - - /// Gets the size of the shader in guest memory, required for cache management - virtual std::size_t GetSizeInBytes() const = 0; - - /// Sets whether the cached object should be considered registered - void SetIsRegistered(bool registered) { - is_registered = registered; - } - - /// Returns true if the cached object is registered - bool IsRegistered() const { - return is_registered; - } - - /// Returns true if the cached object is dirty - bool IsDirty() const { - return is_dirty; - } - - /// Returns ticks from when this cached object was last modified - u64 GetLastModifiedTicks() const { - return last_modified_ticks; - } - - /// Marks an object as recently modified, used to specify whether it is clean or dirty - template <class T> - void MarkAsModified(bool dirty, T& cache) { - is_dirty = dirty; - last_modified_ticks = cache.GetModifiedTicks(); - } - -private: - bool is_registered{}; ///< Whether the object is currently registered with the cache - bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory) - u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing - VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space -}; - -template <class T> -class RasterizerCache : NonCopyable { - friend class RasterizerCacheObject; - -public: - explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} - - /// Write any cached resources overlapping the specified region back to memory - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - FlushObject(object); - } - } - - /// Mark the specified 
region as being invalidated - void InvalidateRegion(VAddr addr, u64 size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - if (!object->IsRegistered()) { - // Skip duplicates - continue; - } - Unregister(object); - } - } - - /// Invalidates everything in the cache - void InvalidateAll() { - std::lock_guard lock{mutex}; - - while (interval_cache.begin() != interval_cache.end()) { - Unregister(*interval_cache.begin()->second.begin()); - } - } - -protected: - /// Tries to get an object from the cache with the specified cache address - T TryGet(VAddr addr) const { - const auto iter = map_cache.find(addr); - if (iter != map_cache.end()) - return iter->second; - return nullptr; - } - - /// Register an object into the cache - virtual void Register(const T& object) { - std::lock_guard lock{mutex}; - - object->SetIsRegistered(true); - interval_cache.add({GetInterval(object), ObjectSet{object}}); - map_cache.insert({object->GetCpuAddr(), object}); - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1); - } - - /// Unregisters an object from the cache - virtual void Unregister(const T& object) { - std::lock_guard lock{mutex}; - - object->SetIsRegistered(false); - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); - const VAddr addr = object->GetCpuAddr(); - interval_cache.subtract({GetInterval(object), ObjectSet{object}}); - map_cache.erase(addr); - } - - /// Returns a ticks counter used for tracking when cached objects were last modified - u64 GetModifiedTicks() { - std::lock_guard lock{mutex}; - - return ++modified_ticks; - } - - virtual void FlushObjectInner(const T& object) = 0; - - /// Flushes the specified object, updating appropriate cache state as needed - void FlushObject(const T& object) { - std::lock_guard lock{mutex}; - - if (!object->IsDirty()) { - return; - } - FlushObjectInner(object); - object->MarkAsModified(false, *this); - } - - std::recursive_mutex mutex; - -private: - /// Returns a list of cached objects from the specified memory region, ordered by access time - std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) { - if (size == 0) { - return {}; - } - - std::vector<T> objects; - const ObjectInterval interval{addr, addr + size}; - for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) { - for (auto& cached_object : pair.second) { - if (!cached_object) { - continue; - } - objects.push_back(cached_object); - } - } - - std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool { - return a->GetLastModifiedTicks() < b->GetLastModifiedTicks(); - }); - - return objects; - } - - using ObjectSet = std::set<T>; - using ObjectCache = std::unordered_map<VAddr, T>; - using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>; - using ObjectInterval = typename IntervalCache::interval_type; - - static auto GetInterval(const T& object) { - return ObjectInterval::right_open(object->GetCpuAddr(), - object->GetCpuAddr() + object->GetSizeInBytes()); - } - - ObjectCache map_cache; - IntervalCache interval_cache; ///< Cache of objects - u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing - VideoCore::RasterizerInterface& rasterizer; -}; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 8ae5b9c4e..27ef4c69a 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -32,7 
+32,7 @@ using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size class RasterizerInterface { public: - virtual ~RasterizerInterface() {} + virtual ~RasterizerInterface() = default; /// Dispatches a draw invocation virtual void Draw(bool is_indexed, bool is_instanced) = 0; @@ -49,19 +49,40 @@ public: /// Records a GPU query and caches it virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; + /// Signal a GPU based semaphore as a fence + virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0; + + /// Signal a GPU based syncpoint as a fence + virtual void SignalSyncPoint(u32 value) = 0; + + /// Release all pending fences. + virtual void ReleaseFences() = 0; + /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory virtual void FlushRegion(VAddr addr, u64 size) = 0; + /// Check if the specified memory area requires flushing to CPU Memory. + virtual bool MustFlushRegion(VAddr addr, u64 size) = 0; + /// Notify rasterizer that any caches of the specified region should be invalidated virtual void InvalidateRegion(VAddr addr, u64 size) = 0; + /// Notify rasterizer that any caches of the specified region are out of sync with the guest + virtual void OnCPUWrite(VAddr addr, u64 size) = 0; + + /// Sync memory between guest and host. + virtual void SyncGuestHost() = 0; + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// and invalidated virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; + /// Notify the host renderer to wait for previous primitive and compute operations. + virtual void WaitForIdle() = 0; + /// Notify the rasterizer to send all written commands to the host GPU. virtual void FlushCommands() = 0; @@ -69,15 +90,16 @@ public: virtual void TickFrame() = 0; /// Attempt to use a faster method to perform a surface copy - virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, - const Tegra::Engines::Fermi2D::Config& copy_config) { + [[nodiscard]] virtual bool AccelerateSurfaceCopy( + const Tegra::Engines::Fermi2D::Regs::Surface& src, + const Tegra::Engines::Fermi2D::Regs::Surface& dst, + const Tegra::Engines::Fermi2D::Config& copy_config) { return false; } /// Attempt to use a faster method to display the framebuffer to screen - virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, - u32 pixel_stride) { + [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, + VAddr framebuffer_addr, u32 pixel_stride) { return false; } @@ -85,19 +107,16 @@ public: virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {} /// Initialize disk cached resources for the game being emulated - virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false, - const DiskResourceLoadCallback& callback = {}) {} - - /// Initializes renderer dirty flags - virtual void SetupDirtyFlags() {} + virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + const DiskResourceLoadCallback& callback) {} /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
- GuestDriverProfile& AccessGuestDriverProfile() { + [[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() { return guest_driver_profile; } /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. - const GuestDriverProfile& AccessGuestDriverProfile() const { + [[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const { return guest_driver_profile; } diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp index 919d1f2d4..a93a1732c 100644 --- a/src/video_core/renderer_base.cpp +++ b/src/video_core/renderer_base.cpp @@ -9,7 +9,9 @@ namespace VideoCore { -RendererBase::RendererBase(Core::Frontend::EmuWindow& window) : render_window{window} { +RendererBase::RendererBase(Core::Frontend::EmuWindow& window_, + std::unique_ptr<Core::Frontend::GraphicsContext> context_) + : render_window{window_}, context{std::move(context_)} { RefreshBaseSettings(); } @@ -18,7 +20,7 @@ RendererBase::~RendererBase() = default; void RendererBase::RefreshBaseSettings() { UpdateCurrentFramebufferLayout(); - renderer_settings.use_framelimiter = Settings::values.use_frame_limit; + renderer_settings.use_framelimiter = Settings::values.use_frame_limit.GetValue(); renderer_settings.set_background_color = true; } diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 1d85219b6..51dde8eb5 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -15,7 +15,8 @@ namespace Core::Frontend { class EmuWindow; -} +class GraphicsContext; +} // namespace Core::Frontend namespace VideoCore { @@ -25,18 +26,19 @@ struct RendererSettings { // Screenshot std::atomic<bool> screenshot_requested{false}; - void* screenshot_bits; + void* screenshot_bits{}; std::function<void()> screenshot_complete_callback; Layout::FramebufferLayout screenshot_framebuffer_layout; }; class RendererBase : NonCopyable { public: - explicit RendererBase(Core::Frontend::EmuWindow& window); + explicit RendererBase(Core::Frontend::EmuWindow& window, + std::unique_ptr<Core::Frontend::GraphicsContext> context); virtual ~RendererBase(); /// Initialize the renderer - virtual bool Init() = 0; + [[nodiscard]] virtual bool Init() = 0; /// Shutdown the renderer virtual void ShutDown() = 0; @@ -44,43 +46,46 @@ public: /// Finalize rendering the guest frame and draw into the presentation texture virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; - /// Draws the latest frame to the window waiting timeout_ms for a frame to arrive (Renderer - /// specific implementation) - /// Returns true if a frame was drawn - virtual bool TryPresent(int timeout_ms) = 0; - // Getter/setter functions: // ------------------------ - f32 GetCurrentFPS() const { + [[nodiscard]] f32 GetCurrentFPS() const { return m_current_fps; } - int GetCurrentFrame() const { + [[nodiscard]] int GetCurrentFrame() const { return m_current_frame; } - RasterizerInterface& Rasterizer() { + [[nodiscard]] RasterizerInterface& Rasterizer() { return *rasterizer; } - const RasterizerInterface& Rasterizer() const { + [[nodiscard]] const RasterizerInterface& Rasterizer() const { return *rasterizer; } - Core::Frontend::EmuWindow& GetRenderWindow() { + [[nodiscard]] Core::Frontend::GraphicsContext& Context() { + return *context; + } + + [[nodiscard]] const Core::Frontend::GraphicsContext& Context() const { + return *context; + } + + [[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() { return render_window; } - const Core::Frontend::EmuWindow& GetRenderWindow() 
const { + [[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const { return render_window; } - RendererSettings& Settings() { + [[nodiscard]] RendererSettings& Settings() { return renderer_settings; } - const RendererSettings& Settings() const { + [[nodiscard]] const RendererSettings& Settings() const { return renderer_settings; } @@ -94,6 +99,7 @@ public: protected: Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle. std::unique_ptr<RasterizerInterface> rasterizer; + std::unique_ptr<Core::Frontend::GraphicsContext> context; f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer int m_current_frame = 0; ///< Current frame, should be set by the renderer diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp new file mode 100644 index 000000000..d6120c23e --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -0,0 +1,2126 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cstddef> +#include <string> +#include <string_view> +#include <utility> +#include <variant> + +#include <fmt/format.h> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/shader/registry.h" +#include "video_core/shader/shader_ir.h" + +// Predicates in the decompiled code follow the convention that -1 means true and 0 means false. +// GLASM lacks booleans, so they have to be implemented as integers. +// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to +// select between two values, because -1 will be evaluated as true and 0 as false. 
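On the -1/0 predicate convention described in the comment above: CMP.S selects its second source operand when the first operand is negative and its third otherwise, and the decompiler relies on exactly that (later in this file, logical NOT is emitted as "CMP.S dst, pred, 0, -1"). Encoding true as -1 therefore lets one CMP.S act as both a select and a negation, while NOT.U flips the same encoding bitwise. Below is a minimal, self-contained C++ sketch that only models this selection rule for illustration; the helper names are invented and the snippet is not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Hypothetical model of "CMP.S dst, pred, on_true, on_false":
    // the second operand is selected when the predicate is negative.
    constexpr std::int32_t CmpSelect(std::int32_t pred, std::int32_t on_true, std::int32_t on_false) {
        return pred < 0 ? on_true : on_false;
    }

    // Logical negation of the -1/0 encoding; bitwise NOT (NOT.U) yields the same result.
    constexpr std::int32_t LogicalNot(std::int32_t pred) {
        return CmpSelect(pred, 0, -1);
    }

    int main() {
        constexpr std::int32_t k_true = -1; // all bits set
        constexpr std::int32_t k_false = 0;
        static_assert(CmpSelect(k_true, 10, 20) == 10);  // true selects the first value
        static_assert(CmpSelect(k_false, 10, 20) == 20); // false selects the second
        static_assert(LogicalNot(k_true) == k_false);
        static_assert(~k_true == k_false);               // NOT.U flips the encoding too
        assert(LogicalNot(k_false) == k_true);
        return 0;
    }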
+ +namespace OpenGL { + +namespace { + +using Tegra::Engines::ShaderType; +using Tegra::Shader::Attribute; +using Tegra::Shader::PixelImap; +using Tegra::Shader::Register; +using namespace VideoCommon::Shader; +using Operation = const OperationNode&; + +constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; + +char Swizzle(std::size_t component) { + static constexpr std::string_view SWIZZLE{"xyzw"}; + return SWIZZLE.at(component); +} + +constexpr bool IsGenericAttribute(Attribute::Index index) { + return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31; +} + +u32 GetGenericAttributeIndex(Attribute::Index index) { + ASSERT(IsGenericAttribute(index)); + return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0); +} + +std::string_view Modifiers(Operation operation) { + const auto meta = std::get_if<MetaArithmetic>(&operation.GetMeta()); + if (meta && meta->precise) { + return ".PREC"; + } + return ""; +} + +std::string_view GetInputFlags(PixelImap attribute) { + switch (attribute) { + case PixelImap::Perspective: + return ""; + case PixelImap::Constant: + return "FLAT "; + case PixelImap::ScreenLinear: + return "NOPERSPECTIVE "; + case PixelImap::Unused: + break; + } + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute)); + return {}; +} + +std::string_view ImageType(Tegra::Shader::ImageType image_type) { + switch (image_type) { + case Tegra::Shader::ImageType::Texture1D: + return "1D"; + case Tegra::Shader::ImageType::TextureBuffer: + return "BUFFER"; + case Tegra::Shader::ImageType::Texture1DArray: + return "ARRAY1D"; + case Tegra::Shader::ImageType::Texture2D: + return "2D"; + case Tegra::Shader::ImageType::Texture2DArray: + return "ARRAY2D"; + case Tegra::Shader::ImageType::Texture3D: + return "3D"; + } + UNREACHABLE(); + return {}; +} + +std::string_view StackName(MetaStackClass stack) { + switch (stack) { + case MetaStackClass::Ssy: + return "SSY"; + case MetaStackClass::Pbk: + return "PBK"; + } + UNREACHABLE(); + return ""; +}; + +std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) { + switch (topology) { + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points: + return "POINTS"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip: + return "LINES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency: + return "LINES_ADJACENCY"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan: + return "TRIANGLES"; + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: + case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: + return "TRIANGLES_ADJACENCY"; + default: + UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + return "POINTS"; + } +} + +std::string_view TopologyName(Tegra::Shader::OutputTopology topology) { + switch (topology) { + case Tegra::Shader::OutputTopology::PointList: + return "POINTS"; + case Tegra::Shader::OutputTopology::LineStrip: + return "LINE_STRIP"; + case Tegra::Shader::OutputTopology::TriangleStrip: + return "TRIANGLE_STRIP"; + default: + UNIMPLEMENTED_MSG("Unknown output topology: {}", 
static_cast<u32>(topology)); + return "points"; + } +} + +std::string_view StageInputName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + case ShaderType::Geometry: + return "vertex"; + case ShaderType::Fragment: + return "fragment"; + case ShaderType::Compute: + return "invocation"; + default: + UNREACHABLE(); + return ""; + } +} + +std::string TextureType(const MetaTexture& meta) { + if (meta.sampler.is_buffer) { + return "BUFFER"; + } + std::string type; + if (meta.sampler.is_shadow) { + type += "SHADOW"; + } + if (meta.sampler.is_array) { + type += "ARRAY"; + } + type += [&meta] { + switch (meta.sampler.type) { + case Tegra::Shader::TextureType::Texture1D: + return "1D"; + case Tegra::Shader::TextureType::Texture2D: + return "2D"; + case Tegra::Shader::TextureType::Texture3D: + return "3D"; + case Tegra::Shader::TextureType::TextureCube: + return "CUBE"; + } + UNREACHABLE(); + return "2D"; + }(); + return type; +} + +class ARBDecompiler final { +public: + explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier); + + std::string Code() const { + return shader_source; + } + +private: + void DefineGlobalMemory(); + + void DeclareHeader(); + void DeclareVertex(); + void DeclareGeometry(); + void DeclareFragment(); + void DeclareCompute(); + void DeclareInputAttributes(); + void DeclareOutputAttributes(); + void DeclareLocalMemory(); + void DeclareGlobalMemory(); + void DeclareConstantBuffers(); + void DeclareRegisters(); + void DeclareTemporaries(); + void DeclarePredicates(); + void DeclareInternalFlags(); + + void InitializeVariables(); + + void DecompileAST(); + void DecompileBranchMode(); + + void VisitAST(const ASTNode& node); + std::string VisitExpression(const Expr& node); + + void VisitBlock(const NodeBlock& bb); + + std::string Visit(const Node& node); + + std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation); + std::string BuildAoffi(Operation); + std::string GlobalMemoryPointer(const GmemNode& gmem); + void Exit(); + + std::string Assign(Operation); + std::string Select(Operation); + std::string FClamp(Operation); + std::string FCastHalf0(Operation); + std::string FCastHalf1(Operation); + std::string FSqrt(Operation); + std::string FSwizzleAdd(Operation); + std::string HAdd2(Operation); + std::string HMul2(Operation); + std::string HFma2(Operation); + std::string HAbsolute(Operation); + std::string HNegate(Operation); + std::string HClamp(Operation); + std::string HCastFloat(Operation); + std::string HUnpack(Operation); + std::string HMergeF32(Operation); + std::string HMergeH0(Operation); + std::string HMergeH1(Operation); + std::string HPack2(Operation); + std::string LogicalAssign(Operation); + std::string LogicalPick2(Operation); + std::string LogicalAnd2(Operation); + std::string FloatOrdered(Operation); + std::string FloatUnordered(Operation); + std::string LogicalAddCarry(Operation); + std::string Texture(Operation); + std::string TextureGather(Operation); + std::string TextureQueryDimensions(Operation); + std::string TextureQueryLod(Operation); + std::string TexelFetch(Operation); + std::string TextureGradient(Operation); + std::string ImageLoad(Operation); + std::string ImageStore(Operation); + std::string Branch(Operation); + std::string BranchIndirect(Operation); + std::string PushFlowStack(Operation); + std::string PopFlowStack(Operation); + std::string Exit(Operation); + std::string Discard(Operation); + std::string EmitVertex(Operation); + 
std::string EndPrimitive(Operation); + std::string InvocationId(Operation); + std::string YNegate(Operation); + std::string ThreadId(Operation); + std::string ShuffleIndexed(Operation); + std::string Barrier(Operation); + std::string MemoryBarrierGroup(Operation); + std::string MemoryBarrierGlobal(Operation); + + template <const std::string_view& op> + std::string Unary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0])); + return temporary; + } + + template <const std::string_view& op> + std::string Binary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1])); + return temporary; + } + + template <const std::string_view& op> + std::string Trinary(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]), + Visit(operation[1]), Visit(operation[2])); + return temporary; + } + + template <const std::string_view& op, bool unordered> + std::string FloatComparison(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation)); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NE.x), -1;", temporary); + + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + if constexpr (unordered) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), -1;", temporary); + } else if (op == SNE_F) { + AddLine("SNE.F RC.x, {}, {};", op_a, op_a); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + AddLine("SNE.F RC.x, {}, {};", op_b, op_b); + AddLine("TRUNC.U.CC RC.x, RC.x;"); + AddLine("MOV.S {} (NE.x), 0;", temporary); + } + return temporary; + } + + template <const std::string_view& op, bool is_nan> + std::string HalfComparison(Operation operation) { + std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string op_a = Visit(operation[0]); + const std::string op_b = Visit(operation[1]); + AddLine("UP2H.F {}, {};", tmp1, op_a); + AddLine("UP2H.F {}, {};", tmp2, op_b); + AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2); + AddLine("TRUNC.U.CC RC.xy, {};", tmp1); + AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1); + AddLine("MOV.S {}.x (NE.x), -1;", tmp1); + AddLine("MOV.S {}.y (NE.y), -1;", tmp1); + if constexpr (is_nan) { + AddLine("MOVC.F RC.x, {};", op_a); + AddLine("MOV.S {}.x (NAN.x), -1;", tmp1); + AddLine("MOVC.F RC.x, {};", op_b); + AddLine("MOV.S {}.y (NAN.x), -1;", tmp1); + } + return tmp1; + } + + template <const std::string_view& op, const std::string_view& type> + std::string AtomicImage(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 
0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + + AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord, + image_id, ImageType(meta.image.type)); + return fmt::format("{}.x", coord); + } + + template <const std::string_view& op, const std::string_view& type> + std::string Atomic(Operation operation) { + std::string temporary = AllocTemporary(); + std::string address; + std::string_view opname; + bool robust = false; + if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { + address = GlobalMemoryPointer(*gmem); + opname = "ATOM"; + robust = true; + } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { + address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); + opname = "ATOMS"; + } else { + UNREACHABLE(); + return "{0, 0, 0, 0}"; + } + if (robust) { + AddLine("IF NE.x;"); + } + AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address); + if (robust) { + AddLine("ELSE;"); + AddLine("MOV.S {}, 0;", temporary); + AddLine("ENDIF;"); + } + return temporary; + } + + template <char type> + std::string Negate(Operation operation) { + std::string temporary = AllocTemporary(); + if constexpr (type == 'F') { + AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0])); + } else { + AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0])); + } + return temporary; + } + + template <char type> + std::string Absolute(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0])); + return temporary; + } + + template <char type> + std::string BitfieldInsert(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2])); + AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]), + Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template <char type> + std::string BitfieldExtract(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2])); + AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1])); + AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); + } + + template <char swizzle> + std::string LocalInvocationId(Operation) { + return fmt::format("invocation.localid.{}", swizzle); + } + + template <char swizzle> + std::string WorkGroupId(Operation) { + return fmt::format("invocation.groupid.{}", swizzle); + } + + template <char c1, char c2> + std::string ThreadMask(Operation) { + return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2); + } + + template <typename... Args> + void AddExpression(std::string_view text, Args&&... args) { + shader_source += fmt::format(text, std::forward<Args>(args)...); + } + + template <typename... Args> + void AddLine(std::string_view text, Args&&... 
args) { + AddExpression(text, std::forward<Args>(args)...); + shader_source += '\n'; + } + + std::string AllocLongVectorTemporary() { + max_long_temporaries = std::max(max_long_temporaries, num_long_temporaries + 1); + return fmt::format("L{}", num_long_temporaries++); + } + + std::string AllocLongTemporary() { + return fmt::format("{}.x", AllocLongVectorTemporary()); + } + + std::string AllocVectorTemporary() { + max_temporaries = std::max(max_temporaries, num_temporaries + 1); + return fmt::format("T{}", num_temporaries++); + } + + std::string AllocTemporary() { + return fmt::format("{}.x", AllocVectorTemporary()); + } + + void ResetTemporaries() noexcept { + num_temporaries = 0; + num_long_temporaries = 0; + } + + const Device& device; + const ShaderIR& ir; + const Registry& registry; + const ShaderType stage; + + std::size_t num_temporaries = 0; + std::size_t max_temporaries = 0; + + std::size_t num_long_temporaries = 0; + std::size_t max_long_temporaries = 0; + + std::map<GlobalMemoryBase, u32> global_memory_names; + + std::string shader_source; + + static constexpr std::string_view ADD_F32 = "ADD.F32"; + static constexpr std::string_view ADD_S = "ADD.S"; + static constexpr std::string_view ADD_U = "ADD.U"; + static constexpr std::string_view MUL_F32 = "MUL.F32"; + static constexpr std::string_view MUL_S = "MUL.S"; + static constexpr std::string_view MUL_U = "MUL.U"; + static constexpr std::string_view DIV_F32 = "DIV.F32"; + static constexpr std::string_view DIV_S = "DIV.S"; + static constexpr std::string_view DIV_U = "DIV.U"; + static constexpr std::string_view MAD_F32 = "MAD.F32"; + static constexpr std::string_view RSQ_F32 = "RSQ.F32"; + static constexpr std::string_view COS_F32 = "COS.F32"; + static constexpr std::string_view SIN_F32 = "SIN.F32"; + static constexpr std::string_view EX2_F32 = "EX2.F32"; + static constexpr std::string_view LG2_F32 = "LG2.F32"; + static constexpr std::string_view SLT_F = "SLT.F32"; + static constexpr std::string_view SLT_S = "SLT.S"; + static constexpr std::string_view SLT_U = "SLT.U"; + static constexpr std::string_view SEQ_F = "SEQ.F32"; + static constexpr std::string_view SEQ_S = "SEQ.S"; + static constexpr std::string_view SEQ_U = "SEQ.U"; + static constexpr std::string_view SLE_F = "SLE.F32"; + static constexpr std::string_view SLE_S = "SLE.S"; + static constexpr std::string_view SLE_U = "SLE.U"; + static constexpr std::string_view SGT_F = "SGT.F32"; + static constexpr std::string_view SGT_S = "SGT.S"; + static constexpr std::string_view SGT_U = "SGT.U"; + static constexpr std::string_view SNE_F = "SNE.F32"; + static constexpr std::string_view SNE_S = "SNE.S"; + static constexpr std::string_view SNE_U = "SNE.U"; + static constexpr std::string_view SGE_F = "SGE.F32"; + static constexpr std::string_view SGE_S = "SGE.S"; + static constexpr std::string_view SGE_U = "SGE.U"; + static constexpr std::string_view AND_S = "AND.S"; + static constexpr std::string_view AND_U = "AND.U"; + static constexpr std::string_view TRUNC_F = "TRUNC.F"; + static constexpr std::string_view TRUNC_S = "TRUNC.S"; + static constexpr std::string_view TRUNC_U = "TRUNC.U"; + static constexpr std::string_view SHL_S = "SHL.S"; + static constexpr std::string_view SHL_U = "SHL.U"; + static constexpr std::string_view SHR_S = "SHR.S"; + static constexpr std::string_view SHR_U = "SHR.U"; + static constexpr std::string_view OR_S = "OR.S"; + static constexpr std::string_view OR_U = "OR.U"; + static constexpr std::string_view XOR_S = "XOR.S"; + static constexpr std::string_view XOR_U = 
"XOR.U"; + static constexpr std::string_view NOT_S = "NOT.S"; + static constexpr std::string_view NOT_U = "NOT.U"; + static constexpr std::string_view BTC_S = "BTC.S"; + static constexpr std::string_view BTC_U = "BTC.U"; + static constexpr std::string_view BTFM_S = "BTFM.S"; + static constexpr std::string_view BTFM_U = "BTFM.U"; + static constexpr std::string_view ROUND_F = "ROUND.F"; + static constexpr std::string_view CEIL_F = "CEIL.F"; + static constexpr std::string_view FLR_F = "FLR.F"; + static constexpr std::string_view I2F_S = "I2F.S"; + static constexpr std::string_view I2F_U = "I2F.U"; + static constexpr std::string_view MIN_F = "MIN.F"; + static constexpr std::string_view MIN_S = "MIN.S"; + static constexpr std::string_view MIN_U = "MIN.U"; + static constexpr std::string_view MAX_F = "MAX.F"; + static constexpr std::string_view MAX_S = "MAX.S"; + static constexpr std::string_view MAX_U = "MAX.U"; + static constexpr std::string_view MOV_U = "MOV.U"; + static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U"; + static constexpr std::string_view TGALL_U = "TGALL.U"; + static constexpr std::string_view TGANY_U = "TGANY.U"; + static constexpr std::string_view TGEQ_U = "TGEQ.U"; + static constexpr std::string_view EXCH = "EXCH"; + static constexpr std::string_view ADD = "ADD"; + static constexpr std::string_view MIN = "MIN"; + static constexpr std::string_view MAX = "MAX"; + static constexpr std::string_view AND = "AND"; + static constexpr std::string_view OR = "OR"; + static constexpr std::string_view XOR = "XOR"; + static constexpr std::string_view U32 = "U32"; + static constexpr std::string_view S32 = "S32"; + + static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount); + using DecompilerType = std::string (ARBDecompiler::*)(Operation); + static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = { + &ARBDecompiler::Assign, + + &ARBDecompiler::Select, + + &ARBDecompiler::Binary<ADD_F32>, + &ARBDecompiler::Binary<MUL_F32>, + &ARBDecompiler::Binary<DIV_F32>, + &ARBDecompiler::Trinary<MAD_F32>, + &ARBDecompiler::Negate<'F'>, + &ARBDecompiler::Absolute<'F'>, + &ARBDecompiler::FClamp, + &ARBDecompiler::FCastHalf0, + &ARBDecompiler::FCastHalf1, + &ARBDecompiler::Binary<MIN_F>, + &ARBDecompiler::Binary<MAX_F>, + &ARBDecompiler::Unary<COS_F32>, + &ARBDecompiler::Unary<SIN_F32>, + &ARBDecompiler::Unary<EX2_F32>, + &ARBDecompiler::Unary<LG2_F32>, + &ARBDecompiler::Unary<RSQ_F32>, + &ARBDecompiler::FSqrt, + &ARBDecompiler::Unary<ROUND_F>, + &ARBDecompiler::Unary<FLR_F>, + &ARBDecompiler::Unary<CEIL_F>, + &ARBDecompiler::Unary<TRUNC_F>, + &ARBDecompiler::Unary<I2F_S>, + &ARBDecompiler::Unary<I2F_U>, + &ARBDecompiler::FSwizzleAdd, + + &ARBDecompiler::Binary<ADD_S>, + &ARBDecompiler::Binary<MUL_S>, + &ARBDecompiler::Binary<DIV_S>, + &ARBDecompiler::Negate<'S'>, + &ARBDecompiler::Absolute<'S'>, + &ARBDecompiler::Binary<MIN_S>, + &ARBDecompiler::Binary<MAX_S>, + + &ARBDecompiler::Unary<TRUNC_S>, + &ARBDecompiler::Unary<MOV_U>, + &ARBDecompiler::Binary<SHL_S>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<SHR_S>, + &ARBDecompiler::Binary<AND_S>, + &ARBDecompiler::Binary<OR_S>, + &ARBDecompiler::Binary<XOR_S>, + &ARBDecompiler::Unary<NOT_S>, + &ARBDecompiler::BitfieldInsert<'S'>, + &ARBDecompiler::BitfieldExtract<'S'>, + &ARBDecompiler::Unary<BTC_S>, + &ARBDecompiler::Unary<BTFM_S>, + + &ARBDecompiler::Binary<ADD_U>, + &ARBDecompiler::Binary<MUL_U>, + &ARBDecompiler::Binary<DIV_U>, + &ARBDecompiler::Binary<MIN_U>, + 
&ARBDecompiler::Binary<MAX_U>, + &ARBDecompiler::Unary<TRUNC_U>, + &ARBDecompiler::Unary<MOV_U>, + &ARBDecompiler::Binary<SHL_U>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<SHR_U>, + &ARBDecompiler::Binary<AND_U>, + &ARBDecompiler::Binary<OR_U>, + &ARBDecompiler::Binary<XOR_U>, + &ARBDecompiler::Unary<NOT_U>, + &ARBDecompiler::BitfieldInsert<'U'>, + &ARBDecompiler::BitfieldExtract<'U'>, + &ARBDecompiler::Unary<BTC_U>, + &ARBDecompiler::Unary<BTFM_U>, + + &ARBDecompiler::HAdd2, + &ARBDecompiler::HMul2, + &ARBDecompiler::HFma2, + &ARBDecompiler::HAbsolute, + &ARBDecompiler::HNegate, + &ARBDecompiler::HClamp, + &ARBDecompiler::HCastFloat, + &ARBDecompiler::HUnpack, + &ARBDecompiler::HMergeF32, + &ARBDecompiler::HMergeH0, + &ARBDecompiler::HMergeH1, + &ARBDecompiler::HPack2, + + &ARBDecompiler::LogicalAssign, + &ARBDecompiler::Binary<AND_U>, + &ARBDecompiler::Binary<OR_U>, + &ARBDecompiler::Binary<XOR_U>, + &ARBDecompiler::Unary<NOT_U>, + &ARBDecompiler::LogicalPick2, + &ARBDecompiler::LogicalAnd2, + + &ARBDecompiler::FloatComparison<SLT_F, false>, + &ARBDecompiler::FloatComparison<SEQ_F, false>, + &ARBDecompiler::FloatComparison<SLE_F, false>, + &ARBDecompiler::FloatComparison<SGT_F, false>, + &ARBDecompiler::FloatComparison<SNE_F, false>, + &ARBDecompiler::FloatComparison<SGE_F, false>, + &ARBDecompiler::FloatOrdered, + &ARBDecompiler::FloatUnordered, + &ARBDecompiler::FloatComparison<SLT_F, true>, + &ARBDecompiler::FloatComparison<SEQ_F, true>, + &ARBDecompiler::FloatComparison<SLE_F, true>, + &ARBDecompiler::FloatComparison<SGT_F, true>, + &ARBDecompiler::FloatComparison<SNE_F, true>, + &ARBDecompiler::FloatComparison<SGE_F, true>, + + &ARBDecompiler::Binary<SLT_S>, + &ARBDecompiler::Binary<SEQ_S>, + &ARBDecompiler::Binary<SLE_S>, + &ARBDecompiler::Binary<SGT_S>, + &ARBDecompiler::Binary<SNE_S>, + &ARBDecompiler::Binary<SGE_S>, + + &ARBDecompiler::Binary<SLT_U>, + &ARBDecompiler::Binary<SEQ_U>, + &ARBDecompiler::Binary<SLE_U>, + &ARBDecompiler::Binary<SGT_U>, + &ARBDecompiler::Binary<SNE_U>, + &ARBDecompiler::Binary<SGE_U>, + + &ARBDecompiler::LogicalAddCarry, + + &ARBDecompiler::HalfComparison<SLT_F, false>, + &ARBDecompiler::HalfComparison<SEQ_F, false>, + &ARBDecompiler::HalfComparison<SLE_F, false>, + &ARBDecompiler::HalfComparison<SGT_F, false>, + &ARBDecompiler::HalfComparison<SNE_F, false>, + &ARBDecompiler::HalfComparison<SGE_F, false>, + &ARBDecompiler::HalfComparison<SLT_F, true>, + &ARBDecompiler::HalfComparison<SEQ_F, true>, + &ARBDecompiler::HalfComparison<SLE_F, true>, + &ARBDecompiler::HalfComparison<SGT_F, true>, + &ARBDecompiler::HalfComparison<SNE_F, true>, + &ARBDecompiler::HalfComparison<SGE_F, true>, + + &ARBDecompiler::Texture, + &ARBDecompiler::Texture, + &ARBDecompiler::TextureGather, + &ARBDecompiler::TextureQueryDimensions, + &ARBDecompiler::TextureQueryLod, + &ARBDecompiler::TexelFetch, + &ARBDecompiler::TextureGradient, + + &ARBDecompiler::ImageLoad, + &ARBDecompiler::ImageStore, + + &ARBDecompiler::AtomicImage<ADD, U32>, + &ARBDecompiler::AtomicImage<AND, U32>, + &ARBDecompiler::AtomicImage<OR, U32>, + &ARBDecompiler::AtomicImage<XOR, U32>, + &ARBDecompiler::AtomicImage<EXCH, U32>, + + &ARBDecompiler::Atomic<EXCH, U32>, + &ARBDecompiler::Atomic<ADD, U32>, + &ARBDecompiler::Atomic<MIN, U32>, + &ARBDecompiler::Atomic<MAX, U32>, + &ARBDecompiler::Atomic<AND, U32>, + &ARBDecompiler::Atomic<OR, U32>, + &ARBDecompiler::Atomic<XOR, U32>, + + &ARBDecompiler::Atomic<EXCH, S32>, + &ARBDecompiler::Atomic<ADD, S32>, + &ARBDecompiler::Atomic<MIN, S32>, + 
&ARBDecompiler::Atomic<MAX, S32>, + &ARBDecompiler::Atomic<AND, S32>, + &ARBDecompiler::Atomic<OR, S32>, + &ARBDecompiler::Atomic<XOR, S32>, + + &ARBDecompiler::Atomic<ADD, U32>, + &ARBDecompiler::Atomic<MIN, U32>, + &ARBDecompiler::Atomic<MAX, U32>, + &ARBDecompiler::Atomic<AND, U32>, + &ARBDecompiler::Atomic<OR, U32>, + &ARBDecompiler::Atomic<XOR, U32>, + + &ARBDecompiler::Atomic<ADD, S32>, + &ARBDecompiler::Atomic<MIN, S32>, + &ARBDecompiler::Atomic<MAX, S32>, + &ARBDecompiler::Atomic<AND, S32>, + &ARBDecompiler::Atomic<OR, S32>, + &ARBDecompiler::Atomic<XOR, S32>, + + &ARBDecompiler::Branch, + &ARBDecompiler::BranchIndirect, + &ARBDecompiler::PushFlowStack, + &ARBDecompiler::PopFlowStack, + &ARBDecompiler::Exit, + &ARBDecompiler::Discard, + + &ARBDecompiler::EmitVertex, + &ARBDecompiler::EndPrimitive, + + &ARBDecompiler::InvocationId, + &ARBDecompiler::YNegate, + &ARBDecompiler::LocalInvocationId<'x'>, + &ARBDecompiler::LocalInvocationId<'y'>, + &ARBDecompiler::LocalInvocationId<'z'>, + &ARBDecompiler::WorkGroupId<'x'>, + &ARBDecompiler::WorkGroupId<'y'>, + &ARBDecompiler::WorkGroupId<'z'>, + + &ARBDecompiler::Unary<TGBALLOT_U>, + &ARBDecompiler::Unary<TGALL_U>, + &ARBDecompiler::Unary<TGANY_U>, + &ARBDecompiler::Unary<TGEQ_U>, + + &ARBDecompiler::ThreadId, + &ARBDecompiler::ThreadMask<'e', 'q'>, + &ARBDecompiler::ThreadMask<'g', 'e'>, + &ARBDecompiler::ThreadMask<'g', 't'>, + &ARBDecompiler::ThreadMask<'l', 'e'>, + &ARBDecompiler::ThreadMask<'l', 't'>, + &ARBDecompiler::ShuffleIndexed, + + &ARBDecompiler::Barrier, + &ARBDecompiler::MemoryBarrierGroup, + &ARBDecompiler::MemoryBarrierGlobal, + }; +}; + +ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, + ShaderType stage, std::string_view identifier) + : device{device}, ir{ir}, registry{registry}, stage{stage} { + DefineGlobalMemory(); + + AddLine("TEMP RC;"); + AddLine("TEMP FSWZA[4];"); + AddLine("TEMP FSWZB[4];"); + if (ir.IsDecompiled()) { + DecompileAST(); + } else { + DecompileBranchMode(); + } + AddLine("END"); + + const std::string code = std::move(shader_source); + DeclareHeader(); + DeclareVertex(); + DeclareGeometry(); + DeclareFragment(); + DeclareCompute(); + DeclareInputAttributes(); + DeclareOutputAttributes(); + DeclareLocalMemory(); + DeclareGlobalMemory(); + DeclareConstantBuffers(); + DeclareRegisters(); + DeclareTemporaries(); + DeclarePredicates(); + DeclareInternalFlags(); + + shader_source += code; +} + +std::string_view HeaderStageName(ShaderType stage) { + switch (stage) { + case ShaderType::Vertex: + return "vp"; + case ShaderType::Geometry: + return "gp"; + case ShaderType::Fragment: + return "fp"; + case ShaderType::Compute: + return "cp"; + default: + UNREACHABLE(); + return ""; + } +} + +void ARBDecompiler::DefineGlobalMemory() { + u32 binding = 0; + for (const auto& pair : ir.GetGlobalMemory()) { + const GlobalMemoryBase base = pair.first; + global_memory_names.emplace(base, binding); + ++binding; + } +} + +void ARBDecompiler::DeclareHeader() { + AddLine("!!NV{}5.0", HeaderStageName(stage)); + // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D + AddLine("OPTION NV_internal;"); + AddLine("OPTION NV_gpu_program_fp64;"); + AddLine("OPTION NV_shader_thread_group;"); + if (ir.UsesWarps() && device.HasWarpIntrinsics()) { + AddLine("OPTION NV_shader_thread_shuffle;"); + } + if (stage == ShaderType::Vertex) { + if (device.HasNvViewportArray2()) { + AddLine("OPTION NV_viewport_array2;"); + } + } + if (stage == ShaderType::Fragment) { + 
AddLine("OPTION ARB_draw_buffers;"); + } + if (device.HasImageLoadFormatted()) { + AddLine("OPTION EXT_shader_image_load_formatted;"); + } +} + +void ARBDecompiler::DeclareVertex() { + if (stage != ShaderType::Vertex) { + return; + } + AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};"); +} + +void ARBDecompiler::DeclareGeometry() { + if (stage != ShaderType::Geometry) { + return; + } + const auto& info = registry.GetGraphicsInfo(); + const auto& header = ir.GetHeader(); + AddLine("PRIMITIVE_IN {};", PrimitiveDescription(info.primitive_topology)); + AddLine("PRIMITIVE_OUT {};", TopologyName(header.common3.output_topology)); + AddLine("VERTICES_OUT {};", header.common4.max_output_vertices.Value()); + AddLine("ATTRIB vertex_position = vertex.position;"); +} + +void ARBDecompiler::DeclareFragment() { + if (stage != ShaderType::Fragment) { + return; + } + AddLine("OUTPUT result_color7 = result.color[7];"); + AddLine("OUTPUT result_color6 = result.color[6];"); + AddLine("OUTPUT result_color5 = result.color[5];"); + AddLine("OUTPUT result_color4 = result.color[4];"); + AddLine("OUTPUT result_color3 = result.color[3];"); + AddLine("OUTPUT result_color2 = result.color[2];"); + AddLine("OUTPUT result_color1 = result.color[1];"); + AddLine("OUTPUT result_color0 = result.color;"); +} + +void ARBDecompiler::DeclareCompute() { + if (stage != ShaderType::Compute) { + return; + } + const ComputeInfo& info = registry.GetComputeInfo(); + AddLine("GROUP_SIZE {} {} {};", info.workgroup_size[0], info.workgroup_size[1], + info.workgroup_size[2]); + if (info.shared_memory_size_in_words == 0) { + return; + } + const u32 limit = device.GetMaxComputeSharedMemorySize(); + u32 size_in_bytes = info.shared_memory_size_in_words * 4; + if (size_in_bytes > limit) { + LOG_ERROR(Render_OpenGL, "Shared memory size {} is clamped to host's limit {}", + size_in_bytes, limit); + size_in_bytes = limit; + } + + AddLine("SHARED_MEMORY {};", size_in_bytes); + AddLine("SHARED shared_mem[] = {{program.sharedmem}};"); +} + +void ARBDecompiler::DeclareInputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + const std::string_view stage_name = StageInputName(stage); + for (const auto attribute : ir.GetInputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + + std::string_view suffix; + if (stage == ShaderType::Fragment) { + const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)}; + if (input_mode == PixelImap::Unused) { + return; + } + suffix = GetInputFlags(input_mode); + } + AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index, + index); + } +} + +void ARBDecompiler::DeclareOutputAttributes() { + if (stage == ShaderType::Compute) { + return; + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", index, index, index); + } +} + +void ARBDecompiler::DeclareLocalMemory() { + u64 size = 0; + if (stage == ShaderType::Compute) { + size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL; + } else { + size = ir.GetHeader().GetLocalMemorySize(); + } + if (size == 0) { + return; + } + const u64 element_count = Common::AlignUp(size, 4) / 4; + AddLine("TEMP lmem[{}];", element_count); +} + +void ARBDecompiler::DeclareGlobalMemory() { + const size_t num_entries = ir.GetGlobalMemory().size(); + if (num_entries > 
0) { + AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_entries, num_entries - 1); + } +} + +void ARBDecompiler::DeclareConstantBuffers() { + u32 binding = 0; + for (const auto& cbuf : ir.GetConstantBuffers()) { + AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", cbuf.first, binding); + ++binding; + } +} + +void ARBDecompiler::DeclareRegisters() { + for (const u32 gpr : ir.GetRegisters()) { + AddLine("TEMP R{};", gpr); + } +} + +void ARBDecompiler::DeclareTemporaries() { + for (std::size_t i = 0; i < max_temporaries; ++i) { + AddLine("TEMP T{};", i); + } + for (std::size_t i = 0; i < max_long_temporaries; ++i) { + AddLine("LONG TEMP L{};", i); + } +} + +void ARBDecompiler::DeclarePredicates() { + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("TEMP P{};", static_cast<u64>(pred)); + } +} + +void ARBDecompiler::DeclareInternalFlags() { + for (const char* name : INTERNAL_FLAG_NAMES) { + AddLine("TEMP {};", name); + } +} + +void ARBDecompiler::InitializeVariables() { + AddLine("MOV.F32 FSWZA[0], -1;"); + AddLine("MOV.F32 FSWZA[1], 1;"); + AddLine("MOV.F32 FSWZA[2], -1;"); + AddLine("MOV.F32 FSWZA[3], 0;"); + AddLine("MOV.F32 FSWZB[0], -1;"); + AddLine("MOV.F32 FSWZB[1], -1;"); + AddLine("MOV.F32 FSWZB[2], 1;"); + AddLine("MOV.F32 FSWZB[3], -1;"); + + if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) { + AddLine("MOV.F result.position, {{0, 0, 0, 1}};"); + } + for (const auto attribute : ir.GetOutputAttributes()) { + if (!IsGenericAttribute(attribute)) { + continue; + } + const u32 index = GetGenericAttributeIndex(attribute); + AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", index); + } + for (const u32 gpr : ir.GetRegisters()) { + AddLine("MOV.F R{}, {{0, 0, 0, 0}};", gpr); + } + for (const Tegra::Shader::Pred pred : ir.GetPredicates()) { + AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(pred)); + } +} + +void ARBDecompiler::DecompileAST() { + const u32 num_flow_variables = ir.GetASTNumVariables(); + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("TEMP F{};", i); + } + for (u32 i = 0; i < num_flow_variables; ++i) { + AddLine("MOV.U F{}, {{0, 0, 0, 0}};", i); + } + + InitializeVariables(); + + VisitAST(ir.GetASTProgram()); +} + +void ARBDecompiler::DecompileBranchMode() { + static constexpr u32 FLOW_STACK_SIZE = 20; + if (!ir.IsFlowStackDisabled()) { + AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE); + AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE); + AddLine("TEMP SSY_TOP;"); + AddLine("TEMP PBK_TOP;"); + } + + AddLine("TEMP PC;"); + + if (!ir.IsFlowStackDisabled()) { + AddLine("MOV.U SSY_TOP.x, 0;"); + AddLine("MOV.U PBK_TOP.x, 0;"); + } + + InitializeVariables(); + + const auto basic_block_end = ir.GetBasicBlocks().end(); + auto basic_block_it = ir.GetBasicBlocks().begin(); + const u32 first_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", first_address); + + AddLine("REP;"); + + std::size_t num_blocks = 0; + while (basic_block_it != basic_block_end) { + const auto& [address, bb] = *basic_block_it; + ++num_blocks; + + AddLine("SEQ.S.CC RC.x, PC.x, {};", address); + AddLine("IF NE.x;"); + + VisitBlock(bb); + + ++basic_block_it; + + if (basic_block_it != basic_block_end) { + const auto op = std::get_if<OperationNode>(&*bb[bb.size() - 1]); + if (!op || op->GetCode() != OperationCode::Branch) { + const u32 next_address = basic_block_it->first; + AddLine("MOV.U PC.x, {};", next_address); + AddLine("CONT;"); + } + } + + AddLine("ELSE;"); + } + AddLine("RET;"); + while (num_blocks--) { + AddLine("ENDIF;"); + } + + 
AddLine("ENDREP;"); +} + +void ARBDecompiler::VisitAST(const ASTNode& node) { + if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) { + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + + AddLine("MOVC.U RC.x, {};", condition); + AddLine("IF NE.x;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("ENDIF;"); + } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) { + AddLine("ELSE;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) { + VisitBlock(ast->nodes); + } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) { + AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition)); + ResetTemporaries(); + } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(ast->condition); + ResetTemporaries(); + AddLine("REP;"); + for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + VisitAST(current); + } + AddLine("MOVC.U RC.x, {};", condition); + AddLine("BRK (NE.x);"); + AddLine("ENDREP;"); + } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) { + const bool is_true = ExprIsTrue(ast->condition); + if (!is_true) { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("IF NE.x;"); + ResetTemporaries(); + } + if (ast->kills) { + AddLine("KIL TR;"); + } else { + Exit(); + } + if (!is_true) { + AddLine("ENDIF;"); + } + } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) { + if (ExprIsTrue(ast->condition)) { + AddLine("BRK;"); + } else { + AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("BRK (NE.x);"); + ResetTemporaries(); + } + } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) { + // Nothing to do + } else { + UNREACHABLE(); + } +} + +std::string ARBDecompiler::VisitExpression(const Expr& node) { + if (const auto expr = std::get_if<ExprAnd>(&*node)) { + std::string result = AllocTemporary(); + AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if<ExprOr>(&*node)) { + std::string result = AllocTemporary(); + AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1), + VisitExpression(expr->operand2)); + return result; + } + if (const auto expr = std::get_if<ExprNot>(&*node)) { + std::string result = AllocTemporary(); + AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1)); + return result; + } + if (const auto expr = std::get_if<ExprPredicate>(&*node)) { + return fmt::format("P{}.x", static_cast<u64>(expr->predicate)); + } + if (const auto expr = std::get_if<ExprCondCode>(&*node)) { + return Visit(ir.GetConditionCode(expr->cc)); + } + if (const auto expr = std::get_if<ExprVar>(&*node)) { + return fmt::format("F{}.x", expr->var_index); + } + if (const auto expr = std::get_if<ExprBoolean>(&*node)) { + return expr->value ? 
"0xffffffff" : "0"; + } + if (const auto expr = std::get_if<ExprGprEqual>(&*node)) { + std::string result = AllocTemporary(); + AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value); + return result; + } + UNREACHABLE(); + return "0"; +} + +void ARBDecompiler::VisitBlock(const NodeBlock& bb) { + for (const auto& node : bb) { + Visit(node); + } +} + +std::string ARBDecompiler::Visit(const Node& node) { + if (const auto operation = std::get_if<OperationNode>(&*node)) { + if (const auto amend_index = operation->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + const std::size_t index = static_cast<std::size_t>(operation->GetCode()); + if (index >= OPERATION_DECOMPILERS.size()) { + UNREACHABLE_MSG("Out of bounds operation: {}", index); + return {}; + } + const auto decompiler = OPERATION_DECOMPILERS[index]; + if (decompiler == nullptr) { + UNREACHABLE_MSG("Undefined operation: {}", index); + return {}; + } + return (this->*decompiler)(*operation); + } + + if (const auto gpr = std::get_if<GprNode>(&*node)) { + const u32 index = gpr->GetIndex(); + if (index == Register::ZeroIndex) { + return "{0, 0, 0, 0}.x"; + } + return fmt::format("R{}.x", index); + } + + if (const auto cv = std::get_if<CustomVarNode>(&*node)) { + return fmt::format("CV{}.x", cv->GetIndex()); + } + + if (const auto immediate = std::get_if<ImmediateNode>(&*node)) { + std::string temporary = AllocTemporary(); + AddLine("MOV.U {}, {};", temporary, immediate->GetValue()); + return temporary; + } + + if (const auto predicate = std::get_if<PredicateNode>(&*node)) { + std::string temporary = AllocTemporary(); + switch (const auto index = predicate->GetIndex(); index) { + case Tegra::Shader::Pred::UnusedIndex: + AddLine("MOV.S {}, -1;", temporary); + break; + case Tegra::Shader::Pred::NeverExecute: + AddLine("MOV.S {}, 0;", temporary); + break; + default: + AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index)); + break; + } + if (predicate->IsNegated()) { + AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary); + } + return temporary; + } + + if (const auto abuf = std::get_if<AbufNode>(&*node)) { + if (abuf->IsPhysicalBuffer()) { + UNIMPLEMENTED_MSG("Physical buffers are not implemented"); + return "{0, 0, 0, 0}.x"; + } + + const Attribute::Index index = abuf->GetIndex(); + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (index) { + case Attribute::Index::Position: { + if (stage == ShaderType::Geometry) { + return fmt::format("{}_position[{}].{}", StageInputName(stage), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.position.{}", StageInputName(stage), swizzle); + } + } + case Attribute::Index::TessCoordInstanceIDVertexID: + ASSERT(stage == ShaderType::Vertex); + switch (element) { + case 2: + return "vertex.instance"; + case 3: + return "vertex.id"; + } + UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); + break; + case Attribute::Index::PointCoord: + switch (element) { + case 0: + return "fragment.pointcoord.x"; + case 1: + return "fragment.pointcoord.y"; + } + UNIMPLEMENTED(); + break; + case Attribute::Index::FrontFacing: { + ASSERT(stage == ShaderType::Fragment); + ASSERT(element == 3); + const std::string temporary = AllocVectorTemporary(); + AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};"); + AddLine("MOV.U.CC RC.x, -RC;"); + AddLine("MOV.S {}.x, 0;", temporary); + AddLine("MOV.S {}.x (NE.x), -1;", temporary); + return fmt::format("{}.x", temporary); + } + default: + if 
(IsGenericAttribute(index)) { + if (stage == ShaderType::Geometry) { + return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index), + Visit(abuf->GetBuffer()), swizzle); + } else { + return fmt::format("{}.attrib[{}].{}", StageInputName(stage), + GetGenericAttributeIndex(index), swizzle); + } + } + UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index)); + break; + } + return "{0, 0, 0, 0}.x"; + } + + if (const auto cbuf = std::get_if<CbufNode>(&*node)) { + std::string offset_string; + const auto& offset = cbuf->GetOffset(); + if (const auto imm = std::get_if<ImmediateNode>(&*offset)) { + offset_string = std::to_string(imm->GetValue()); + } else { + offset_string = Visit(offset); + } + std::string temporary = AllocTemporary(); + AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string); + return temporary; + } + + if (const auto gmem = std::get_if<GmemNode>(&*node)) { + std::string temporary = AllocTemporary(); + AddLine("MOV {}, 0;", temporary); + AddLine("LOAD.U32 {} (NE.x), {};", temporary, GlobalMemoryPointer(*gmem)); + return temporary; + } + + if (const auto lmem = std::get_if<LmemNode>(&*node)) { + std::string temporary = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", temporary, temporary); + AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary); + return temporary; + } + + if (const auto smem = std::get_if<SmemNode>(&*node)) { + std::string temporary = Visit(smem->GetAddress()); + AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary); + return temporary; + } + + if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { + const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag()); + return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } + + if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + if (const auto amend_index = conditional->GetAmendIndex()) { + Visit(ir.GetAmendNode(*amend_index)); + } + AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition())); + AddLine("IF NE.x;"); + VisitBlock(conditional->GetCode()); + AddLine("ENDIF;"); + return {}; + } + + if ([[maybe_unused]] const auto cmt = std::get_if<CommentNode>(&*node)) { + // Uncommenting this will generate invalid code. GLASM lacks comments. 
+ // AddLine("// {}", cmt->GetText()); + return {}; + } + + UNIMPLEMENTED(); + return {}; +} + +std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + UNIMPLEMENTED_IF(meta.sampler.is_indexed); + + const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array && + meta.sampler.type == Tegra::Shader::TextureType::TextureCube; + const std::size_t count = operation.GetOperandsCount(); + std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + if (meta.sampler.is_array) { + AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array)); + ++i; + } + if (meta.sampler.is_shadow) { + std::string compare = Visit(meta.depth_compare); + if (is_extended) { + ASSERT(i == 4); + std::string extra_coord = AllocVectorTemporary(); + AddLine("MOV.F {}.x, {};", extra_coord, compare); + return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0}; + } + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare); + ++i; + } + return {temporary, temporary, i}; +} + +std::string ARBDecompiler::BuildAoffi(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + if (meta.aoffi.empty()) { + return {}; + } + const std::string temporary = AllocVectorTemporary(); + std::size_t i = 0; + for (auto& node : meta.aoffi) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i++), Visit(node)); + } + return fmt::format(", offset({})", temporary); +} + +std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) { + // Read a bindless SSBO, return its address and set CC accordingly + // address = c[binding].xy + // length = c[binding].z + const u32 binding = global_memory_names.at(gmem.GetDescriptor()); + + const std::string pointer = AllocLongVectorTemporary(); + std::string temporary = AllocTemporary(); + + AddLine("PK64.U {}, c[{}];", pointer, binding); + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()), + Visit(gmem.GetBaseAddress())); + AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary); + AddLine("ADD.U64 {}.x, {}.x, {}.z;", pointer, pointer, pointer); + // Compare offset to length and set CC + AddLine("SLT.U.CC RC.x, {}, c[{}].z;", temporary, binding); + return fmt::format("{}.x", pointer); +} + +void ARBDecompiler::Exit() { + if (stage != ShaderType::Fragment) { + AddLine("RET;"); + return; + } + + const auto safe_get_register = [this](u32 reg) -> std::string { + // TODO(Rodrigo): Replace with contains once C++20 releases + const auto& used_registers = ir.GetRegisters(); + if (used_registers.find(reg) != used_registers.end()) { + return fmt::format("R{}.x", reg); + } + return "{0, 0, 0, 0}.x"; + }; + + const auto& header = ir.GetHeader(); + u32 current_reg = 0; + for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) { + for (u32 component = 0; component < 4; ++component) { + if (!header.ps.IsColorComponentOutputEnabled(rt, component)) { + continue; + } + AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component), + safe_get_register(current_reg)); + ++current_reg; + } + } + if (header.ps.omap.depth) { + AddLine("MOV.F result.depth.z, {};", safe_get_register(current_reg + 1)); + } + + AddLine("RET;"); +} + +std::string ARBDecompiler::Assign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string dest_name; + if (const auto 
gpr = std::get_if<GprNode>(&*dest)) { + if (gpr->GetIndex() == Register::ZeroIndex) { + // Writing to Register::ZeroIndex is a no op + return {}; + } + dest_name = fmt::format("R{}.x", gpr->GetIndex()); + } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { + const u32 element = abuf->GetElement(); + const char swizzle = Swizzle(element); + switch (const Attribute::Index index = abuf->GetIndex()) { + case Attribute::Index::Position: + dest_name = fmt::format("result.position.{}", swizzle); + break; + case Attribute::Index::LayerViewportPointSize: + switch (element) { + case 0: + UNIMPLEMENTED(); + return {}; + case 1: + case 2: + if (!device.HasNvViewportArray2()) { + LOG_ERROR( + Render_OpenGL, + "NV_viewport_array2 is missing. Maxwell gen 2 or better is required."); + return {}; + } + dest_name = element == 1 ? "result.layer.x" : "result.viewport.x"; + break; + case 3: + dest_name = "result.pointsize.x"; + break; + } + break; + case Attribute::Index::ClipDistances0123: + dest_name = fmt::format("result.clip[{}].x", element); + break; + case Attribute::Index::ClipDistances4567: + dest_name = fmt::format("result.clip[{}].x", element + 4); + break; + default: + if (!IsGenericAttribute(index)) { + UNREACHABLE(); + return {}; + } + dest_name = + fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle); + break; + } + } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { + const std::string address = Visit(lmem->GetAddress()); + AddLine("SHR.U {}, {}, 2;", address, address); + dest_name = fmt::format("lmem[{}].x", address); + } else if (const auto smem = std::get_if<SmemNode>(&*dest)) { + AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress())); + ResetTemporaries(); + return {}; + } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { + AddLine("IF NE.x;"); + AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem)); + AddLine("ENDIF;"); + ResetTemporaries(); + return {}; + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", dest_name, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::Select(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]), + Visit(operation[2])); + return temporary; +} + +std::string ARBDecompiler::FClamp(Operation operation) { + // 1.0f in hex, replace with std::bit_cast on C++20 + static constexpr u32 POSITIVE_ONE = 0x3f800000; + + std::string temporary = AllocTemporary(); + const Node& value = operation[0]; + const Node& low = operation[1]; + const Node& high = operation[2]; + const auto* const imm_low = std::get_if<ImmediateNode>(&*low); + const auto* const imm_high = std::get_if<ImmediateNode>(&*high); + if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) { + AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value)); + } else { + AddLine("MIN.F {}, {}, {};", temporary, Visit(value), Visit(high)); + AddLine("MAX.F {}, {}, {};", temporary, temporary, Visit(low)); + } + return temporary; +} + +std::string ARBDecompiler::FCastHalf0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.x, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FCastHalf1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.y, {};", temporary, 
Visit(operation[0])); + AddLine("MOV {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::FSqrt(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0])); + AddLine("RCP.F32 {}, {};", temporary, temporary); + return temporary; +} + +std::string ARBDecompiler::FSwizzleAdd(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. Kepler or better is required."); + AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1])); + return fmt::format("{}.x", temporary); + } + + AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage)); + AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary); + AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary); + AddLine("AND.U {}.z, {}.z, 3;", temporary, temporary); + AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", temporary, Visit(operation[0]), temporary); + AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", temporary, Visit(operation[1]), temporary); + AddLine("ADD.F32 {}.x, {}.x, {}.y;", temporary, temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HAdd2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("ADD.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HMul2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("MUL.F16 {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HFma2(Operation operation) { + const std::string tmp1 = AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + const std::string tmp3 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1])); + AddLine("UP2H.F {}.xy, {};", tmp3, Visit(operation[2])); + AddLine("MAD.F16 {}, {}, {}, {};", tmp1, tmp1, tmp2, tmp3); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HAbsolute(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, |{}|;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HNegate(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("MOVC.S RC.x, {};", Visit(operation[1])); + AddLine("MOV.F {}.x (NE.x), -{}.x;", temporary, temporary); + AddLine("MOVC.S RC.x, {};", Visit(operation[2])); + AddLine("MOV.F {}.y (NE.x), -{}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HClamp(Operation operation) { + const std::string tmp1 = 
AllocVectorTemporary(); + const std::string tmp2 = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0])); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MAX.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[2])); + AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2); + AddLine("MIN.F {}, {}, {};", tmp1, tmp1, tmp2); + AddLine("PK2H.F {}.x, {};", tmp1, tmp1); + return fmt::format("{}.x", tmp1); +} + +std::string ARBDecompiler::HCastFloat(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", temporary); + AddLine("MOV.F {}.x, {};", temporary, Visit(operation[0])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HUnpack(Operation operation) { + std::string operand = Visit(operation[0]); + switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H0_H0: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.y, {}.x;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + case Tegra::Shader::HalfType::H1_H1: { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, operand); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); + } + } + UNREACHABLE(); + return "{0, 0, 0, 0}.x"; +} + +std::string ARBDecompiler::HMergeF32(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH0(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.x, {}.z;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HMergeH1(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0])); + AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1])); + AddLine("MOV.U {}.y, {}.w;", temporary, temporary); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::HPack2(Operation operation) { + const std::string temporary = AllocVectorTemporary(); + AddLine("MOV.U {}.x, {};", temporary, Visit(operation[0])); + AddLine("MOV.U {}.y, {};", temporary, Visit(operation[1])); + AddLine("PK2H.F {}.x, {};", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::LogicalAssign(Operation operation) { + const Node& dest = operation[0]; + const Node& src = operation[1]; + + std::string target; + + 
if (const auto pred = std::get_if<PredicateNode>(&*dest)) { + ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment"); + + const Tegra::Shader::Pred index = pred->GetIndex(); + switch (index) { + case Tegra::Shader::Pred::NeverExecute: + case Tegra::Shader::Pred::UnusedIndex: + // Writing to these predicates is a no-op + return {}; + } + target = fmt::format("P{}.x", static_cast<u64>(index)); + } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) { + const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag()); + target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]); + } else { + UNREACHABLE(); + ResetTemporaries(); + return {}; + } + + AddLine("MOV.U {}, {};", target, Visit(src)); + ResetTemporaries(); + return {}; +} + +std::string ARBDecompiler::LogicalPick2(Operation operation) { + std::string temporary = AllocTemporary(); + const u32 index = std::get<ImmediateNode>(*operation[1]).GetValue(); + AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index)); + return temporary; +} + +std::string ARBDecompiler::LogicalAnd2(Operation operation) { + std::string temporary = AllocTemporary(); + const std::string op = Visit(operation[0]); + AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op); + return temporary; +} + +std::string ARBDecompiler::FloatOrdered(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, -1;", temporary); + AddLine("MOV.S {} (NAN.x), 0;", temporary); + AddLine("MOV.S {} (NAN.y), 0;", temporary); + return temporary; +} + +std::string ARBDecompiler::FloatUnordered(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("MOVC.F32 RC.x, {};", Visit(operation[0])); + AddLine("MOVC.F32 RC.y, {};", Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("MOV.S {} (NAN.x), -1;", temporary); + AddLine("MOV.S {} (NAN.y), -1;", temporary); + return temporary; +} + +std::string ARBDecompiler::LogicalAddCarry(Operation operation) { + std::string temporary = AllocTemporary(); + AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1])); + AddLine("MOV.S {}, 0;", temporary); + AddLine("IF CF.x;"); + AddLine("MOV.S {}, -1;", temporary); + AddLine("ENDIF;"); + return temporary; +} + +std::string ARBDecompiler::Texture(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + std::string_view opcode = "TEX"; + std::string extra; + if (meta.bias) { + ASSERT(!meta.lod); + opcode = "TXB"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.bias)); + } else { + const std::string bias = AllocTemporary(); + AddLine("MOV.F {}, {};", bias, Visit(meta.bias)); + extra = fmt::format(" {},", bias); + } + } + if (meta.lod) { + ASSERT(!meta.bias); + opcode = "TXL"; + + if (swizzle < 4) { + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } else { + const std::string lod = AllocTemporary(); + AddLine("MOV.F {}, {};", lod, Visit(meta.lod)); + extra = fmt::format(" {},", lod); + } + } + + AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + 
+std::string ARBDecompiler::TextureGather(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + std::string comp; + if (!meta.sampler.is_shadow) { + const auto& immediate = std::get<ImmediateNode>(*meta.component); + comp = fmt::format(".{}", Swizzle(immediate.GetValue())); + } + + AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryDimensions(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::string lod = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0"; + AddLine("TXQ {}, {}, texture[{}], {};", temporary, lod, sampler_id, TextureType(meta)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureQueryLod(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const std::string temporary = AllocVectorTemporary(); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + + ASSERT(!meta.sampler.is_array); + + const std::size_t count = operation.GetOperandsCount(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOD.F {}, {}, texture[{}], {};", temporary, temporary, sampler_id, TextureType(meta)); + AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", temporary, temporary); + AddLine("TRUNC.S {}, {};", temporary, temporary); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TexelFetch(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const auto [coords, temporary, swizzle] = BuildCoords(operation); + + if (!meta.sampler.is_buffer) { + ASSERT(swizzle < 4); + AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); + } + AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta), + BuildAoffi(operation)); + AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::TextureGradient(Operation operation) { + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); + const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; + const std::string ddx = AllocVectorTemporary(); + const std::string ddy = AllocVectorTemporary(); + const std::string coord = std::get<1>(BuildCoords(operation)); + + const std::size_t num_components = meta.derivates.size() / 2; + for (std::size_t index = 0; index < num_components; ++index) { + const char swizzle = Swizzle(index); + AddLine("MOV.F {}.{}, {};", ddx, swizzle, Visit(meta.derivates[index * 2])); + AddLine("MOV.F {}.{}, {};", ddy, swizzle, Visit(meta.derivates[index * 2 + 1])); + } + + 
const std::string_view result = coord; + AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", result, coord, ddx, ddy, sampler_id, + TextureType(meta), BuildAoffi(operation)); + AddLine("MOV.F {}.x, {}.{};", result, result, Swizzle(meta.element)); + return fmt::format("{}.x", result); +} + +std::string ARBDecompiler::ImageLoad(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t count = operation.GetOperandsCount(); + const std::string_view type = ImageType(meta.image.type); + + const std::string temporary = AllocVectorTemporary(); + for (std::size_t i = 0; i < count; ++i) { + AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); + } + AddLine("LOADIM.F {}, {}, image[{}], {};", temporary, temporary, image_id, type); + AddLine("MOV.F {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::ImageStore(Operation operation) { + const auto& meta = std::get<MetaImage>(operation.GetMeta()); + const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index; + const std::size_t num_coords = operation.GetOperandsCount(); + const std::size_t num_values = meta.values.size(); + const std::string_view type = ImageType(meta.image.type); + + const std::string coord = AllocVectorTemporary(); + const std::string value = AllocVectorTemporary(); + for (std::size_t i = 0; i < num_coords; ++i) { + AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i])); + } + for (std::size_t i = 0; i < num_values; ++i) { + AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i])); + } + AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value, coord, type); + return {}; +} + +std::string ARBDecompiler::Branch(Operation operation) { + const auto target = std::get<ImmediateNode>(*operation[0]); + AddLine("MOV.U PC.x, {};", target.GetValue()); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::BranchIndirect(Operation operation) { + AddLine("MOV.U PC.x, {};", Visit(operation[0])); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::PushFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + const u32 target = std::get<ImmediateNode>(*operation[0]).GetValue(); + const std::string_view stack_name = StackName(stack); + AddLine("MOV.U {}[{}_TOP.x].x, {};", stack_name, stack_name, target); + AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + return {}; +} + +std::string ARBDecompiler::PopFlowStack(Operation operation) { + const auto stack = std::get<MetaStackClass>(operation.GetMeta()); + const std::string_view stack_name = StackName(stack); + AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name); + AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", stack_name, stack_name); + AddLine("CONT;"); + return {}; +} + +std::string ARBDecompiler::Exit(Operation) { + Exit(); + return {}; +} + +std::string ARBDecompiler::Discard(Operation) { + AddLine("KIL TR;"); + return {}; +} + +std::string ARBDecompiler::EmitVertex(Operation) { + AddLine("EMIT;"); + return {}; +} + +std::string ARBDecompiler::EndPrimitive(Operation) { + AddLine("ENDPRIM;"); + return {}; +} + +std::string ARBDecompiler::InvocationId(Operation) { + return "primitive.invocation"; +} + +std::string ARBDecompiler::YNegate(Operation) { + LOG_WARNING(Render_OpenGL, "(STUBBED)"); + std::string temporary = AllocTemporary(); + 
AddLine("MOV.F {}, 1;", temporary); + return temporary; +} + +std::string ARBDecompiler::ThreadId(Operation) { + return fmt::format("{}.threadid", StageInputName(stage)); +} + +std::string ARBDecompiler::ShuffleIndexed(Operation operation) { + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, + "NV_shader_thread_shuffle is missing. Kepler or better is required."); + return Visit(operation[0]); + } + const std::string temporary = AllocVectorTemporary(); + AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]), + Visit(operation[1])); + AddLine("MOV.U {}.x, {}.y;", temporary, temporary); + return fmt::format("{}.x", temporary); +} + +std::string ARBDecompiler::Barrier(Operation) { + AddLine("BAR;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGroup(Operation) { + AddLine("MEMBAR.CTA;"); + return {}; +} + +std::string ARBDecompiler::MemoryBarrierGlobal(Operation) { + AddLine("MEMBAR;"); + return {}; +} + +} // Anonymous namespace + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier) { + return ARBDecompiler(device, ir, registry, stage, identifier).Code(); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h new file mode 100644 index 000000000..6afc87220 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <string> +#include <string_view> + +#include "common/common_types.h" + +namespace Tegra::Engines { +enum class ShaderType : u32; +} + +namespace VideoCommon::Shader { +class ShaderIR; +class Registry; +} // namespace VideoCommon::Shader + +namespace OpenGL { + +class Device; + +std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + Tegra::Engines::ShaderType stage, std::string_view identifier); + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 4eb37a96c..b1c4cd62f 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -8,6 +8,7 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" @@ -21,22 +22,54 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size) +Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { gl_buffer.Create(); glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); + if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); + } } -CachedBufferBlock::~CachedBufferBlock() = default; 
+Buffer::~Buffer() = default; + +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { + glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), + data); +} -OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, - const Device& device, std::size_t stream_size) - : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} { +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { + MICROPROFILE_SCOPE(OpenGL_Buffer_Download); + const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size); + const GLintptr gl_offset = static_cast<GLintptr>(offset); + if (read_buffer.handle == 0) { + read_buffer.Create(); + glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr, + GL_STREAM_READ); + } + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size); + glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data); +} + +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) { + glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), + static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); +} + +OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, + const Device& device_, std::size_t stream_size) + : GenericBufferCache{rasterizer, gpu_memory, cpu_memory, + std::make_unique<OGLStreamBuffer>(device_, stream_size, true)}, + device{device_} { if (!device.HasFastBufferSubData()) { return; } - static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); + static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); for (const GLuint cbuf : cbufs) { glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); @@ -47,49 +80,21 @@ OGLBufferCache::~OGLBufferCache() { glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); } -Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<CachedBufferBlock>(cpu_addr, size); -} - -void OGLBufferCache::WriteBarrier() { - glMemoryBarrier(GL_ALL_BARRIER_BITS); +std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared<Buffer>(device, cpu_addr, size); } -const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); -} - -const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) { - static const GLuint null_buffer = 0; - return &null_buffer; -} - -void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { - glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { - MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size), data); -} - -void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { - glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(), - 
static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset), - static_cast<GLsizeiptr>(size)); +OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { + return {0, 0, 0}; } OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, std::size_t size) { DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); - const GLuint& cbuf = cbufs[cbuf_cursor++]; + const GLuint cbuf = cbufs[cbuf_cursor++]; + glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); - return {&cbuf, 0}; + return {cbuf, 0, 0}; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index d94a11252..f75b32e31 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" @@ -24,59 +23,59 @@ class Device; class OGLStreamBuffer; class RasterizerOpenGL; -class CachedBufferBlock; +class Buffer : public VideoCommon::BufferBlock { +public: + explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size); + ~Buffer(); -using Buffer = std::shared_ptr<CachedBufferBlock>; -using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; + void Upload(std::size_t offset, std::size_t size, const u8* data); -class CachedBufferBlock : public VideoCommon::BufferBlock { -public: - explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size); - ~CachedBufferBlock(); + void Download(std::size_t offset, std::size_t size, u8* data); + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size); - const GLuint* GetHandle() const { - return &gl_buffer.handle; + GLuint Handle() const noexcept { + return gl_buffer.handle; + } + + u64 Address() const noexcept { + return gpu_address; } private: - OGLBuffer gl_buffer{}; + OGLBuffer gl_buffer; + OGLBuffer read_buffer; + u64 gpu_address = 0; }; +using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; class OGLBufferCache final : public GenericBufferCache { public: - explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system, + explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, const Device& device, std::size_t stream_size); ~OGLBufferCache(); - const GLuint* GetEmptyBuffer(std::size_t) override; + BufferInfo GetEmptyBuffer(std::size_t) override; void Acquire() noexcept { cbuf_cursor = 0; } protected: - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - - void WriteBarrier() override; - - const GLuint* ToHandle(const Buffer& buffer) override; - - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; + std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; private: + static constexpr 
std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + + const Device& device; + std::size_t cbuf_cursor = 0; - std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram> - cbufs; + std::array<GLuint, NUM_CBUFS> cbufs{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index c286502ba..a94e4f72e 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -6,6 +6,7 @@ #include <array> #include <cstddef> #include <cstring> +#include <limits> #include <optional> #include <vector> @@ -13,6 +14,7 @@ #include "common/logging/log.h" #include "common/scope_exit.h" +#include "core/settings.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1; constexpr u32 NumStages = 5; -constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, - GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, - GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS}; +constexpr std::array LimitUBOs = { + GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, + GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS, + GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS}; constexpr std::array LimitSSBOs = { - GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, + GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, - GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS}; + GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS}; -constexpr std::array LimitSamplers = { - GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, - GL_MAX_TEXTURE_IMAGE_UNITS}; +constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, + GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, + GL_MAX_TEXTURE_IMAGE_UNITS, + GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS}; -constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS, - GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, - GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, - GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS}; +constexpr std::array LimitImages = { + GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, + GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS, + GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS}; template <typename T> T GetInteger(GLenum pname) { @@ -84,10 +89,17 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { return std::exchange(base, base + amount); } +std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept { + std::array<u32, Tegra::Engines::MaxShaderTypes> max; + std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), + [](GLenum pname) { return GetInteger<u32>(pname); }); + return max; +} + std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept { std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings; - static 
std::array<std::size_t, 5> stage_swizzle = {0, 1, 2, 3, 4}; + static constexpr std::array<std::size_t, 5> stage_swizzle{0, 1, 2, 3, 4}; const u32 total_ubos = GetInteger<u32>(GL_MAX_UNIFORM_BUFFER_BINDINGS); const u32 total_ssbos = GetInteger<u32>(GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS); const u32 total_samplers = GetInteger<u32>(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS); @@ -111,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS); u32 base_images = 0; - // Reserve more image bindings on fragment and vertex stages. + // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8. + // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the + // fragment stage, and at least 1 for the rest of the stages. + // So far games are observed to use 1 image binding on vertex and 4 on fragment stages. + + // Reserve at least 4 image bindings on the fragment stage. bindings[4].image = - Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]); - bindings[0].image = - Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]); + Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]); + + // This is guaranteed to be at least 1. + const u32 total_extracted_images = num_images / (NumStages - 1); // Reserve the other image bindings. - const u32 total_extracted_images = num_images / (NumStages - 2); - for (std::size_t i = 2; i < NumStages; ++i) { + for (std::size_t i = 0; i < NumStages; ++i) { const std::size_t stage = stage_swizzle[i]; + if (stage == 4) { + continue; + } bindings[stage].image = Extract(base_images, num_images, total_extracted_images, LimitImages[stage]); } @@ -132,6 +152,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin } bool IsASTCSupported() { + static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY}; static constexpr std::array formats = { GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR, @@ -148,59 +169,94 @@ bool IsASTCSupported() { GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR, }; - return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) { - GLint supported; - glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1, - &supported); - return supported == GL_TRUE; - }) == formats.end(); + static constexpr std::array required_support = { + GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE, + GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE, + }; + + for (const GLenum target : targets) { + for (const GLenum format : formats) { + for (const GLenum support : required_support) { + GLint value; + glGetInternalformativ(target, format, support, 1, &value); + if (value != GL_FULL_SUPPORT) { + return false; + } + } + } + } + return true; } } // Anonymous namespace -Device::Device() : base_bindings{BuildBaseBindings()} { +Device::Device() + : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); - const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); + const std::string_view version = reinterpret_cast<const 
char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; - const bool is_intel = vendor == "Intel"; - const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr; + + bool disable_fast_buffer_sub_data = false; + if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { + LOG_WARNING( + Render_OpenGL, + "Beta driver 443.24 is known to have issues. There might be performance issues."); + disable_fast_buffer_sub_data = true; + } uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); + max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE); has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group && GLAD_GL_NV_shader_thread_shuffle; has_shader_ballot = GLAD_GL_ARB_shader_ballot; has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array; has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted"); + has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod"); has_astc = IsASTCSupported(); has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); - has_broken_compute = is_intel_proprietary; - has_fast_buffer_sub_data = is_nvidia; + has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; + has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory; + + // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive + // uniform buffers as "push constants" + has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data; + + use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() && + GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && + GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; + + use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); + + if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) { + LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported"); + } } Device::Device(std::nullptr_t) { - uniform_buffer_alignment = 0; + max_uniform_buffers.fill(std::numeric_limits<u32>::max()); + uniform_buffer_alignment = 4; + shader_storage_alignment = 4; max_vertex_attributes = 16; max_varyings = 15; + max_compute_shared_memory_size = 0x10000; has_warp_intrinsics = true; has_shader_ballot = true; has_vertex_viewport_layer = true; has_image_load_formatted = true; + has_texture_shadow_lod = true; has_variable_aoffi = true; - has_component_indexing_bug = false; - has_broken_compute = false; - has_precise_bug = false; } bool Device::TestVariableAoffi() { diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index a55050cb5..8a4b6b9fc 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -24,6 +24,10 @@ public: explicit Device(); explicit Device(std::nullptr_t); + 
u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { + return max_uniform_buffers[static_cast<std::size_t>(shader_type)]; + } + const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept { return base_bindings[stage_index]; } @@ -48,6 +52,10 @@ public: return max_varyings; } + u32 GetMaxComputeSharedMemorySize() const { + return max_compute_shared_memory_size; + } + bool HasWarpIntrinsics() const { return has_warp_intrinsics; } @@ -64,6 +72,14 @@ public: return has_image_load_formatted; } + bool HasTextureShadowLod() const { + return has_texture_shadow_lod; + } + + bool HasVertexBufferUnifiedMemory() const { + return has_vertex_buffer_unified_memory; + } + bool HasASTC() const { return has_astc; } @@ -80,33 +96,47 @@ public: return has_precise_bug; } - bool HasBrokenCompute() const { - return has_broken_compute; - } - bool HasFastBufferSubData() const { return has_fast_buffer_sub_data; } + bool HasNvViewportArray2() const { + return has_nv_viewport_array2; + } + + bool UseAssemblyShaders() const { + return use_assembly_shaders; + } + + bool UseAsynchronousShaders() const { + return use_asynchronous_shaders; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); - std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings; + std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; + std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; std::size_t uniform_buffer_alignment{}; std::size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; + u32 max_compute_shared_memory_size{}; bool has_warp_intrinsics{}; bool has_shader_ballot{}; bool has_vertex_viewport_layer{}; bool has_image_load_formatted{}; + bool has_texture_shadow_lod{}; + bool has_vertex_buffer_unified_memory{}; bool has_astc{}; bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; - bool has_broken_compute{}; bool has_fast_buffer_sub_data{}; + bool has_nv_viewport_array2{}; + bool use_assembly_shaders{}; + bool use_asynchronous_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp new file mode 100644 index 000000000..b532fdcc2 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -0,0 +1,73 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include "common/assert.h" + +#include <glad/glad.h> + +#include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_fence_manager.h" + +namespace OpenGL { + +GLInnerFence::GLInnerFence(u32 payload, bool is_stubbed) : FenceBase(payload, is_stubbed) {} + +GLInnerFence::GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed) + : FenceBase(address, payload, is_stubbed) {} + +GLInnerFence::~GLInnerFence() = default; + +void GLInnerFence::Queue() { + if (is_stubbed) { + return; + } + ASSERT(sync_object.handle == 0); + sync_object.Create(); +} + +bool GLInnerFence::IsSignaled() const { + if (is_stubbed) { + return true; + } + ASSERT(sync_object.handle != 0); + GLsizei length; + GLint sync_status; + glGetSynciv(sync_object.handle, GL_SYNC_STATUS, sizeof(GLint), &length, &sync_status); + return sync_status == GL_SIGNALED; +} + +void GLInnerFence::Wait() { + if (is_stubbed) { + return; + } + ASSERT(sync_object.handle != 0); + glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED); +} + +FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCacheOpenGL& texture_cache, + OGLBufferCache& buffer_cache, QueryCache& query_cache) + : GenericFenceManager{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {} + +Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { + return std::make_shared<GLInnerFence>(value, is_stubbed); +} + +Fence FenceManagerOpenGL::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) { + return std::make_shared<GLInnerFence>(addr, value, is_stubbed); +} + +void FenceManagerOpenGL::QueueFence(Fence& fence) { + fence->Queue(); +} + +bool FenceManagerOpenGL::IsFenceSignaled(Fence& fence) const { + return fence->IsSignaled(); +} + +void FenceManagerOpenGL::WaitFence(Fence& fence) { + fence->Wait(); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h new file mode 100644 index 000000000..da1dcdace --- /dev/null +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -0,0 +1,52 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <memory> + +#include "common/common_types.h" +#include "video_core/fence_manager.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_query_cache.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" + +namespace OpenGL { + +class GLInnerFence : public VideoCommon::FenceBase { +public: + GLInnerFence(u32 payload, bool is_stubbed); + GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed); + ~GLInnerFence(); + + void Queue(); + + bool IsSignaled() const; + + void Wait(); + +private: + OGLSync sync_object; +}; + +using Fence = std::shared_ptr<GLInnerFence>; +using GenericFenceManager = + VideoCommon::FenceManager<Fence, TextureCacheOpenGL, OGLBufferCache, QueryCache>; + +class FenceManagerOpenGL final : public GenericFenceManager { +public: + explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache, + QueryCache& query_cache); + +protected: + Fence CreateFence(u32 value, bool is_stubbed) override; + Fence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) override; + void QueueFence(Fence& fence) override; + bool IsFenceSignaled(Fence& fence) const override; + void WaitFence(Fence& fence) override; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index f12e9f55f..1a3d9720e 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -30,12 +30,11 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace -QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer) - : VideoCommon::QueryCacheBase< - QueryCache, CachedQuery, CounterStream, HostCounter, - std::vector<OGLQuery>>{system, - static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)}, - gl_rasterizer{gl_rasterizer} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory) + : VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter>( + rasterizer, maxwell3d, gpu_memory), + gl_rasterizer{rasterizer} {} QueryCache::~QueryCache() = default; @@ -90,13 +89,15 @@ u64 HostCounter::BlockingQuery() const { CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} +CachedQuery::~CachedQuery() = default; + CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { - VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs)); cache = rhs.cache; type = rhs.type; + CachedQueryBase<HostCounter>::operator=(std::move(rhs)); return *this; } diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index d8e7052a1..82cac51ee 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -26,10 +26,11 @@ class RasterizerOpenGL; using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; -class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, - 
HostCounter, std::vector<OGLQuery>> { +class QueryCache final + : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); + explicit QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -40,6 +41,7 @@ public: private: RasterizerOpenGL& gl_rasterizer; + std::array<std::vector<OGLQuery>, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { @@ -62,10 +64,12 @@ class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { public: explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr); - CachedQuery(CachedQuery&& rhs) noexcept; - CachedQuery(const CachedQuery&) = delete; + ~CachedQuery() override; + CachedQuery(CachedQuery&& rhs) noexcept; CachedQuery& operator=(CachedQuery&& rhs) noexcept; + + CachedQuery(const CachedQuery&) = delete; CachedQuery& operator=(const CachedQuery&) = delete; void Flush() override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f4598fbf7..cfddbde5d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -30,6 +30,7 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -54,19 +55,36 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 namespace { -constexpr std::size_t NumSupportedVertexAttributes = 16; +constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18; +constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = + NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; +constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = + NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; + +constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; +constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, ShaderType shader_type, std::size_t index = 0) { - if (entry.IsBindless()) { - const Tegra::Texture::TextureHandle tex_handle = - engine.AccessConstBuffer32(shader_type, entry.GetBuffer(), entry.GetOffset()); - return engine.GetTextureInfo(tex_handle); + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } + if (entry.is_bindless) { + const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return engine.GetTextureInfo(handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); - const u32 offset = - entry.GetOffset() + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); + const u32 offset = entry.offset + static_cast<u32>(index * 
gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { return engine.GetStageTexture(shader_type, offset); } else { @@ -89,23 +107,84 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, return buffer.size; } +/// Translates hardware transform feedback indices +/// @param location Hardware location +/// @return Pair of ARB_transform_feedback3 token stream first and third arguments +/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt +std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) { + const u8 index = location / 4; + if (index >= 8 && index <= 39) { + return {GL_GENERIC_ATTRIB_NV, index - 8}; + } + if (index >= 48 && index <= 55) { + return {GL_TEXTURE_COORD_NV, index - 48}; + } + switch (index) { + case 7: + return {GL_POSITION, 0}; + case 40: + return {GL_PRIMARY_COLOR_NV, 0}; + case 41: + return {GL_SECONDARY_COLOR_NV, 0}; + case 42: + return {GL_BACK_PRIMARY_COLOR_NV, 0}; + case 43: + return {GL_BACK_SECONDARY_COLOR_NV, 0}; + } + UNIMPLEMENTED_MSG("index={}", static_cast<int>(index)); + return {GL_POSITION, 0}; +} + void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } +void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) { + if (num_ssbos == 0) { + return; + } + glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos), + reinterpret_cast<const GLuint*>(ssbos)); +} + } // Anonymous namespace -RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info, GLShader::ProgramManager& program_manager, - StateTracker& state_tracker) - : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, - shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, - screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker}, - buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { +RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_, + Core::Memory::Memory& cpu_memory, const Device& device_, + ScreenInfo& screen_info_, ProgramManager& program_manager_, + StateTracker& state_tracker_) + : RasterizerAccelerated{cpu_memory}, gpu(gpu_), maxwell3d(gpu.Maxwell3D()), + kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), + screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), + texture_cache(*this, maxwell3d, gpu_memory, device, state_tracker), + shader_cache(*this, emu_window, gpu, maxwell3d, kepler_compute, gpu_memory, device), + query_cache(*this, maxwell3d, gpu_memory), + buffer_cache(*this, gpu_memory, cpu_memory, device, STREAM_BUFFER_SIZE), + fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), + async_shaders(emu_window) { CheckExtensions(); + + unified_uniform_buffer.Create(); + glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); + + if (device.UseAssemblyShaders()) { + glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); + for (const GLuint cbuf : staging_cbufs) { + glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize), + nullptr, 0); + } + } + + if (device.UseAsynchronousShaders()) { + async_shaders.AllocateWorkers(); + } } -RasterizerOpenGL::~RasterizerOpenGL() {} +RasterizerOpenGL::~RasterizerOpenGL() { + if 
(device.UseAssemblyShaders()) { + glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); + } +} void RasterizerOpenGL::CheckExtensions() { if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { @@ -116,8 +195,7 @@ void RasterizerOpenGL::CheckExtensions() { } void RasterizerOpenGL::SetupVertexFormat() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexFormats]) { return; } @@ -131,13 +209,13 @@ void RasterizerOpenGL::SetupVertexFormat() { // avoid OpenGL errors. // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't // assume every shader uses them all. - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexFormat0 + index]) { continue; } flags[Dirty::VertexFormat0 + index] = false; - const auto attrib = gpu.regs.vertex_attrib_format[index]; + const auto attrib = maxwell3d.regs.vertex_attrib_format[index]; const auto gl_index = static_cast<GLuint>(index); // Disable constant attributes. @@ -150,9 +228,10 @@ void RasterizerOpenGL::SetupVertexFormat() { if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt || attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) { glVertexAttribIFormat(gl_index, attrib.ComponentCount(), - MaxwellToGL::VertexType(attrib), attrib.offset); + MaxwellToGL::VertexFormat(attrib), attrib.offset); } else { - glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib), + glVertexAttribFormat(gl_index, attrib.ComponentCount(), + MaxwellToGL::VertexFormat(attrib), attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset); } glVertexAttribBinding(gl_index, attrib.buffer); @@ -160,8 +239,7 @@ void RasterizerOpenGL::SetupVertexFormat() { } void RasterizerOpenGL::SetupVertexBuffer() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexBuffers]) { return; } @@ -169,9 +247,11 @@ void RasterizerOpenGL::SetupVertexBuffer() { MICROPROFILE_SCOPE(OpenGL_VB); + const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); + // Upload all guest vertex arrays sequentially to our buffer - const auto& regs = gpu.regs; - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + const auto& regs = maxwell3d.regs; + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { if (!flags[Dirty::VertexBuffer0 + index]) { continue; } @@ -184,27 +264,37 @@ void RasterizerOpenGL::SetupVertexBuffer() { const GPUVAddr start = vertex_array.StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - - ASSERT(end > start); - const u64 size = end - start + 1; - const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size); - - // Bind the vertex array to the buffer at the current offset. 
- vertex_array_pushbuffer.SetVertexBuffer(static_cast<GLuint>(index), vertex_buffer, - vertex_buffer_offset, vertex_array.stride); + ASSERT(end >= start); + + const GLuint gl_index = static_cast<GLuint>(index); + const u64 size = end - start; + if (size == 0) { + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + if (use_unified_memory) { + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); + } + continue; + } + const auto info = buffer_cache.UploadMemory(start, size); + if (use_unified_memory) { + glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, + info.address + info.offset, size); + } else { + glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); + } } } void RasterizerOpenGL::SetupVertexInstances() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexInstances]) { return; } flags[Dirty::VertexInstances] = false; - const auto& regs = gpu.regs; - for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) { + const auto& regs = maxwell3d.regs; + for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) { if (!flags[Dirty::VertexInstance0 + index]) { continue; } @@ -219,24 +309,23 @@ void RasterizerOpenGL::SetupVertexInstances() { GLintptr RasterizerOpenGL::SetupIndexBuffer() { MICROPROFILE_SCOPE(OpenGL_Index); - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; const std::size_t size = CalculateIndexBufferSize(); - const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); - vertex_array_pushbuffer.SetIndexBuffer(buffer); - return offset; + const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); + return info.offset; } void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); - auto& gpu = system.GPU().Maxwell3D(); u32 clip_distances = 0; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { - const auto& shader_config = gpu.regs.shader_config[index]; + const auto& shader_config = maxwell3d.regs.shader_config[index]; const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { switch (program) { case Maxwell::ShaderProgram::Geometry: program_manager.UseGeometryShader(0); @@ -251,23 +340,15 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } // Currently this stages are not supported in the OpenGL backend. - // Todo(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL - if (program == Maxwell::ShaderProgram::TesselationControl) { - continue; - } else if (program == Maxwell::ShaderProgram::TesselationEval) { + // TODO(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL + if (program == Maxwell::ShaderProgram::TesselationControl || + program == Maxwell::ShaderProgram::TesselationEval) { continue; } - Shader shader{shader_cache.GetStageProgram(program)}; + Shader* const shader = shader_cache.GetStageProgram(program, async_shaders); - // Stage indices are 0 - 5 - const std::size_t stage = index == 0 ? 
0 : index - 1; - SetupDrawConstBuffers(stage, shader); - SetupDrawGlobalMemory(stage, shader); - SetupDrawTextures(stage, shader); - SetupDrawImages(stage, shader); - - const GLuint program_handle = shader->GetHandle(); + const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0; switch (program) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: @@ -284,6 +365,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { shader_config.enable.Value(), shader_config.offset); } + // Stage indices are 0 - 5 + const std::size_t stage = index == 0 ? 0 : index - 1; + SetupDrawConstBuffers(stage, shader); + SetupDrawGlobalMemory(stage, shader); + SetupDrawTextures(stage, shader); + SetupDrawImages(stage, shader); + // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the @@ -298,11 +386,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } SyncClipEnabled(clip_distances); - gpu.dirty.flags[Dirty::Shaders] = false; + maxwell3d.dirty.flags[Dirty::Shaders] = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; std::size_t size = 0; for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { @@ -312,49 +400,42 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { const GPUVAddr start = regs.vertex_array[index].StartAddress(); const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - ASSERT(end > start); - size += end - start + 1; + size += end - start; + ASSERT(end >= start); } return size; } std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); + return static_cast<std::size_t>(maxwell3d.regs.index_array.count) * + static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes()); } -void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading, +void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { - shader_cache.LoadDiskCache(stop_loading, callback); -} - -void RasterizerOpenGL::SetupDirtyFlags() { - state_tracker.Initialize(); + shader_cache.LoadDiskCache(title_id, stop_loading, callback); } void RasterizerOpenGL::ConfigureFramebuffers() { MICROPROFILE_SCOPE(OpenGL_Framebuffer); - auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.dirty.flags[VideoCommon::Dirty::RenderTargets]) { + if (!maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets]) { return; } - gpu.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; + maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; texture_cache.GuardRenderTargets(true); - View depth_surface = texture_cache.GetDepthBufferSurface(); + View depth_surface = texture_cache.GetDepthBufferSurface(true); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); // Bind the framebuffer surfaces FramebufferCacheKey key; const auto colors_count = static_cast<std::size_t>(regs.rt_control.count); for (std::size_t index = 0; index < colors_count; ++index) { - View color_surface{texture_cache.GetColorBufferSurface(index)}; + View 
color_surface{texture_cache.GetColorBufferSurface(index, true)}; if (!color_surface) { continue; } @@ -378,40 +459,62 @@ void RasterizerOpenGL::ConfigureFramebuffers() { glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } -void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, - bool using_stencil_fb) { - auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; +void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) { + const auto& regs = maxwell3d.regs; texture_cache.GuardRenderTargets(true); View color_surface; - if (using_color_fb) { + + if (using_color) { + // Determine if we have to preserve the contents. + // First we have to make sure all clear masks are enabled. + bool preserve_contents = !regs.clear_buffers.R || !regs.clear_buffers.G || + !regs.clear_buffers.B || !regs.clear_buffers.A; const std::size_t index = regs.clear_buffers.RT; - color_surface = texture_cache.GetColorBufferSurface(index); + if (regs.clear_flags.scissor) { + // Then we have to confirm scissor testing clears the whole image. + const auto& scissor = regs.scissor_test[0]; + preserve_contents |= scissor.min_x > 0; + preserve_contents |= scissor.min_y > 0; + preserve_contents |= scissor.max_x < regs.rt[index].width; + preserve_contents |= scissor.max_y < regs.rt[index].height; + } + + color_surface = texture_cache.GetColorBufferSurface(index, preserve_contents); texture_cache.MarkColorBufferInUse(index); } + View depth_surface; - if (using_depth_fb || using_stencil_fb) { - depth_surface = texture_cache.GetDepthBufferSurface(); + if (using_depth_stencil) { + bool preserve_contents = false; + if (regs.clear_flags.scissor) { + // For depth stencil clears we only have to confirm scissor test covers the whole image. 
+ const auto& scissor = regs.scissor_test[0]; + preserve_contents |= scissor.min_x > 0; + preserve_contents |= scissor.min_y > 0; + preserve_contents |= scissor.max_x < regs.zeta_width; + preserve_contents |= scissor.max_y < regs.zeta_height; + } + + depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents); texture_cache.MarkDepthBufferInUse(); } texture_cache.GuardRenderTargets(false); FramebufferCacheKey key; - key.colors[0] = color_surface; - key.zeta = depth_surface; + key.colors[0] = std::move(color_surface); + key.zeta = std::move(depth_surface); state_tracker.NotifyFramebuffer(); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); } void RasterizerOpenGL::Clear() { - const auto& gpu = system.GPU().Maxwell3D(); - if (!gpu.ShouldExecute()) { + if (!maxwell3d.ShouldExecute()) { return; } - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; @@ -419,8 +522,7 @@ void RasterizerOpenGL::Clear() { if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; - } - if (use_color) { + state_tracker.NotifyColorMask0(); glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, regs.clear_buffers.B != 0, regs.clear_buffers.A != 0); @@ -458,7 +560,7 @@ void RasterizerOpenGL::Clear() { UNIMPLEMENTED_IF(regs.clear_flags.viewport); - ConfigureClearFramebuffer(use_color, use_depth, use_stencil); + ConfigureClearFramebuffer(use_color, use_depth || use_stencil); if (use_color) { glClearBufferfv(GL_COLOR, 0, regs.clear_color); @@ -477,7 +579,6 @@ void RasterizerOpenGL::Clear() { void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(OpenGL_Drawing); - auto& gpu = system.GPU().Maxwell3D(); query_cache.UpdateCounters(); @@ -502,6 +603,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { SyncFramebufferSRGB(); buffer_cache.Acquire(); + current_cbuf = 0; std::size_t buffer_size = CalculateVertexArraysSize(); @@ -511,20 +613,28 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } // Uniform space for the 5 shader stages - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + - (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * - Maxwell::MaxShaderStage; + buffer_size = + Common::AlignUp<std::size_t>(buffer_size, 4) + + (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers buffer_size += Maxwell::MaxConstBuffers * (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. - buffer_cache.Map(buffer_size); + const bool invalidated = buffer_cache.Map(buffer_size); + + if (invalidated) { + // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty + auto& dirty = maxwell3d.dirty.flags; + dirty[Dirty::VertexBuffers] = true; + for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { + dirty[index] = true; + } + } // Prepare vertex array format. SetupVertexFormat(); - vertex_array_pushbuffer.Setup(); // Upload vertex and index data. SetupVertexBuffer(); @@ -534,21 +644,19 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { index_buffer_offset = SetupIndexBuffer(); } - // Prepare packed bindings. - bind_ubo_pushbuffer.Setup(); - bind_ssbo_pushbuffer.Setup(); - // Setup emulation uniform buffer. 
- GLShader::MaxwellUniformData ubo; - ubo.SetFromRegs(gpu); - const auto [buffer, offset] = - buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - bind_ubo_pushbuffer.Push(EmulationUniformBlockBinding, buffer, offset, - static_cast<GLsizeiptr>(sizeof(ubo))); + if (!device.UseAssemblyShaders()) { + MaxwellUniformData ubo; + ubo.SetFromRegs(maxwell3d); + const auto info = + buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); + glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, + static_cast<GLsizeiptr>(sizeof(ubo))); + } // Setup shaders and their used resources. texture_cache.GuardSamplers(true); - const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); SetupShaders(primitive_mode); texture_cache.GuardSamplers(false); @@ -557,11 +665,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { // Signal the buffer cache that we are not going to upload more things. buffer_cache.Unmap(); - // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL. - vertex_array_pushbuffer.Bind(); - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - program_manager.BindGraphicsPipeline(); if (texture_cache.TextureBarrier()) { @@ -570,14 +673,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { BeginTransformFeedback(primitive_mode); - const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); + const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance); const GLsizei num_instances = - static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); + static_cast<GLsizei>(is_instanced ? 
maxwell3d.mme_draw.instance_count : 1); if (is_indexed) { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count); const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); - const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { glDrawElements(primitive_mode, num_vertices, format, offset); } else if (num_instances == 1 && base_instance == 0) { @@ -596,8 +699,8 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { base_instance); } } else { - const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); - const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); + const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vertex_buffer.first); + const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.vertex_buffer.count); if (num_instances == 1 && base_instance == 0) { glDrawArrays(primitive_mode, base_vertex, num_vertices); } else if (base_instance == 0) { @@ -611,37 +714,32 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { EndTransformFeedback(); ++num_queued_commands; + + gpu.TickWork(); } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - if (device.HasBrokenCompute()) { - return; - } - buffer_cache.Acquire(); + current_cbuf = 0; auto kernel = shader_cache.GetComputeKernel(code_addr); + program_manager.BindCompute(kernel->GetHandle()); + SetupComputeTextures(kernel); SetupComputeImages(kernel); - program_manager.BindComputeShader(kernel->GetHandle()); const std::size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); buffer_cache.Map(buffer_size); - bind_ubo_pushbuffer.Setup(); - bind_ssbo_pushbuffer.Setup(); - SetupComputeConstBuffers(kernel); SetupComputeGlobalMemory(kernel); buffer_cache.Unmap(); - bind_ubo_pushbuffer.Bind(); - bind_ssbo_pushbuffer.Bind(); - - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; + program_manager.BindCompute(kernel->GetHandle()); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -667,6 +765,13 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { query_cache.FlushRegion(addr, size); } +bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { + if (!Settings::IsGPULevelHigh()) { + return buffer_cache.MustFlushRegion(addr, size); + } + return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size); +} + void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); if (addr == 0 || size == 0) { @@ -678,13 +783,64 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { query_cache.InvalidateRegion(addr, size); } +void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + if (addr == 0 || size == 0) { + return; + } + texture_cache.OnCPUWrite(addr, size); + shader_cache.OnCPUWrite(addr, size); + buffer_cache.OnCPUWrite(addr, size); +} + 
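Editorial note: OnCPUWrite above, together with the SyncGuestHost hook that follows, implements deferred invalidation — guest CPU writes only record the affected ranges on the fast path, and the caches reconcile those marks in a single pass when SyncGuestHost runs. The following is a minimal illustrative sketch of that pattern (not the actual yuzu cache code; it uses a plain std::vector of ranges instead of the interval structures the real texture/buffer/shader caches maintain):

#include <cstdint>
#include <utility>
#include <vector>

// Illustrative only: a cache that records CPU-written ranges and
// invalidates them lazily, mirroring the OnCPUWrite/SyncGuestHost split.
class DeferredInvalidationCache {
public:
    // Called on the fast path; just remember the written range.
    void OnCPUWrite(std::uintptr_t addr, std::uint64_t size) {
        pending_ranges.emplace_back(addr, size);
    }

    // Called once before the GPU consumes guest memory again.
    void SyncGuestHost() {
        for (const auto& [addr, size] : pending_ranges) {
            InvalidateRegion(addr, size);
        }
        pending_ranges.clear();
    }

private:
    void InvalidateRegion(std::uintptr_t addr, std::uint64_t size) {
        // A real cache would drop or re-upload entries overlapping [addr, addr + size).
    }

    std::vector<std::pair<std::uintptr_t, std::uint64_t>> pending_ranges;
};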
+void RasterizerOpenGL::SyncGuestHost() { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + texture_cache.SyncGuestHost(); + buffer_cache.SyncGuestHost(); + shader_cache.SyncGuestHost(); +} + +void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { + if (!gpu.IsAsync()) { + gpu_memory.Write<u32>(addr, value); + return; + } + fence_manager.SignalSemaphore(addr, value); +} + +void RasterizerOpenGL::SignalSyncPoint(u32 value) { + if (!gpu.IsAsync()) { + gpu.IncrementSyncPoint(value); + return; + } + fence_manager.SignalSyncPoint(value); +} + +void RasterizerOpenGL::ReleaseFences() { + if (!gpu.IsAsync()) { + return; + } + fence_manager.WaitPendingFences(); +} + void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { FlushRegion(addr, size); } InvalidateRegion(addr, size); } +void RasterizerOpenGL::WaitForIdle() { + // Place a barrier on everything that is not framebuffer related. + // This is related to another flag that is not currently implemented. + glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT | + GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT | + GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | + GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT | + GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT | + GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT); +} + void RasterizerOpenGL::FlushCommands() { // Only flush when we have commands queued to OpenGL. if (num_queued_commands == 0) { @@ -739,40 +895,72 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { + static constexpr std::array PARAMETER_LUT{ + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, + }; MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& stages = system.GPU().Maxwell3D().state.shader_stages; + const auto& stages = maxwell3d.state.shader_stages; const auto& shader_stage = stages[stage_index]; - - u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; - for (const auto& entry : shader->GetEntries().const_buffers) { - const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; - SetupConstBuffer(binding++, buffer, entry); + const auto& entries = shader->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; + const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; + + const auto base_bindings = device.GetBaseBindings(stage_index); + u32 binding = device.UseAssemblyShaders() ? 
0 : base_bindings.uniform_buffer; + for (const auto& entry : entries.const_buffers) { + const u32 index = entry.GetIndex(); + const auto& buffer = shader_stage.const_buffers[index]; + SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, + base_unified_offset + index * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer + + entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, + base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } -void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; + const auto& entries = kernel->GetEntries(); + const bool use_unified = entries.use_unified_uniforms; u32 binding = 0; - for (const auto& entry : kernel->GetEntries().const_buffers) { + for (const auto& entry : entries.const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); Tegra::Engines::ConstBufferInfo buffer; buffer.address = config.Address(); buffer.size = config.size; buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(binding++, buffer, entry); + SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, + use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); + ++binding; + } + if (use_unified) { + const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size()); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, + NUM_CONST_BUFFERS_BYTES_PER_STAGE); } } -void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry) { +void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, + const Tegra::Engines::ConstBufferInfo& buffer, + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset) { if (!buffer.enabled) { // Set values to zero to unbind buffers - bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, - sizeof(float)); + if (device.UseAssemblyShaders()) { + glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float)); + } return; } @@ -780,68 +968,112 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const // UBO alignment requirements. const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - const auto alignment = device.GetUniformBufferAlignment(); - const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, - device.HasFastBufferSubData()); - bind_ubo_pushbuffer.Push(binding, cbuf, offset, size); + const bool fast_upload = !use_unified && device.HasFastBufferSubData(); + + const std::size_t alignment = use_unified ? 
4 : device.GetUniformBufferAlignment(); + const GPUVAddr gpu_addr = buffer.address; + auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); + + if (device.UseAssemblyShaders()) { + UNIMPLEMENTED_IF(use_unified); + if (info.offset != 0) { + const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; + glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); + info.handle = staging_cbuf; + info.offset = 0; + } + glBindBufferRangeNV(stage, binding, info.handle, info.offset, size); + return; + } + + if (use_unified) { + glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, + unified_offset, size); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size); + } } -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; +void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { + static constexpr std::array TARGET_LUT = { + GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, + GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, + }; + + const auto& cbufs{maxwell3d.state.shader_stages[stage_index]}; + const auto& entries{shader->GetEntries().global_memory_entries}; - u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : shader->GetEntries().global_memory_entries) { - const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; - const auto gpu_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding++, entry, gpu_addr, size); + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); + + const bool assembly_shaders = device.UseAssemblyShaders(); + u32 binding = assembly_shaders ? 
0 : device.GetBaseBindings(stage_index).shader_storage_buffer; + for (const auto& entry : entries) { + const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; + const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; + const u32 size{gpu_memory.Read<u32>(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); + ++binding; + } + if (assembly_shaders) { + UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size()); } } -void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { - auto& gpu{system.GPU()}; - auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; +void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { + const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; + const auto& entries{kernel->GetEntries().global_memory_entries}; + + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); u32 binding = 0; - for (const auto& entry : kernel->GetEntries().global_memory_entries) { - const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; - const auto gpu_addr{memory_manager.Read<u64>(addr)}; - const auto size{memory_manager.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding++, entry, gpu_addr, size); + for (const auto& entry : entries) { + const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; + const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; + const u32 size{gpu_memory.Read<u32>(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); + ++binding; + } + if (device.UseAssemblyShaders()) { + UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size()); } } void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, std::size_t size) { - const auto alignment{device.GetShaderStorageBufferAlignment()}; - const auto [ssbo, buffer_offset] = - buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten()); - bind_ssbo_pushbuffer.Push(binding, ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); + GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) { + const size_t alignment{device.GetShaderStorageBufferAlignment()}; + const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); + if (device.UseAssemblyShaders()) { + *ssbo = BindlessSSBO{ + .address = static_cast<GLuint64EXT>(info.address + info.offset), + .length = static_cast<GLsizei>(size), + .padding = 0, + }; + } else { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, + static_cast<GLsizeiptr>(size)); + } } -void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; for (const auto& entry : shader->GetEntries().samplers) { const auto shader_type = static_cast<ShaderType>(stage_index); - for (std::size_t i = 0; i < entry.Size(); ++i) { + for (std::size_t i = 0; i < entry.size; ++i) { const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); SetupTexture(binding++, texture, entry); } } } -void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); - const auto& compute = 
system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : kernel->GetEntries().samplers) { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i); + for (std::size_t i = 0; i < entry.size; ++i) { + const auto texture = GetTextureInfo(kepler_compute, entry, ShaderType::Compute, i); SetupTexture(binding++, texture, entry); } } @@ -856,33 +1088,27 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu glBindTextureUnit(binding, 0); return; } - glBindTextureUnit(binding, view->GetTexture()); - - if (view->GetSurfaceParams().IsBuffer()) { - return; + const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); + glBindTextureUnit(binding, handle); + if (!view->GetSurfaceParams().IsBuffer()) { + glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); } - // Apply swizzle to textures that are not buffers. - view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source, - texture.tic.w_source); - - glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); } -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { - const auto& maxwell3d = system.GPU().Maxwell3D(); +void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { u32 binding = device.GetBaseBindings(stage_index).image; for (const auto& entry : shader->GetEntries().images) { - const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index); + const auto shader_type = static_cast<ShaderType>(stage_index); const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic; SetupImage(binding++, tic, entry); } } -void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { - const auto& compute = system.GPU().KeplerCompute(); +void RasterizerOpenGL::SetupComputeImages(Shader* shader) { u32 binding = 0; for (const auto& entry : shader->GetEntries().images) { - const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic; + const auto tic = GetTextureInfo(kepler_compute, entry, ShaderType::Compute).tic; SetupImage(binding++, tic, entry); } } @@ -894,27 +1120,43 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8); return; } - if (!tic.IsBuffer()) { - view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); - } - if (entry.IsWritten()) { + if (entry.is_written) { view->MarkAsModified(texture_cache.Tick()); } - glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE, - view->GetFormat()); + const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat()); } void RasterizerOpenGL::SyncViewport() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; const bool dirty_viewport = flags[Dirty::Viewports]; + const bool dirty_clip_control = flags[Dirty::ClipControl]; + + if (dirty_clip_control || flags[Dirty::FrontFace]) { + flags[Dirty::FrontFace] = false; + + GLenum mode = MaxwellToGL::FrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0 && + regs.viewport_transform[0].scale_y < 0.0f) { + switch (mode) { + case GL_CW: + mode = 
GL_CCW; + break; + case GL_CCW: + mode = GL_CW; + break; + } + } + glFrontFace(mode); + } + if (dirty_viewport || flags[Dirty::ClipControl]) { flags[Dirty::ClipControl] = false; bool flip_y = false; - if (regs.viewport_transform[0].scale_y < 0.0) { + if (regs.viewport_transform[0].scale_y < 0.0f) { flip_y = !flip_y; } if (regs.screen_y_control.y_negate != 0) { @@ -946,34 +1188,36 @@ void RasterizerOpenGL::SyncViewport() { const GLdouble near_depth = src.translate_z - src.scale_z * reduce_z; const GLdouble far_depth = src.translate_z + src.scale_z; glDepthRangeIndexed(static_cast<GLuint>(i), near_depth, far_depth); + + if (!GLAD_GL_NV_viewport_swizzle) { + continue; + } + glViewportSwizzleNV(static_cast<GLuint>(i), MaxwellToGL::ViewportSwizzle(src.swizzle.x), + MaxwellToGL::ViewportSwizzle(src.swizzle.y), + MaxwellToGL::ViewportSwizzle(src.swizzle.z), + MaxwellToGL::ViewportSwizzle(src.swizzle.w)); } } } void RasterizerOpenGL::SyncDepthClamp() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::DepthClampEnabled]) { return; } flags[Dirty::DepthClampEnabled] = false; - const auto& state = gpu.regs.view_volume_clip_control; - UNIMPLEMENTED_IF_MSG(state.depth_clamp_far != state.depth_clamp_near, - "Unimplemented depth clamp separation!"); - - oglEnable(GL_DEPTH_CLAMP, state.depth_clamp_far || state.depth_clamp_near); + oglEnable(GL_DEPTH_CLAMP, maxwell3d.regs.view_volume_clip_control.depth_clamp_disabled == 0); } void RasterizerOpenGL::SyncClipEnabled(u32 clip_mask) { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ClipDistances] && !flags[Dirty::Shaders]) { return; } flags[Dirty::ClipDistances] = false; - clip_mask &= gpu.regs.clip_distance_enabled; + clip_mask &= maxwell3d.regs.clip_distance_enabled; if (clip_mask == last_clip_distance_mask) { return; } @@ -989,9 +1233,8 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::CullTest]) { flags[Dirty::CullTest] = false; @@ -1003,34 +1246,27 @@ void RasterizerOpenGL::SyncCullMode() { glDisable(GL_CULL_FACE); } } - - if (flags[Dirty::FrontFace]) { - flags[Dirty::FrontFace] = false; - glFrontFace(MaxwellToGL::FrontFace(regs.front_face)); - } } void RasterizerOpenGL::SyncPrimitiveRestart() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PrimitiveRestart]) { return; } flags[Dirty::PrimitiveRestart] = false; - if (gpu.regs.primitive_restart.enabled) { + if (maxwell3d.regs.primitive_restart.enabled) { glEnable(GL_PRIMITIVE_RESTART); - glPrimitiveRestartIndex(gpu.regs.primitive_restart.index); + glPrimitiveRestartIndex(maxwell3d.regs.primitive_restart.index); } else { glDisable(GL_PRIMITIVE_RESTART); } } void RasterizerOpenGL::SyncDepthTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; - const auto& regs = gpu.regs; if (flags[Dirty::DepthMask]) { flags[Dirty::DepthMask] = false; glDepthMask(regs.depth_write_enabled ? 
GL_TRUE : GL_FALSE); @@ -1048,14 +1284,13 @@ void RasterizerOpenGL::SyncDepthTestState() { } void RasterizerOpenGL::SyncStencilTestState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::StencilTest]) { return; } flags[Dirty::StencilTest] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_STENCIL_TEST, regs.stencil_enable); glStencilFuncSeparate(GL_FRONT, MaxwellToGL::ComparisonOp(regs.stencil_front_func_func), @@ -1080,25 +1315,24 @@ void RasterizerOpenGL::SyncStencilTestState() { } void RasterizerOpenGL::SyncRasterizeEnable() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::RasterizeEnable]) { return; } flags[Dirty::RasterizeEnable] = false; - oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0); + oglEnable(GL_RASTERIZER_DISCARD, maxwell3d.regs.rasterize_enable == 0); } void RasterizerOpenGL::SyncPolygonModes() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonModes]) { return; } flags[Dirty::PolygonModes] = false; - if (gpu.regs.fill_rectangle) { + const auto& regs = maxwell3d.regs; + if (regs.fill_rectangle) { if (!GLAD_GL_NV_fill_rectangle) { LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported"); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); @@ -1111,27 +1345,26 @@ void RasterizerOpenGL::SyncPolygonModes() { return; } - if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) { + if (regs.polygon_mode_front == regs.polygon_mode_back) { flags[Dirty::PolygonModeFront] = false; flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); return; } if (flags[Dirty::PolygonModeFront]) { flags[Dirty::PolygonModeFront] = false; - glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front)); + glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(regs.polygon_mode_front)); } if (flags[Dirty::PolygonModeBack]) { flags[Dirty::PolygonModeBack] = false; - glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back)); + glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(regs.polygon_mode_back)); } } void RasterizerOpenGL::SyncColorMask() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::ColorMasks]) { return; } @@ -1140,7 +1373,7 @@ void RasterizerOpenGL::SyncColorMask() { const bool force = flags[Dirty::ColorMaskCommon]; flags[Dirty::ColorMaskCommon] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.color_mask_common) { if (!force && !flags[Dirty::ColorMask0]) { return; @@ -1165,33 +1398,30 @@ void RasterizerOpenGL::SyncColorMask() { } void RasterizerOpenGL::SyncMultiSampleState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::MultisampleControl]) { return; } flags[Dirty::MultisampleControl] = false; - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_SAMPLE_ALPHA_TO_COVERAGE, regs.multisample_control.alpha_to_coverage); oglEnable(GL_SAMPLE_ALPHA_TO_ONE, regs.multisample_control.alpha_to_one); } void RasterizerOpenGL::SyncFragmentColorClampState() 
{ - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FragmentClampColor]) { return; } flags[Dirty::FragmentClampColor] = false; - glClampColor(GL_CLAMP_FRAGMENT_COLOR, gpu.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); + glClampColor(GL_CLAMP_FRAGMENT_COLOR, maxwell3d.regs.frag_color_clamp ? GL_TRUE : GL_FALSE); } void RasterizerOpenGL::SyncBlendState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; - const auto& regs = gpu.regs; + auto& flags = maxwell3d.dirty.flags; + const auto& regs = maxwell3d.regs; if (flags[Dirty::BlendColor]) { flags[Dirty::BlendColor] = false; @@ -1248,14 +1478,13 @@ void RasterizerOpenGL::SyncBlendState() { } void RasterizerOpenGL::SyncLogicOpState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LogicOp]) { return; } flags[Dirty::LogicOp] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; if (regs.logic_op.enable) { glEnable(GL_COLOR_LOGIC_OP); glLogicOp(MaxwellToGL::LogicOp(regs.logic_op.operation)); @@ -1265,14 +1494,13 @@ void RasterizerOpenGL::SyncLogicOpState() { } void RasterizerOpenGL::SyncScissorTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::Scissors]) { return; } flags[Dirty::Scissors] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; for (std::size_t index = 0; index < Maxwell::NumViewports; ++index) { if (!flags[Dirty::Scissor0 + index]) { continue; @@ -1291,16 +1519,15 @@ void RasterizerOpenGL::SyncScissorTest() { } void RasterizerOpenGL::SyncPointState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PointSize]) { return; } flags[Dirty::PointSize] = false; - oglEnable(GL_POINT_SPRITE, gpu.regs.point_sprite_enable); + oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable); - if (gpu.regs.vp_point_size.enable) { + if (maxwell3d.regs.vp_point_size.enable) { // By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled. glEnable(GL_PROGRAM_POINT_SIZE); return; @@ -1308,32 +1535,30 @@ void RasterizerOpenGL::SyncPointState() { // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). - glPointSize(std::max(1.0f, gpu.regs.point_size)); + glPointSize(std::max(1.0f, maxwell3d.regs.point_size)); glDisable(GL_PROGRAM_POINT_SIZE); } void RasterizerOpenGL::SyncLineState() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::LineWidth]) { return; } flags[Dirty::LineWidth] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_LINE_SMOOTH, regs.line_smooth_enable); glLineWidth(regs.line_smooth_enable ? 
regs.line_width_smooth : regs.line_width_aliased); } void RasterizerOpenGL::SyncPolygonOffset() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::PolygonOffset]) { return; } flags[Dirty::PolygonOffset] = false; - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; oglEnable(GL_POLYGON_OFFSET_FILL, regs.polygon_offset_fill_enable); oglEnable(GL_POLYGON_OFFSET_LINE, regs.polygon_offset_line_enable); oglEnable(GL_POLYGON_OFFSET_POINT, regs.polygon_offset_point_enable); @@ -1347,18 +1572,13 @@ void RasterizerOpenGL::SyncPolygonOffset() { } void RasterizerOpenGL::SyncAlphaTest() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::AlphaTest]) { return; } flags[Dirty::AlphaTest] = false; - const auto& regs = gpu.regs; - if (regs.alpha_test_enabled && regs.rt_control.count > 1) { - LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested"); - } - + const auto& regs = maxwell3d.regs; if (regs.alpha_test_enabled) { glEnable(GL_ALPHA_TEST); glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref); @@ -1368,22 +1588,79 @@ void RasterizerOpenGL::SyncAlphaTest() { } void RasterizerOpenGL::SyncFramebufferSRGB() { - auto& gpu = system.GPU().Maxwell3D(); - auto& flags = gpu.dirty.flags; + auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::FramebufferSRGB]) { return; } flags[Dirty::FramebufferSRGB] = false; - oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); + oglEnable(GL_FRAMEBUFFER_SRGB, maxwell3d.regs.framebuffer_srgb); +} + +void RasterizerOpenGL::SyncTransformFeedback() { + // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal + // when this is required. 
+ const auto& regs = maxwell3d.regs; + + static constexpr std::size_t STRIDE = 3; + std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs; + std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams; + + GLint* cursor = attribs.data(); + GLint* current_stream = streams.data(); + + for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) { + const auto& layout = regs.tfb_layouts[feedback]; + UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding"); + if (layout.varying_count == 0) { + continue; + } + + *current_stream = static_cast<GLint>(feedback); + if (current_stream != streams.data()) { + // When stepping one stream, push the expected token + cursor[0] = GL_NEXT_BUFFER_NV; + cursor[1] = 0; + cursor[2] = 0; + cursor += STRIDE; + } + ++current_stream; + + const auto& locations = regs.tfb_varying_locs[feedback]; + std::optional<u8> current_index; + for (u32 offset = 0; offset < layout.varying_count; ++offset) { + const u8 location = locations[offset]; + const u8 index = location / 4; + + if (current_index == index) { + // Increase number of components of the previous attachment + ++cursor[-2]; + continue; + } + current_index = index; + + std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location); + cursor[1] = 1; + cursor += STRIDE; + } + } + + const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE); + const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data()); + glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(), + GL_INTERLEAVED_ATTRIBS); } void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } + if (device.UseAssemblyShaders()) { + SyncTransformFeedback(); + } + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); @@ -1410,11 +1687,15 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { static_cast<GLsizeiptr>(size)); } + // We may have to call BeginTransformFeedbackNV here since they seem to call different + // implementations on Nvidia's driver (the pointer is different) but we are using + // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB + // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works. 
glBeginTransformFeedback(GL_POINTS); } void RasterizerOpenGL::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } @@ -1431,8 +1712,9 @@ void RasterizerOpenGL::EndTransformFeedback() { const GLuint handle = transform_feedback_buffers[index].handle; const GPUVAddr gpu_addr = binding.Address(); const std::size_t size = binding.buffer_size; - const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size)); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, + static_cast<GLsizeiptr>(size)); } } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 435da4425..1d0f585fa 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -19,10 +19,10 @@ #include "video_core/engines/const_buffer_info.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_accelerated.h" -#include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_fence_manager.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -33,10 +33,11 @@ #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" -namespace Core { -class System; +namespace Core::Memory { +class Memory; } namespace Core::Frontend { @@ -52,10 +53,18 @@ namespace OpenGL { struct ScreenInfo; struct DrawParameters; +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); + class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info, GLShader::ProgramManager& program_manager, + explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + Core::Memory::Memory& cpu_memory, const Device& device, + ScreenInfo& screen_info, ProgramManager& program_manager, StateTracker& state_tracker); ~RasterizerOpenGL() override; @@ -66,8 +75,15 @@ public: void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; + bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; + void OnCPUWrite(VAddr addr, u64 size) override; + void SyncGuestHost() override; + void SignalSemaphore(GPUVAddr addr, u32 value) override; + void SignalSyncPoint(u32 value) override; + void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; + void WaitForIdle() override; void FlushCommands() override; void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, @@ -75,56 +91,65 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) 
override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void LoadDiskResources(const std::atomic_bool& stop_loading, + void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; - void SetupDirtyFlags() override; /// Returns true when there are commands queued to the OpenGL server. bool AnyCommandQueued() const { return num_queued_commands > 0; } + VideoCommon::Shader::AsyncShaders& GetAsyncShaders() { + return async_shaders; + } + + const VideoCommon::Shader::AsyncShaders& GetAsyncShaders() const { + return async_shaders; + } + private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); - void ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, bool using_stencil_fb); + /// Configures the color and depth framebuffer for clearing. + void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil); /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); + void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(const Shader& kernel); + void SetupComputeConstBuffers(Shader* kernel); /// Configures a constant buffer. - void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry); + void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, + const ConstBufferEntry& entry, bool use_unified, + std::size_t unified_offset); /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); + void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(const Shader& kernel); + void SetupComputeGlobalMemory(Shader* kernel); - /// Configures a constant buffer. + /// Configures a global memory buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - std::size_t size); + size_t size, BindlessSSBO* ssbo); /// Configures the current textures to use for the draw command. - void SetupDrawTextures(std::size_t stage_index, const Shader& shader); + void SetupDrawTextures(std::size_t stage_index, Shader* shader); /// Configures the textures used in a compute shader. - void SetupComputeTextures(const Shader& kernel); + void SetupComputeTextures(Shader* kernel); /// Configures a texture. void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); /// Configures images in a graphics shader. - void SetupDrawImages(std::size_t stage_index, const Shader& shader); + void SetupDrawImages(std::size_t stage_index, Shader* shader); /// Configures images in a compute shader. - void SetupComputeImages(const Shader& shader); + void SetupComputeImages(Shader* shader); /// Configures an image. 
void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); @@ -192,6 +217,10 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Syncs transform feedback state to match guest state + /// @note Only valid on assembly shaders + void SyncTransformFeedback(); + /// Begin a transform feedback void BeginTransformFeedback(GLenum primitive_mode); @@ -215,31 +244,42 @@ private: void SetupShaders(GLenum primitive_mode); - const Device device; + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + + const Device& device; + ScreenInfo& screen_info; + ProgramManager& program_manager; + StateTracker& state_tracker; TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; SamplerCacheOpenGL sampler_cache; FramebufferCacheOpenGL framebuffer_cache; QueryCache query_cache; + OGLBufferCache buffer_cache; + FenceManagerOpenGL fence_manager; - Core::System& system; - ScreenInfo& screen_info; - GLShader::ProgramManager& program_manager; - StateTracker& state_tracker; + VideoCommon::Shader::AsyncShaders async_shaders; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; - OGLBufferCache buffer_cache; - VertexArrayPushBuffer vertex_array_pushbuffer{state_tracker}; - BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; - BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; + GLint vertex_binding = 0; std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> transform_feedback_buffers; std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> enabled_transform_feedback_buffers; + static constexpr std::size_t NUM_CONSTANT_BUFFERS = + Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; + std::size_t current_cbuf = 0; + OGLBuffer unified_uniform_buffer; + /// Number of commands queued to the OpenGL driver. Reseted on flush. std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 97803d480..0ebcec427 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <string_view> #include <utility> #include <glad/glad.h> #include "common/common_types.h" @@ -82,11 +83,13 @@ void OGLSampler::Release() { handle = 0; } -void OGLShader::Create(const char* source, GLenum type) { - if (handle != 0) +void OGLShader::Create(std::string_view source, GLenum type) { + if (handle != 0) { return; - if (source == nullptr) + } + if (source.empty()) { return; + } MICROPROFILE_SCOPE(OpenGL_ResourceCreation); handle = GLShader::LoadShader(source, type); @@ -125,6 +128,15 @@ void OGLProgram::Release() { handle = 0; } +void OGLAssemblyProgram::Release() { + if (handle == 0) { + return; + } + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteProgramsARB(1, &handle); + handle = 0; +} + void OGLPipeline::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index de93f4212..f48398669 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -4,6 +4,7 @@ #pragma once +#include <string_view> #include <utility> #include <glad/glad.h> #include "common/common_types.h" @@ -127,7 +128,7 @@ public: return *this; } - void Create(const char* source, GLenum type); + void Create(std::string_view source, GLenum type); void Release(); @@ -167,6 +168,28 @@ public: GLuint handle = 0; }; +class OGLAssemblyProgram : private NonCopyable { +public: + OGLAssemblyProgram() = default; + + OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLAssemblyProgram() { + Release(); + } + + OGLAssemblyProgram& operator=(OGLAssemblyProgram&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + class OGLPipeline : private NonCopyable { public: OGLPipeline() = default; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 12c6dcfde..bd56bed0c 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -10,8 +10,6 @@ #include <thread> #include <unordered_set> -#include <boost/functional/hash.hpp> - #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" @@ -22,83 +20,35 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_arb_decompiler.h" #include "video_core/renderer_opengl/gl_rasterizer.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" +#include "video_core/shader_notify.h" namespace OpenGL { using Tegra::Engines::ShaderType; -using VideoCommon::Shader::CompileDepth; -using VideoCommon::Shader::CompilerSettings; +using VideoCommon::Shader::GetShaderAddress; +using VideoCommon::Shader::GetShaderCode; +using VideoCommon::Shader::GetUniqueIdentifier; +using VideoCommon::Shader::KERNEL_MAIN_OFFSET; using 
VideoCommon::Shader::ProgramCode; using VideoCommon::Shader::Registry; using VideoCommon::Shader::ShaderIR; +using VideoCommon::Shader::STAGE_MAIN_OFFSET; namespace { -constexpr u32 STAGE_MAIN_OFFSET = 10; -constexpr u32 KERNEL_MAIN_OFFSET = 0; - -constexpr CompilerSettings COMPILER_SETTINGS{CompileDepth::FullDecompile}; - -/// Gets the address for the specified shader stage program -GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) { - const auto& gpu{system.GPU().Maxwell3D()}; - const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]}; - return gpu.regs.code_address.CodeAddress() + shader_config.offset; -} - -/// Gets if the current instruction offset is a scheduler instruction -constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { - // Sched instructions appear once every 4 instructions. - constexpr std::size_t SchedPeriod = 4; - const std::size_t absolute_offset = offset - main_offset; - return (absolute_offset % SchedPeriod) == 0; -} - -/// Calculates the size of a program stream -std::size_t CalculateProgramSize(const ProgramCode& program) { - constexpr std::size_t start_offset = 10; - // This is the encoded version of BRA that jumps to itself. All Nvidia - // shaders end with one. - constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL; - constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL; - std::size_t offset = start_offset; - while (offset < program.size()) { - const u64 instruction = program[offset]; - if (!IsSchedInstruction(offset, start_offset)) { - if ((instruction & mask) == self_jumping_branch) { - // End on Maxwell's "nop" instruction - break; - } - if (instruction == 0) { - break; - } - } - offset++; - } - // The last instruction is included in the program size - return std::min(offset + 1, program.size()); -} - -/// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr, - const u8* host_ptr) { - ProgramCode code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); - ASSERT_OR_EXECUTE(host_ptr != nullptr, { - std::fill(code.begin(), code.end(), 0); - return code; - }); - memory_manager.ReadBlockUnsafe(gpu_addr, code.data(), code.size() * sizeof(u64)); - code.resize(CalculateProgramSize(code)); - return code; -} +constexpr VideoCommon::Shader::CompilerSettings COMPILER_SETTINGS{}; /// Gets the shader type from a Maxwell program type constexpr GLenum GetGLShaderType(ShaderType shader_type) { @@ -116,17 +66,6 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) { } } -/// Hashes one (or two) program streams -u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code, - const ProgramCode& code_b = {}) { - u64 unique_identifier = boost::hash_value(code); - if (is_a) { - // VertexA programs include two programs - boost::hash_combine(unique_identifier, boost::hash_value(code_b)); - } - return unique_identifier; -} - constexpr const char* GetShaderTypeName(ShaderType shader_type) { switch (shader_type) { case ShaderType::Vertex: @@ -162,6 +101,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) { return {}; } +constexpr GLenum AssemblyEnum(ShaderType shader_type) { + switch (shader_type) { + case ShaderType::Vertex: + return GL_VERTEX_PROGRAM_NV; + case ShaderType::TesselationControl: + return GL_TESS_CONTROL_PROGRAM_NV; + case ShaderType::TesselationEval: + return GL_TESS_EVALUATION_PROGRAM_NV; + case ShaderType::Geometry: + return 
GL_GEOMETRY_PROGRAM_NV; + case ShaderType::Fragment: + return GL_FRAGMENT_PROGRAM_NV; + case ShaderType::Compute: + return GL_COMPUTE_PROGRAM_NV; + } + return {}; +} + std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) { return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier); } @@ -170,7 +127,7 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) { const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size}; const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer, entry.graphics_info, entry.compute_info}; - const auto registry = std::make_shared<Registry>(entry.type, info); + auto registry = std::make_shared<Registry>(entry.type, info); for (const auto& [address, value] : entry.keys) { const auto [buffer, offset] = address; registry->InsertKey(buffer, offset, value); @@ -185,21 +142,6 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) { return registry; } -std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type, - u64 unique_identifier, const ShaderIR& ir, - const Registry& registry, bool hint_retrievable = false) { - const std::string shader_id = MakeShaderID(unique_identifier, shader_type); - LOG_INFO(Render_OpenGL, "{}", shader_id); - - const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); - OGLShader shader; - shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); - - auto program = std::make_shared<OGLProgram>(); - program->Create(true, hint_retrievable, shader.handle); - return program; -} - std::unordered_set<GLenum> GetSupportedFormats() { GLint num_formats; glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats); @@ -216,55 +158,138 @@ std::unordered_set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, std::shared_ptr<OGLProgram> program) - : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, - size_in_bytes{size_in_bytes}, program{std::move(program)} {} +ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier, + const ShaderIR& ir, const Registry& registry, bool hint_retrievable) { + const std::string shader_id = MakeShaderID(unique_identifier, shader_type); + LOG_INFO(Render_OpenGL, "{}", shader_id); + + auto program = std::make_shared<ProgramHandle>(); + + if (device.UseAssemblyShaders()) { + const std::string arb = + DecompileAssemblyShader(device, ir, registry, shader_type, shader_id); + + GLuint& arb_prog = program->assembly_program.handle; + +// Commented out functions signal OpenGL errors but are compatible with apitrace. +// Use them only to capture and replay on apitrace. 
+#if 0 + glGenProgramsNV(1, &arb_prog); + glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()), + reinterpret_cast<const GLubyte*>(arb.data())); +#else + glGenProgramsARB(1, &arb_prog); + glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB, + static_cast<GLsizei>(arb.size()), arb.data()); +#endif + const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV)); + if (err && *err) { + LOG_CRITICAL(Render_OpenGL, "{}", err); + LOG_INFO(Render_OpenGL, "\n{}", arb); + } + } else { + const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); + OGLShader shader; + shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); + + program->source_program.Create(true, hint_retrievable, shader.handle); + } -CachedShader::~CachedShader() = default; + return program; +} -GLuint CachedShader::GetHandle() const { +Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_, + ProgramSharedPtr program_, bool is_built) + : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)}, + is_built(is_built) { + handle = program->assembly_program.handle; + if (handle == 0) { + handle = program->source_program.handle; + } + if (is_built) { + ASSERT(handle != 0); + } +} + +Shader::~Shader() = default; + +GLuint Shader::GetHandle() const { DEBUG_ASSERT(registry->IsConsistent()); - return program->handle; + return handle; } -Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, ProgramCode code, - ProgramCode code_b) { +bool Shader::IsBuilt() const { + return is_built; +} + +void Shader::AsyncOpenGLBuilt(OGLProgram new_program) { + program->source_program = std::move(new_program); + handle = program->source_program.handle; + is_built = true; +} + +void Shader::AsyncGLASMBuilt(OGLAssemblyProgram new_program) { + program->assembly_program = std::move(new_program); + handle = program->assembly_program.handle; + is_built = true; +} + +std::unique_ptr<Shader> Shader::CreateStageFromMemory( + const ShaderParameters& params, Maxwell::ShaderProgram program_type, ProgramCode code, + ProgramCode code_b, VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr) { const auto shader_type = GetShaderType(program_type); - const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D()); - const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); - // TODO(Rodrigo): Handle VertexA shaders - // std::optional<ShaderIR> ir_b; - // if (!code_b.empty()) { - // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); - // } - auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + auto& gpu = params.gpu; + gpu.ShaderNotify().MarkSharderBuilding(); + + auto registry = std::make_shared<Registry>(shader_type, gpu.Maxwell3D()); + if (!async_shaders.IsShaderAsync(gpu) || !params.device.UseAsynchronousShaders()) { + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + // TODO(Rodrigo): Handle VertexA shaders + // std::optional<ShaderIR> ir_b; + // if (!code_b.empty()) { + // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); + // } + auto program = + BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + ShaderDiskCacheEntry entry; + entry.type = shader_type; + entry.code = std::move(code); + entry.code_b = 
std::move(code_b); + entry.unique_identifier = params.unique_identifier; + entry.bound_buffer = registry->GetBoundBuffer(); + entry.graphics_info = registry->GetGraphicsInfo(); + entry.keys = registry->GetKeys(); + entry.bound_samplers = registry->GetBoundSamplers(); + entry.bindless_samplers = registry->GetBindlessSamplers(); + params.disk_cache.SaveEntry(std::move(entry)); + + gpu.ShaderNotify().MarkShaderComplete(); + + return std::unique_ptr<Shader>(new Shader(std::move(registry), + MakeEntries(params.device, ir, shader_type), + std::move(program), true)); + } else { + // Required for entries + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + auto entries = MakeEntries(params.device, ir, shader_type); - ShaderDiskCacheEntry entry; - entry.type = shader_type; - entry.code = std::move(code); - entry.code_b = std::move(code_b); - entry.unique_identifier = params.unique_identifier; - entry.bound_buffer = registry->GetBoundBuffer(); - entry.graphics_info = registry->GetGraphicsInfo(); - entry.keys = registry->GetKeys(); - entry.bound_samplers = registry->GetBoundSamplers(); - entry.bindless_samplers = registry->GetBindlessSamplers(); - params.disk_cache.SaveEntry(std::move(entry)); + async_shaders.QueueOpenGLShader(params.device, shader_type, params.unique_identifier, + std::move(code), std::move(code_b), STAGE_MAIN_OFFSET, + COMPILER_SETTINGS, *registry, cpu_addr); - return std::shared_ptr<CachedShader>(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + auto program = std::make_shared<ProgramHandle>(); + return std::unique_ptr<Shader>( + new Shader(std::move(registry), std::move(entries), std::move(program), false)); + } } -Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { - const std::size_t size_in_bytes = code.size() * sizeof(u64); +std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code) { + auto& gpu = params.gpu; + gpu.ShaderNotify().MarkSharderBuilding(); - auto& engine = params.system.GPU().KeplerCompute(); - auto registry = std::make_shared<Registry>(ShaderType::Compute, engine); + auto registry = std::make_shared<Registry>(ShaderType::Compute, params.engine); const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry); const u64 uid = params.unique_identifier; auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry); @@ -280,31 +305,43 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>(new CachedShader( - params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); + gpu.ShaderNotify().MarkShaderComplete(); + + return std::unique_ptr<Shader>(new Shader(std::move(registry), + MakeEntries(params.device, ir, ShaderType::Compute), + std::move(program))); } -Shader CachedShader::CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes) { - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry, - precompiled_shader.entries, precompiled_shader.program)); +std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader) { + return std::unique_ptr<Shader>(new Shader( + 
precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device) - : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device}, - disk_cache{system} {} +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, + Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const Device& device_) + : VideoCommon::ShaderCache<Shader>{rasterizer}, emu_window{emu_window_}, gpu{gpu_}, + gpu_memory{gpu_memory_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, device{device_} {} -void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, +ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; + +void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { + disk_cache.BindTitleID(title_id); const std::optional transferable = disk_cache.LoadTransferable(); if (!transferable) { return; } - const std::vector gl_cache = disk_cache.LoadPrecompiled(); + std::vector<ShaderDiskCachePrecompiled> gl_cache; + if (!device.UseAssemblyShaders()) { + // Only load precompiled cache when we are not using assembly shaders + gl_cache = disk_cache.LoadPrecompiled(); + } const auto supported_formats = GetSupportedFormats(); // Track if precompiled cache was altered during loading to know if we have to @@ -343,7 +380,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, auto registry = MakeRegistry(entry); const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; if (precompiled_entry) { // If the shader is precompiled, attempt to load it with program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); @@ -359,7 +396,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, PrecompiledShader shader; shader.program = std::move(program); shader.registry = std::move(registry); - shader.entries = MakeEntries(ir); + shader.entries = MakeEntries(device, ir, entry.type); std::scoped_lock lock{mutex}; if (callback) { @@ -370,7 +407,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } }; - const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)}; + const std::size_t num_workers{std::max(1U, std::thread::hardware_concurrency())}; const std::size_t bucket_size{transferable->size() / num_workers}; std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers); std::vector<std::thread> threads(num_workers); @@ -397,6 +434,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, return; } + if (device.UseAssemblyShaders()) { + // Don't store precompiled binaries for assembly shaders. 
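The loader above now clamps std::thread::hardware_concurrency(), which may legitimately report 0, to at least one worker before splitting the transferable cache into buckets. A sketch of the split that bucket_size implies; "worker" stands in for the per-thread lambda, and giving the remainder to the last worker is an assumption of this sketch rather than something shown in the hunk:

    const std::size_t num_workers{std::max(1U, std::thread::hardware_concurrency())};
    const std::size_t num_entries = transferable->size();
    const std::size_t bucket_size{num_entries / num_workers};
    std::vector<std::thread> threads(num_workers);
    for (std::size_t i = 0; i < num_workers; ++i) {
        const std::size_t start = i * bucket_size;
        // Integer division drops the remainder, so the final bucket absorbs it.
        const std::size_t end = (i + 1 == num_workers) ? num_entries : start + bucket_size;
        threads[i] = std::thread(worker, start, end); // each worker runs on its own shared GL context
    }
    for (auto& thread : threads) {
        thread.join();
    }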
+ return; + } + // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw // before precompiling them @@ -404,7 +446,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const u64 id = (*transferable)[i].unique_identifier; const auto it = find_precompiled(id); if (it == gl_cache.end()) { - const GLuint program = runtime_cache.at(id).program->handle; + const GLuint program = runtime_cache.at(id).program->source_program.handle; disk_cache.SavePrecompiled(id, program); precompiled_cache_altered = true; } @@ -415,7 +457,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } } -std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( +ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats) { if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { @@ -423,15 +465,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( return {}; } - auto program = std::make_shared<OGLProgram>(); - program->handle = glCreateProgram(); - glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); - glProgramBinary(program->handle, precompiled_entry.binary_format, - precompiled_entry.binary.data(), + auto program = std::make_shared<ProgramHandle>(); + GLuint& handle = program->source_program.handle; + handle = glCreateProgram(); + glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE); + glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(), static_cast<GLsizei>(precompiled_entry.binary.size())); GLint link_status; - glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); + glGetProgramiv(handle, GL_LINK_STATUS, &link_status); if (link_status == GL_FALSE) { LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing"); return {}; @@ -440,77 +482,122 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( return program; } -Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { - return last_shaders[static_cast<std::size_t>(program)]; +Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, + VideoCommon::Shader::AsyncShaders& async_shaders) { + if (!maxwell3d.dirty.flags[Dirty::Shaders]) { + auto* last_shader = last_shaders[static_cast<std::size_t>(program)]; + if (last_shader->IsBuilt()) { + return last_shader; + } } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr address{GetShaderAddress(system, program)}; + const GPUVAddr address{GetShaderAddress(maxwell3d, program)}; + + if (device.UseAsynchronousShaders() && async_shaders.HasCompletedWork()) { + auto completed_work = async_shaders.GetCompletedWork(); + for (auto& work : completed_work) { + Shader* shader = TryGet(work.cpu_address); + gpu.ShaderNotify().MarkShaderComplete(); + if (shader == nullptr) { + continue; + } + using namespace VideoCommon::Shader; + if (work.backend == AsyncShaders::Backend::OpenGL) { + shader->AsyncOpenGLBuilt(std::move(work.program.opengl)); + } else if (work.backend == AsyncShaders::Backend::GLASM) { + shader->AsyncGLASMBuilt(std::move(work.program.glasm)); + } + + auto& registry = shader->GetRegistry(); + + ShaderDiskCacheEntry entry; + entry.type = work.shader_type; + entry.code = std::move(work.code); + entry.code_b 
= std::move(work.code_b); + entry.unique_identifier = work.uid; + entry.bound_buffer = registry.GetBoundBuffer(); + entry.graphics_info = registry.GetGraphicsInfo(); + entry.keys = registry.GetKeys(); + entry.bound_samplers = registry.GetBoundSamplers(); + entry.bindless_samplers = registry.GetBindlessSamplers(); + disk_cache.SaveEntry(std::move(entry)); + } + } // Look up shader in the cache based on address - const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; - Shader shader{cpu_addr ? TryGet(*cpu_addr) : nullptr}; - if (shader) { + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(address)}; + if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { return last_shaders[static_cast<std::size_t>(program)] = shader; } - const auto host_ptr{memory_manager.GetPointer(address)}; + const u8* const host_ptr{gpu_memory.GetPointer(address)}; // No shader found - create a new one - ProgramCode code{GetShaderCode(memory_manager, address, host_ptr)}; + ProgramCode code{GetShaderCode(gpu_memory, address, host_ptr, false)}; ProgramCode code_b; if (program == Maxwell::ShaderProgram::VertexA) { - const GPUVAddr address_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; - code_b = GetShaderCode(memory_manager, address_b, memory_manager.GetPointer(address_b)); + const GPUVAddr address_b{GetShaderAddress(maxwell3d, Maxwell::ShaderProgram::VertexB)}; + const u8* host_ptr_b = gpu_memory.GetPointer(address_b); + code_b = GetShaderCode(gpu_memory, address_b, host_ptr_b, false); } + const std::size_t code_size = code.size() * sizeof(u64); - const auto unique_identifier = GetUniqueIdentifier( + const u64 unique_identifier = GetUniqueIdentifier( GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b); - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, maxwell3d, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> shader; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), - std::move(code_b)); + shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b), + async_shaders, cpu_addr.value_or(0)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + shader = Shader::CreateFromCache(params, found->second); } - Register(shader); - return last_shaders[static_cast<std::size_t>(program)] = shader; + Shader* const result = shader.get(); + if (cpu_addr) { + Register(std::move(shader), *cpu_addr, code_size); + } else { + null_shader = std::move(shader); + } + + return last_shaders[static_cast<std::size_t>(program)] = result; } -Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; +Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { + const std::optional<VAddr> cpu_addr{gpu_memory.GpuToCpuAddress(code_addr)}; - auto kernel = cpu_addr ? TryGet(*cpu_addr) : nullptr; - if (kernel) { + if (Shader* const kernel = cpu_addr ? 
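The asynchronous path above has two halves: CreateStageFromMemory queues the compilation and returns a Shader whose IsBuilt() is false, and GetStageProgram later drains async_shaders.GetCompletedWork(), swapping the finished handle in through AsyncOpenGLBuilt or AsyncGLASMBuilt and persisting a disk-cache entry. A condensed sketch of that hand-off with illustrative names; the real queue lives in video_core/shader/async_shaders.* and is not reproduced here:

    struct CompletedWork {
        VAddr cpu_address; // cache key recorded when the work was queued
        OGLProgram opengl; // linked on a worker thread using its own shared context
    };

    std::mutex completed_mutex;
    std::vector<CompletedWork> completed_work; // filled by the worker threads

    // Rasterizer thread, before resolving the next stage program:
    void DrainCompletedShaders(ShaderCacheOpenGL& cache, Tegra::GPU& gpu) {
        std::scoped_lock lock{completed_mutex};
        for (CompletedWork& work : completed_work) {
            gpu.ShaderNotify().MarkShaderComplete();
            if (Shader* const shader = cache.TryGet(work.cpu_address)) {
                shader->AsyncOpenGLBuilt(std::move(work.opengl)); // IsBuilt() turns true here
            }
        }
        completed_work.clear();
    }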
TryGet(*cpu_addr) : null_kernel.get()) { return kernel; } - const auto host_ptr{memory_manager.GetPointer(code_addr)}; // No kernel found, create a new one - auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; - const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; + const u8* host_ptr{gpu_memory.GetPointer(code_addr)}; + ProgramCode code{GetShaderCode(gpu_memory, code_addr, host_ptr, true)}; + const std::size_t code_size{code.size() * sizeof(u64)}; + const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)}; - const ShaderParameters params{system, disk_cache, device, - *cpu_addr, host_ptr, unique_identifier}; + const ShaderParameters params{gpu, kepler_compute, disk_cache, device, + *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> kernel; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + kernel = Shader::CreateKernelFromMemory(params, std::move(code)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + kernel = Shader::CreateFromCache(params, found->second); } - Register(kernel); - return kernel; + Shader* const result = kernel.get(); + if (cpu_addr) { + Register(std::move(kernel), *cpu_addr, code_size); + } else { + null_kernel = std::move(kernel); + } + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index c836df5bd..1708af06a 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -18,114 +18,143 @@ #include "common/common_types.h" #include "video_core/engines/shader_type.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" -namespace Core { -class System; +namespace Tegra { +class MemoryManager; } namespace Core::Frontend { class EmuWindow; } +namespace VideoCommon::Shader { +class AsyncShaders; +} + namespace OpenGL { -class CachedShader; class Device; class RasterizerOpenGL; -struct UnspecializedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; +struct ProgramHandle { + OGLProgram source_program; + OGLAssemblyProgram assembly_program; +}; +using ProgramSharedPtr = std::shared_ptr<ProgramHandle>; + struct PrecompiledShader { - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; }; struct ShaderParameters { - Core::System& system; + Tegra::GPU& gpu; + Tegra::Engines::ConstBufferEngineInterface& engine; ShaderDiskCacheOpenGL& disk_cache; const Device& device; VAddr cpu_addr; - u8* host_ptr; + const u8* host_ptr; u64 unique_identifier; }; -class CachedShader final : public RasterizerCacheObject { +ProgramSharedPtr BuildShader(const Device& device, Tegra::Engines::ShaderType shader_type, + u64 unique_identifier, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + bool hint_retrievable = false); + +class Shader final { public: - ~CachedShader(); + 
~Shader(); /// Gets the GL program handle for the shader GLuint GetHandle() const; - /// Returns the size in bytes of the shader - std::size_t GetSizeInBytes() const override { - return size_in_bytes; - } + bool IsBuilt() const; /// Gets the shader entries for the shader const ShaderEntries& GetEntries() const { return entries; } - static Shader CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, ProgramCode program_code_b); - static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); + const VideoCommon::Shader::Registry& GetRegistry() const { + return *registry; + } + + /// Mark a OpenGL shader as built + void AsyncOpenGLBuilt(OGLProgram new_program); + + /// Mark a GLASM shader as built + void AsyncGLASMBuilt(OGLAssemblyProgram new_program); + + static std::unique_ptr<Shader> CreateStageFromMemory( + const ShaderParameters& params, Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b, + VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr); + + static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code); - static Shader CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes); + static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader); private: - explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, std::shared_ptr<OGLProgram> program); + explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries, + ProgramSharedPtr program, bool is_built = true); std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; - std::size_t size_in_bytes = 0; - std::shared_ptr<OGLProgram> program; + ProgramSharedPtr program; + GLuint handle = 0; + bool is_built{}; }; -class ShaderCacheOpenGL final : public RasterizerCache<Shader> { +class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, - Core::Frontend::EmuWindow& emu_window, const Device& device); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::Frontend::EmuWindow& emu_window, + Tegra::GPU& gpu, Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::KeplerCompute& kepler_compute, + Tegra::MemoryManager& gpu_memory, const Device& device); + ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game - void LoadDiskCache(const std::atomic_bool& stop_loading, + void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program - Shader GetStageProgram(Maxwell::ShaderProgram program); + Shader* GetStageProgram(Maxwell::ShaderProgram program, + VideoCommon::Shader::AsyncShaders& async_shaders); /// Gets a compute kernel in the passed address - Shader GetComputeKernel(GPUVAddr code_addr); - -protected: - // We do not have to flush this cache as things in it are never modified by us. 
- void FlushObjectInner(const Shader& object) override {} + Shader* GetComputeKernel(GPUVAddr code_addr); private: - std::shared_ptr<OGLProgram> GeneratePrecompiledProgram( + ProgramSharedPtr GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats); - Core::System& system; Core::Frontend::EmuWindow& emu_window; + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; const Device& device; + ShaderDiskCacheOpenGL disk_cache; std::unordered_map<u64, PrecompiledShader> runtime_cache; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; + + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index b1804e9ea..95ca96c8e 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode; using Tegra::Shader::IpaSampleMode; using Tegra::Shader::PixelImap; using Tegra::Shader::Register; +using Tegra::Shader::TextureType; using VideoCommon::Shader::BuildTransformFeedback; using VideoCommon::Shader::Registry; @@ -61,8 +62,8 @@ struct TextureDerivates {}; using TextureArgument = std::pair<Type, Node>; using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>; -constexpr u32 MAX_CONSTBUFFER_ELEMENTS = - static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); +constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); +constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt #define ftou floatBitsToUint @@ -402,6 +403,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { + const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size()); + // We waste one UBO for emulation + const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; + return num_ubos > num_available_ubos; +} + struct GenericVaryingDescription { std::string name; u8 first_element = 0; @@ -412,8 +420,9 @@ class GLSLDecompiler final { public: explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, ShaderType stage, std::string_view identifier, std::string_view suffix) - : device{device}, ir{ir}, registry{registry}, stage{stage}, - identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { + : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier}, + suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{ + UseUnifiedUniforms(device, ir, stage)} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -484,7 +493,7 @@ private: code.AddLine("switch (jmp_to) {{"); for (const auto& pair : ir.GetBasicBlocks()) { - const auto [address, bb] = pair; + const auto& [address, bb] = pair; code.AddLine("case 0x{:X}U: {{", address); ++code.scope; @@ -518,6 +527,9 @@ private: if (device.HasImageLoadFormatted()) { 
code.AddLine("#extension GL_EXT_shader_image_load_formatted : require"); } + if (device.HasTextureShadowLod()) { + code.AddLine("#extension GL_EXT_texture_shadow_lod : require"); + } if (device.HasWarpIntrinsics()) { code.AddLine("#extension GL_NV_gpu_shader5 : require"); code.AddLine("#extension GL_NV_shader_thread_group : require"); @@ -590,8 +602,15 @@ private: return; } const auto& info = registry.GetComputeInfo(); - if (const u32 size = info.shared_memory_size_in_words; size > 0) { - code.AddLine("shared uint smem[{}];", size); + if (u32 size = info.shared_memory_size_in_words * 4; size > 0) { + const u32 limit = device.GetMaxComputeSharedMemorySize(); + if (size > limit) { + LOG_ERROR(Render_OpenGL, "Shared memory size {} is clamped to host's limit {}", + size, limit); + size = limit; + } + + code.AddLine("shared uint smem[{}];", size / 4); code.AddNewLine(); } code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;", @@ -618,7 +637,9 @@ private: break; } } - if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) { + + if (stage != ShaderType::Geometry && + (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) { if (ir.UsesLayer()) { code.AddLine("int gl_Layer;"); } @@ -647,6 +668,16 @@ private: --code.scope; code.AddLine("}};"); code.AddNewLine(); + + if (stage == ShaderType::Geometry) { + if (ir.UsesLayer()) { + code.AddLine("out int gl_Layer;"); + } + if (ir.UsesViewportIndex()) { + code.AddLine("out int gl_ViewportIndex;"); + } + } + code.AddNewLine(); } void DeclareRegisters() { @@ -782,7 +813,7 @@ private: const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); const auto it = transform_feedback.find(location); if (it == transform_feedback.end()) { - return {}; + return std::nullopt; } return it->second.components; } @@ -834,11 +865,24 @@ private: } void DeclareConstantBuffers() { + if (use_unified_uniforms) { + const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + + static_cast<u32>(ir.GetGlobalMemory().size()); + code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", + binding); + code.AddLine(" uint cbufs[];"); + code.AddLine("}};"); + code.AddNewLine(); + return; + } + u32 binding = device.GetBaseBindings(stage).uniform_buffer; - for (const auto& [index, cbuf] : ir.GetConstantBuffers()) { + for (const auto [index, info] : ir.GetConstantBuffers()) { + const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; + const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements; code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, GetConstBufferBlock(index)); - code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS); + code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size); code.AddLine("}};"); code.AddNewLine(); } @@ -869,37 +913,37 @@ private: for (const auto& sampler : ir.GetSamplers()) { const std::string name = GetSampler(sampler); const std::string description = fmt::format("layout (binding = {}) uniform", binding); - binding += sampler.IsIndexed() ? sampler.Size() : 1; + binding += sampler.is_indexed ? 
sampler.size : 1; std::string sampler_type = [&]() { - if (sampler.IsBuffer()) { + if (sampler.is_buffer) { return "samplerBuffer"; } - switch (sampler.GetType()) { - case Tegra::Shader::TextureType::Texture1D: + switch (sampler.type) { + case TextureType::Texture1D: return "sampler1D"; - case Tegra::Shader::TextureType::Texture2D: + case TextureType::Texture2D: return "sampler2D"; - case Tegra::Shader::TextureType::Texture3D: + case TextureType::Texture3D: return "sampler3D"; - case Tegra::Shader::TextureType::TextureCube: + case TextureType::TextureCube: return "samplerCube"; default: UNREACHABLE(); return "sampler2D"; } }(); - if (sampler.IsArray()) { + if (sampler.is_array) { sampler_type += "Array"; } - if (sampler.IsShadow()) { + if (sampler.is_shadow) { sampler_type += "Shadow"; } - if (!sampler.IsIndexed()) { + if (!sampler.is_indexed) { code.AddLine("{} {} {};", description, sampler_type, name); } else { - code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.Size()); + code.AddLine("{} {} {}[{}];", description, sampler_type, name, sampler.size); } } if (!ir.GetSamplers().empty()) { @@ -945,14 +989,14 @@ private: u32 binding = device.GetBaseBindings(stage).image; for (const auto& image : ir.GetImages()) { std::string qualifier = "coherent volatile"; - if (image.IsRead() && !image.IsWritten()) { + if (image.is_read && !image.is_written) { qualifier += " readonly"; - } else if (image.IsWritten() && !image.IsRead()) { + } else if (image.is_written && !image.is_read) { qualifier += " writeonly"; } - const char* format = image.IsAtomic() ? "r32ui, " : ""; - const char* type_declaration = GetImageTypeDeclaration(image.GetType()); + const char* format = image.is_atomic ? "r32ui, " : ""; + const char* type_declaration = GetImageTypeDeclaration(image.type); code.AddLine("layout ({}binding = {}) {} uniform uimage{} {};", format, binding++, qualifier, type_declaration, GetImage(image)); } @@ -1037,42 +1081,51 @@ private: if (const auto cbuf = std::get_if<CbufNode>(&*node)) { const Node offset = cbuf->GetOffset(); + const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; + if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4), - Type::Uint}; + if (use_unified_uniforms) { + return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), + Type::Uint}; + } else { + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), + Type::Uint}; + } } - if (std::holds_alternative<OperationNode>(*offset)) { - // Indirect access - const std::string final_offset = code.GenerateTemporary(); - code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); + // Indirect access + if (use_unified_uniforms) { + return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, + Visit(offset).AsUint()), + Type::Uint}; + } - if (!device.HasComponentIndexingBug()) { - return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), - final_offset, final_offset), - Type::Uint}; - } + const std::string final_offset = code.GenerateTemporary(); + code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); - // AMD's proprietary GLSL compiler emits ill code for variable component access. 
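When UseUnifiedUniforms() reports that a stage declares more constant buffers than the device exposes uniform-buffer bindings (minus the one reserved for emulation), every cbuf access above is rewritten to index a single std430 "cbufs[]" array. The addressing in isolation; the 64 KiB Maxwell constant buffer size used for the per-cbuf window is an assumption of this sketch:

    constexpr u32 MAX_CONSTBUFFER_SCALARS = 0x10000 / sizeof(u32); // 16384 uints per cbuf window
    // Scalar index into cbufs[] for a given constant buffer slot and byte offset.
    constexpr u32 UnifiedCbufElement(u32 cbuf_index, u32 byte_offset) {
        return cbuf_index * MAX_CONSTBUFFER_SCALARS + byte_offset / 4;
    }
    static_assert(UnifiedCbufElement(3, 0x20) == 3 * 16384 + 8);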
- // To bypass this driver bug generate 4 ifs, one per each component. - const std::string pack = code.GenerateTemporary(); - code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), - final_offset); - - const std::string result = code.GenerateTemporary(); - code.AddLine("uint {};", result); - for (u32 swizzle = 0; swizzle < 4; ++swizzle) { - code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, - pack, GetSwizzle(swizzle)); - } - return {result, Type::Uint}; + if (!device.HasComponentIndexingBug()) { + return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), + final_offset, final_offset), + Type::Uint}; } - UNREACHABLE_MSG("Unmanaged offset node type"); + // AMD's proprietary GLSL compiler emits ill code for variable component access. + // To bypass this driver bug generate 4 ifs, one per each component. + const std::string pack = code.GenerateTemporary(); + code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), + final_offset); + + const std::string result = code.GenerateTemporary(); + code.AddLine("uint {};", result); + for (u32 swizzle = 0; swizzle < 4; ++swizzle) { + code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack, + GetSwizzle(swizzle)); + } + return {result, Type::Uint}; } if (const auto gmem = std::get_if<GmemNode>(&*node)) { @@ -1144,6 +1197,7 @@ private: return {"gl_FragCoord"s + GetSwizzle(element), Type::Float}; default: UNREACHABLE(); + return {"0", Type::Int}; } case Attribute::Index::FrontColor: return {"gl_Color"s + GetSwizzle(element), Type::Float}; @@ -1241,21 +1295,21 @@ private: switch (element) { case 0: UNIMPLEMENTED(); - return {}; + return std::nullopt; case 1: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_Layer", Type::Int}}; case 2: if (stage == ShaderType::Vertex && !device.HasVertexViewportLayer()) { - return {}; + return std::nullopt; } return {{"gl_ViewportIndex", Type::Int}}; case 3: return {{"gl_PointSize", Type::Float}}; } - return {}; + return std::nullopt; case Attribute::Index::FrontColor: return {{"gl_FrontColor"s + GetSwizzle(element), Type::Float}}; case Attribute::Index::FrontSecondaryColor: @@ -1278,7 +1332,7 @@ private: Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); - return {}; + return std::nullopt; } } @@ -1335,16 +1389,27 @@ private: ASSERT(meta); const std::size_t count = operation.GetOperandsCount(); - const bool has_array = meta->sampler.IsArray(); - const bool has_shadow = meta->sampler.IsShadow(); + const bool has_array = meta->sampler.is_array; + const bool has_shadow = meta->sampler.is_shadow; + const bool workaround_lod_array_shadow_as_grad = + !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube); + + std::string expr = "texture"; + + if (workaround_lod_array_shadow_as_grad) { + expr += "Grad"; + } else { + expr += function_suffix; + } - std::string expr = "texture" + function_suffix; if (!meta->aoffi.empty()) { expr += "Offset"; } else if (!meta->ptp.empty()) { expr += "Offsets"; } - if (!meta->sampler.IsIndexed()) { + if (!meta->sampler.is_indexed) { expr += '(' + GetSampler(meta->sampler) + ", "; } else { expr += '(' + GetSampler(meta->sampler) + '[' + Visit(meta->index).AsUint() + "], "; @@ -1372,6 +1437,18 @@ private: 
expr += ')'; } + if (workaround_lod_array_shadow_as_grad) { + switch (meta->sampler.type) { + case TextureType::Texture2D: + return expr + ", vec2(0.0), vec2(0.0))"; + case TextureType::TextureCube: + return expr + ", vec3(0.0), vec3(0.0))"; + default: + UNREACHABLE(); + break; + } + } + for (const auto& variant : extras) { if (const auto argument = std::get_if<TextureArgument>(&variant)) { expr += GenerateTextureArgument(*argument); @@ -1482,8 +1559,8 @@ private: dy += '('; for (std::size_t index = 0; index < components; ++index) { - const auto operand_x{derivates.at(index * 2)}; - const auto operand_y{derivates.at(index * 2 + 1)}; + const auto& operand_x{derivates.at(index * 2)}; + const auto& operand_y{derivates.at(index * 2 + 1)}; dx += Visit(operand_x).AsFloat(); dy += Visit(operand_y).AsFloat(); @@ -1536,7 +1613,9 @@ private: Expression target; if (const auto gpr = std::get_if<GprNode>(&*dest)) { if (gpr->GetIndex() == Register::ZeroIndex) { - // Writing to Register::ZeroIndex is a no op + // Writing to Register::ZeroIndex is a no op but we still have to visit the source + // as it might have side effects. + code.AddLine("{};", Visit(src).GetCode()); return {}; } target = {GetRegister(gpr->GetIndex()), Type::Float}; @@ -1838,38 +1917,48 @@ private: Type::HalfFloat}; } - template <Type type> - Expression LogicalLessThan(Operation operation) { - return GenerateBinaryInfix(operation, "<", Type::Bool, type, type); - } - - template <Type type> - Expression LogicalEqual(Operation operation) { - return GenerateBinaryInfix(operation, "==", Type::Bool, type, type); - } + template <const std::string_view& op, Type type, bool unordered = false> + Expression Comparison(Operation operation) { + static_assert(!unordered || type == Type::Float); - template <Type type> - Expression LogicalLessEqual(Operation operation) { - return GenerateBinaryInfix(operation, "<=", Type::Bool, type, type); - } + Expression expr = GenerateBinaryInfix(operation, op, Type::Bool, type, type); - template <Type type> - Expression LogicalGreaterThan(Operation operation) { - return GenerateBinaryInfix(operation, ">", Type::Bool, type, type); + if constexpr (op.compare("!=") == 0 && type == Type::Float && !unordered) { + // GLSL's operator!=(float, float) doesn't seem be ordered. This happens on both AMD's + // and Nvidia's proprietary stacks. Manually force an ordered comparison. + return {fmt::format("({} && !isnan({}) && !isnan({}))", expr.AsBool(), + VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; + } + if constexpr (!unordered) { + return expr; + } + // Unordered comparisons are always true for NaN operands. 
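In host terms, the ordered/unordered split introduced above means: an ordered comparison must be false whenever either operand is NaN, an unordered one must be true, and plain IEEE != already behaves as "unordered not-equal". A small standalone illustration of the two shapes the decompiler emits:

    #include <cmath>

    bool OrderedNotEqual(float a, float b) {
        // LT or GT only: reject NaN operands, as the GLSL workaround above does.
        return a != b && !std::isnan(a) && !std::isnan(b);
    }

    bool UnorderedEqual(float a, float b) {
        // EQU: additionally true whenever either operand is NaN.
        return a == b || std::isnan(a) || std::isnan(b);
    }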
+ return {fmt::format("({} || isnan({}) || isnan({}))", expr.AsBool(), + VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - template <Type type> - Expression LogicalNotEqual(Operation operation) { - return GenerateBinaryInfix(operation, "!=", Type::Bool, type, type); + Expression FOrdered(Operation operation) { + return {fmt::format("(!isnan({}) && !isnan({}))", VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - template <Type type> - Expression LogicalGreaterEqual(Operation operation) { - return GenerateBinaryInfix(operation, ">=", Type::Bool, type, type); + Expression FUnordered(Operation operation) { + return {fmt::format("(isnan({}) || isnan({}))", VisitOperand(operation, 0).AsFloat(), + VisitOperand(operation, 1).AsFloat()), + Type::Bool}; } - Expression LogicalFIsNan(Operation operation) { - return GenerateUnary(operation, "isnan", Type::Bool, Type::Float); + Expression LogicalAddCarry(Operation operation) { + const std::string carry = code.GenerateTemporary(); + code.AddLine("uint {};", carry); + code.AddLine("uaddCarry({}, {}, {});", VisitOperand(operation, 0).AsUint(), + VisitOperand(operation, 1).AsUint(), carry); + return {fmt::format("({} != 0)", carry), Type::Bool}; } Expression LogicalAssign(Operation operation) { @@ -1967,24 +2056,39 @@ private: } Expression Texture(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - - std::string expr = GenerateTexture( - operation, "", {TextureOffset{}, TextureArgument{Type::Float, meta->bias}}); - if (meta->sampler.IsShadow()) { - expr = "vec4(" + expr + ')'; + const auto meta = std::get<MetaTexture>(operation.GetMeta()); + const bool separate_dc = meta.sampler.type == TextureType::TextureCube && + meta.sampler.is_array && meta.sampler.is_shadow; + // TODO: Replace this with an array and make GenerateTexture use C++20 std::span + const std::vector<TextureIR> extras{ + TextureOffset{}, + TextureArgument{Type::Float, meta.bias}, + }; + std::string expr = GenerateTexture(operation, "", extras, separate_dc); + if (meta.sampler.is_shadow) { + expr = fmt::format("vec4({})", expr); } - return {expr + GetSwizzle(meta->element), Type::Float}; + return {expr + GetSwizzle(meta.element), Type::Float}; } Expression TextureLod(Operation operation) { const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - std::string expr = GenerateTexture( - operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); - if (meta->sampler.IsShadow()) { + std::string expr{}; + + if (!device.HasTextureShadowLod() && meta->sampler.is_shadow && + ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) || + meta->sampler.type == TextureType::TextureCube)) { + LOG_ERROR(Render_OpenGL, + "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround"); + expr = GenerateTexture(operation, "Lod", {}); + } else { + expr = GenerateTexture(operation, "Lod", + {TextureArgument{Type::Float, meta->lod}, TextureOffset{}}); + } + + if (meta->sampler.is_shadow) { expr = "vec4(" + expr + ')'; } return {expr + GetSwizzle(meta->element), Type::Float}; @@ -1993,11 +2097,11 @@ private: Expression TextureGather(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); - const auto type = meta.sampler.IsShadow() ? Type::Float : Type::Int; - const bool separate_dc = meta.sampler.IsShadow(); + const auto type = meta.sampler.is_shadow ? 
Type::Float : Type::Int; + const bool separate_dc = meta.sampler.is_shadow; std::vector<TextureIR> ir; - if (meta.sampler.IsShadow()) { + if (meta.sampler.is_shadow) { ir = {TextureOffset{}}; } else { ir = {TextureOffset{}, TextureArgument{type, meta.component}}; @@ -2042,7 +2146,7 @@ private: constexpr std::array constructors = {"int", "ivec2", "ivec3", "ivec4"}; const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); ASSERT(meta); - UNIMPLEMENTED_IF(meta->sampler.IsArray()); + UNIMPLEMENTED_IF(meta->sampler.is_array); const std::size_t count = operation.GetOperandsCount(); std::string expr = "texelFetch("; @@ -2063,7 +2167,7 @@ private: } expr += ')'; - if (meta->lod && !meta->sampler.IsBuffer()) { + if (meta->lod && !meta->sampler.is_buffer) { expr += ", "; expr += Visit(meta->lod).AsInt(); } @@ -2074,12 +2178,10 @@ private: } Expression TextureGradient(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - + const auto& meta = std::get<MetaTexture>(operation.GetMeta()); std::string expr = GenerateTexture(operation, "Grad", {TextureDerivates{}, TextureOffset{}}); - return {std::move(expr) + GetSwizzle(meta->element), Type::Float}; + return {std::move(expr) + GetSwizzle(meta.element), Type::Float}; } Expression ImageLoad(Operation operation) { @@ -2295,6 +2397,18 @@ private: return {"gl_SubGroupInvocationARB", Type::Uint}; } + template <const std::string_view& comparison> + Expression ThreadMask(Operation) { + if (device.HasWarpIntrinsics()) { + return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint}; + } + if (device.HasShaderBallot()) { + return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint}; + } + LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader"); + return {"0U", Type::Uint}; + } + Expression ShuffleIndexed(Operation operation) { std::string value = VisitOperand(operation, 0).AsFloat(); @@ -2307,7 +2421,21 @@ private: return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float}; } - Expression MemoryBarrierGL(Operation) { + Expression Barrier(Operation) { + if (!ir.IsDecompiled()) { + LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled"); + return {}; + } + code.AddLine("barrier();"); + return {}; + } + + Expression MemoryBarrierGroup(Operation) { + code.AddLine("groupMemoryBarrier();"); + return {}; + } + + Expression MemoryBarrierGlobal(Operation) { code.AddLine("memoryBarrier();"); return {}; } @@ -2316,6 +2444,19 @@ private: Func() = delete; ~Func() = delete; + static constexpr std::string_view LessThan = "<"; + static constexpr std::string_view Equal = "=="; + static constexpr std::string_view LessEqual = "<="; + static constexpr std::string_view GreaterThan = ">"; + static constexpr std::string_view NotEqual = "!="; + static constexpr std::string_view GreaterEqual = ">="; + + static constexpr std::string_view Eq = "Eq"; + static constexpr std::string_view Ge = "Ge"; + static constexpr std::string_view Gt = "Gt"; + static constexpr std::string_view Le = "Le"; + static constexpr std::string_view Lt = "Lt"; + static constexpr std::string_view Add = "Add"; static constexpr std::string_view Min = "Min"; static constexpr std::string_view Max = "Max"; @@ -2417,27 +2558,36 @@ private: &GLSLDecompiler::LogicalPick2, &GLSLDecompiler::LogicalAnd2, - &GLSLDecompiler::LogicalLessThan<Type::Float>, - &GLSLDecompiler::LogicalEqual<Type::Float>, - &GLSLDecompiler::LogicalLessEqual<Type::Float>, - 
&GLSLDecompiler::LogicalGreaterThan<Type::Float>, - &GLSLDecompiler::LogicalNotEqual<Type::Float>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Float>, - &GLSLDecompiler::LogicalFIsNan, - - &GLSLDecompiler::LogicalLessThan<Type::Int>, - &GLSLDecompiler::LogicalEqual<Type::Int>, - &GLSLDecompiler::LogicalLessEqual<Type::Int>, - &GLSLDecompiler::LogicalGreaterThan<Type::Int>, - &GLSLDecompiler::LogicalNotEqual<Type::Int>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Int>, - - &GLSLDecompiler::LogicalLessThan<Type::Uint>, - &GLSLDecompiler::LogicalEqual<Type::Uint>, - &GLSLDecompiler::LogicalLessEqual<Type::Uint>, - &GLSLDecompiler::LogicalGreaterThan<Type::Uint>, - &GLSLDecompiler::LogicalNotEqual<Type::Uint>, - &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>, + &GLSLDecompiler::Comparison<Func::LessThan, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Float, false>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Float, false>, + &GLSLDecompiler::FOrdered, + &GLSLDecompiler::FUnordered, + &GLSLDecompiler::Comparison<Func::LessThan, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Float, true>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Float, true>, + + &GLSLDecompiler::Comparison<Func::LessThan, Type::Int>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Int>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Int>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Int>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Int>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Int>, + + &GLSLDecompiler::Comparison<Func::LessThan, Type::Uint>, + &GLSLDecompiler::Comparison<Func::Equal, Type::Uint>, + &GLSLDecompiler::Comparison<Func::LessEqual, Type::Uint>, + &GLSLDecompiler::Comparison<Func::GreaterThan, Type::Uint>, + &GLSLDecompiler::Comparison<Func::NotEqual, Type::Uint>, + &GLSLDecompiler::Comparison<Func::GreaterEqual, Type::Uint>, + + &GLSLDecompiler::LogicalAddCarry, &GLSLDecompiler::Logical2HLessThan<false>, &GLSLDecompiler::Logical2HEqual<false>, @@ -2524,9 +2674,16 @@ private: &GLSLDecompiler::VoteEqual, &GLSLDecompiler::ThreadId, + &GLSLDecompiler::ThreadMask<Func::Eq>, + &GLSLDecompiler::ThreadMask<Func::Ge>, + &GLSLDecompiler::ThreadMask<Func::Gt>, + &GLSLDecompiler::ThreadMask<Func::Le>, + &GLSLDecompiler::ThreadMask<Func::Lt>, &GLSLDecompiler::ShuffleIndexed, - &GLSLDecompiler::MemoryBarrierGL, + &GLSLDecompiler::Barrier, + &GLSLDecompiler::MemoryBarrierGroup, + &GLSLDecompiler::MemoryBarrierGlobal, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); @@ -2596,11 +2753,11 @@ private: } std::string GetSampler(const Sampler& sampler) const { - return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); + return AppendSuffix(sampler.index, "sampler"); } std::string GetImage(const Image& image) const { - return AppendSuffix(static_cast<u32>(image.GetIndex()), "image"); + return AppendSuffix(image.index, "image"); } std::string AppendSuffix(u32 index, std::string_view name) const { @@ -2623,15 +2780,6 @@ private: return 
std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings); } - bool IsRenderTargetEnabled(u32 render_target) const { - for (u32 component = 0; component < 4; ++component) { - if (header.ps.IsColorComponentOutputEnabled(render_target, component)) { - return true; - } - } - return false; - } - const Device& device; const ShaderIR& ir; const Registry& registry; @@ -2639,6 +2787,7 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; + const bool use_unified_uniforms; std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; @@ -2834,7 +2983,7 @@ void GLSLDecompiler::DecompileAST() { } // Anonymous namespace -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { +ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) { ShaderEntries entries; for (const auto& cbuf : ir.GetConstantBuffers()) { entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), @@ -2855,6 +3004,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; } entries.shader_length = ir.GetLength(); + entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); return entries; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index e7dbd810c..451c9689a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -33,36 +33,19 @@ public: } private: - u32 index{}; + u32 index = 0; }; -class GlobalMemoryEntry { -public: - explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written) +struct GlobalMemoryEntry { + constexpr explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, + bool is_written) : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{ is_written} {} - u32 GetCbufIndex() const { - return cbuf_index; - } - - u32 GetCbufOffset() const { - return cbuf_offset; - } - - bool IsRead() const { - return is_read; - } - - bool IsWritten() const { - return is_written; - } - -private: - u32 cbuf_index{}; - u32 cbuf_offset{}; - bool is_read{}; - bool is_written{}; + u32 cbuf_index = 0; + u32 cbuf_offset = 0; + bool is_read = false; + bool is_written = false; }; struct ShaderEntries { @@ -70,11 +53,13 @@ struct ShaderEntries { std::vector<GlobalMemoryEntry> global_memory_entries; std::vector<SamplerEntry> samplers; std::vector<ImageEntry> images; - u32 clip_distances{}; std::size_t shader_length{}; + u32 clip_distances{}; + bool use_unified_uniforms{}; }; -ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir); +ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + Tegra::Engines::ShaderType stage); std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, const VideoCommon::Shader::Registry& registry, diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 9e95a122b..70dd0c3c6 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap; namespace { +using VideoCommon::Shader::SeparateSamplerKey; + using ShaderCacheVersionHash = std::array<u8, 64>; struct ConstBufferKey { @@ -37,18 +39,26 @@ struct ConstBufferKey { u32 
value = 0; }; -struct BoundSamplerKey { +struct BoundSamplerEntry { u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -struct BindlessSamplerKey { +struct SeparateSamplerEntry { + u32 cbuf1 = 0; + u32 cbuf2 = 0; + u32 offset1 = 0; + u32 offset2 = 0; + Tegra::Engines::SamplerDescriptor sampler; +}; + +struct BindlessSamplerEntry { u32 cbuf = 0; u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 20; +constexpr u32 NativeVersion = 21; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -63,7 +73,7 @@ ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default; ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default; -bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { +bool ShaderDiskCacheEntry::Load(Common::FS::IOFile& file) { if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) { return false; } @@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { u32 texture_handler_size_value; u32 num_keys; u32 num_bound_samplers; + u32 num_separate_samplers; u32 num_bindless_samplers; if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || file.ReadArray(&is_texture_handler_size_known, 1) != 1 || file.ReadArray(&texture_handler_size_value, 1) != 1 || file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_separate_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { return false; } @@ -101,29 +113,38 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { } std::vector<ConstBufferKey> flat_keys(num_keys); - std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers); + std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers); + std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers); + std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers); if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != flat_bound_samplers.size() || + file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) != + flat_separate_samplers.size() || file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != flat_bindless_samplers.size()) { return false; } - for (const auto& key : flat_keys) { - keys.insert({{key.cbuf, key.offset}, key.value}); + for (const auto& entry : flat_keys) { + keys.insert({{entry.cbuf, entry.offset}, entry.value}); + } + for (const auto& entry : flat_bound_samplers) { + bound_samplers.emplace(entry.offset, entry.sampler); } - for (const auto& key : flat_bound_samplers) { - bound_samplers.emplace(key.offset, key.sampler); + for (const auto& entry : flat_separate_samplers) { + SeparateSamplerKey key; + key.buffers = {entry.cbuf1, entry.cbuf2}; + key.offsets = {entry.offset1, entry.offset2}; + separate_samplers.emplace(key, entry.sampler); } - for (const auto& key : flat_bindless_samplers) { - bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + for (const auto& entry : flat_bindless_samplers) { + bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler}); } return true; } -bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { +bool ShaderDiskCacheEntry::Save(Common::FS::IOFile& file) const 
{ if (file.WriteObject(static_cast<u32>(type)) != 1 || file.WriteObject(static_cast<u32>(code.size())) != 1 || file.WriteObject(static_cast<u32>(code_b.size())) != 1) { @@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || file.WriteObject(static_cast<u32>(keys.size())) != 1 || file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 || + file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 || file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) { return false; } @@ -152,48 +174,64 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); } - std::vector<BoundSamplerKey> flat_bound_samplers; + std::vector<BoundSamplerEntry> flat_bound_samplers; flat_bound_samplers.reserve(bound_samplers.size()); for (const auto& [address, sampler] : bound_samplers) { - flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler}); } - std::vector<BindlessSamplerKey> flat_bindless_samplers; + std::vector<SeparateSamplerEntry> flat_separate_samplers; + flat_separate_samplers.reserve(separate_samplers.size()); + for (const auto& [key, sampler] : separate_samplers) { + SeparateSamplerEntry entry; + std::tie(entry.cbuf1, entry.cbuf2) = key.buffers; + std::tie(entry.offset1, entry.offset2) = key.offsets; + entry.sampler = sampler; + flat_separate_samplers.push_back(entry); + } + + std::vector<BindlessSamplerEntry> flat_bindless_samplers; flat_bindless_samplers.reserve(bindless_samplers.size()); for (const auto& [address, sampler] : bindless_samplers) { flat_bindless_samplers.push_back( - BindlessSamplerKey{address.first, address.second, sampler}); + BindlessSamplerEntry{address.first, address.second, sampler}); } return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == flat_bound_samplers.size() && + file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) == + flat_separate_samplers.size() && file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == flat_bindless_samplers.size(); } -ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} +ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL() = default; ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; +void ShaderDiskCacheOpenGL::BindTitleID(u64 title_id_) { + title_id = title_id_; +} + std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() { // Skip games without title id - const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0; - if (!Settings::values.use_disk_shader_cache || !has_title_id) { - return {}; + const bool has_title_id = title_id != 0; + if (!Settings::values.use_disk_shader_cache.GetValue() || !has_title_id) { + return std::nullopt; } - FileUtil::IOFile file(GetTransferablePath(), "rb"); + Common::FS::IOFile file(GetTransferablePath(), "rb"); if (!file.IsOpen()) { LOG_INFO(Render_OpenGL, "No transferable shader cache found"); is_usable = true; - return {}; + return std::nullopt; } u32 version{}; if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) { LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it"); - return {}; + return std::nullopt; } if (version < NativeVersion) { @@ -201,12 +239,12 
@@ std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTran file.Close(); InvalidateTransferable(); is_usable = true; - return {}; + return std::nullopt; } if (version > NativeVersion) { LOG_WARNING(Render_OpenGL, "Transferable shader cache was generated with a newer version " "of the emulator, skipping"); - return {}; + return std::nullopt; } // Version is valid, load the shaders @@ -215,7 +253,7 @@ std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTran ShaderDiskCacheEntry& entry = entries.emplace_back(); if (!entry.Load(file)) { LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping"); - return {}; + return std::nullopt; } } @@ -228,7 +266,7 @@ std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() return {}; } - FileUtil::IOFile file(GetPrecompiledPath(), "rb"); + Common::FS::IOFile file(GetPrecompiledPath(), "rb"); if (!file.IsOpen()) { LOG_INFO(Render_OpenGL, "No precompiled shader cache found"); return {}; @@ -245,7 +283,7 @@ std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() } std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile( - FileUtil::IOFile& file) { + Common::FS::IOFile& file) { // Read compressed file from disk and decompress to virtual precompiled cache file std::vector<u8> compressed(file.GetSize()); file.ReadBytes(compressed.data(), compressed.size()); @@ -256,12 +294,12 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo ShaderCacheVersionHash file_hash{}; if (!LoadArrayFromPrecompiled(file_hash.data(), file_hash.size())) { precompiled_cache_virtual_file_offset = 0; - return {}; + return std::nullopt; } if (GetShaderCacheVersionHash() != file_hash) { LOG_INFO(Render_OpenGL, "Precompiled cache is from another version of the emulator"); precompiled_cache_virtual_file_offset = 0; - return {}; + return std::nullopt; } std::vector<ShaderDiskCachePrecompiled> entries; @@ -271,19 +309,19 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo if (!LoadObjectFromPrecompiled(entry.unique_identifier) || !LoadObjectFromPrecompiled(entry.binary_format) || !LoadObjectFromPrecompiled(binary_size)) { - return {}; + return std::nullopt; } entry.binary.resize(binary_size); if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) { - return {}; + return std::nullopt; } } return entries; } void ShaderDiskCacheOpenGL::InvalidateTransferable() { - if (!FileUtil::Delete(GetTransferablePath())) { + if (!Common::FS::Delete(GetTransferablePath())) { LOG_ERROR(Render_OpenGL, "Failed to invalidate transferable file={}", GetTransferablePath()); } @@ -294,7 +332,7 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() { // Clear virtaul precompiled cache file precompiled_cache_virtual_file.Resize(0); - if (!FileUtil::Delete(GetPrecompiledPath())) { + if (!Common::FS::Delete(GetPrecompiledPath())) { LOG_ERROR(Render_OpenGL, "Failed to invalidate precompiled file={}", GetPrecompiledPath()); } } @@ -310,7 +348,7 @@ void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) { return; } - FileUtil::IOFile file = AppendTransferableFile(); + Common::FS::IOFile file = AppendTransferableFile(); if (!file.IsOpen()) { return; } @@ -352,15 +390,15 @@ void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint progra } } -FileUtil::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const { +Common::FS::IOFile 
ShaderDiskCacheOpenGL::AppendTransferableFile() const { if (!EnsureDirectories()) { return {}; } const auto transferable_path{GetTransferablePath()}; - const bool existed = FileUtil::Exists(transferable_path); + const bool existed = Common::FS::Exists(transferable_path); - FileUtil::IOFile file(transferable_path, "ab"); + Common::FS::IOFile file(transferable_path, "ab"); if (!file.IsOpen()) { LOG_ERROR(Render_OpenGL, "Failed to open transferable cache in path={}", transferable_path); return {}; @@ -392,7 +430,7 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { Common::Compression::CompressDataZSTDDefault(uncompressed.data(), uncompressed.size()); const auto precompiled_path{GetPrecompiledPath()}; - FileUtil::IOFile file(precompiled_path, "wb"); + Common::FS::IOFile file(precompiled_path, "wb"); if (!file.IsOpen()) { LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path); @@ -406,24 +444,24 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { bool ShaderDiskCacheOpenGL::EnsureDirectories() const { const auto CreateDir = [](const std::string& dir) { - if (!FileUtil::CreateDir(dir)) { + if (!Common::FS::CreateDir(dir)) { LOG_ERROR(Render_OpenGL, "Failed to create directory={}", dir); return false; } return true; }; - return CreateDir(FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir)) && + return CreateDir(Common::FS::GetUserPath(Common::FS::UserPath::ShaderDir)) && CreateDir(GetBaseDir()) && CreateDir(GetTransferableDir()) && CreateDir(GetPrecompiledDir()); } std::string ShaderDiskCacheOpenGL::GetTransferablePath() const { - return FileUtil::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); + return Common::FS::SanitizePath(GetTransferableDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); } std::string ShaderDiskCacheOpenGL::GetPrecompiledPath() const { - return FileUtil::SanitizePath(GetPrecompiledDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); + return Common::FS::SanitizePath(GetPrecompiledDir() + DIR_SEP_CHR + GetTitleID() + ".bin"); } std::string ShaderDiskCacheOpenGL::GetTransferableDir() const { @@ -435,11 +473,11 @@ std::string ShaderDiskCacheOpenGL::GetPrecompiledDir() const { } std::string ShaderDiskCacheOpenGL::GetBaseDir() const { - return FileUtil::GetUserPath(FileUtil::UserPath::ShaderDir) + DIR_SEP "opengl"; + return Common::FS::GetUserPath(Common::FS::UserPath::ShaderDir) + DIR_SEP "opengl"; } std::string ShaderDiskCacheOpenGL::GetTitleID() const { - return fmt::format("{:016X}", system.CurrentProcess()->GetTitleID()); + return fmt::format("{:016X}", title_id); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index d5be52e40..aef841c1d 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -21,11 +21,7 @@ #include "video_core/engines/shader_type.h" #include "video_core/shader/registry.h" -namespace Core { -class System; -} - -namespace FileUtil { +namespace Common::FS { class IOFile; } @@ -38,9 +34,9 @@ struct ShaderDiskCacheEntry { ShaderDiskCacheEntry(); ~ShaderDiskCacheEntry(); - bool Load(FileUtil::IOFile& file); + bool Load(Common::FS::IOFile& file); - bool Save(FileUtil::IOFile& file) const; + bool Save(Common::FS::IOFile& file) const; bool HasProgramA() const { return !code.empty() && !code_b.empty(); @@ -57,6 +53,7 @@ struct ShaderDiskCacheEntry { VideoCommon::Shader::ComputeInfo compute_info; VideoCommon::Shader::KeyMap 
keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::SeparateSamplerMap separate_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; @@ -69,9 +66,12 @@ struct ShaderDiskCachePrecompiled { class ShaderDiskCacheOpenGL { public: - explicit ShaderDiskCacheOpenGL(Core::System& system); + explicit ShaderDiskCacheOpenGL(); ~ShaderDiskCacheOpenGL(); + /// Binds a title ID for all future operations. + void BindTitleID(u64 title_id); + /// Loads transferable cache. If the file has an old version or on failure, it deletes the file. std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable(); @@ -96,10 +96,10 @@ public: private: /// Loads the precompiled cache. Returns empty on failure. std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile( - FileUtil::IOFile& file); + Common::FS::IOFile& file); /// Opens the current game's transferable file and writes its header if it doesn't exist - FileUtil::IOFile AppendTransferableFile() const; + Common::FS::IOFile AppendTransferableFile() const; /// Save precompiled header to precompiled_cache_in_memory void SavePrecompiledHeaderToVirtualPrecompiledCache(); @@ -156,8 +156,6 @@ private: return LoadArrayFromPrecompiled(&object, 1); } - Core::System& system; - // Stores whole precompiled cache which will be read from or saved to the precompiled cache // file FileSys::VectorVfsFile precompiled_cache_virtual_file; @@ -167,8 +165,11 @@ private: // Stored transferable shaders std::unordered_set<u64> stored_transferable; + /// Title ID to operate on + u64 title_id = 0; + // The cache has been loaded at boot - bool is_usable{}; + bool is_usable = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 9c7b0adbd..691c6c79b 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -6,47 +6,124 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_shader_manager.h" -namespace OpenGL::GLShader { +namespace OpenGL { -ProgramManager::ProgramManager() = default; +namespace { + +void BindProgram(GLenum stage, GLuint current, GLuint old, bool& enabled) { + if (current == old) { + return; + } + if (current == 0) { + if (enabled) { + enabled = false; + glDisable(stage); + } + return; + } + if (!enabled) { + enabled = true; + glEnable(stage); + } + glBindProgramARB(stage, current); +} + +} // Anonymous namespace + +ProgramManager::ProgramManager(const Device& device) + : use_assembly_programs{device.UseAssemblyShaders()} { + if (use_assembly_programs) { + glEnable(GL_COMPUTE_PROGRAM_NV); + } else { + graphics_pipeline.Create(); + glBindProgramPipeline(graphics_pipeline.handle); + } +} ProgramManager::~ProgramManager() = default; -void ProgramManager::Create() { - graphics_pipeline.Create(); - glBindProgramPipeline(graphics_pipeline.handle); +void ProgramManager::BindCompute(GLuint program) { + if (use_assembly_programs) { + glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program); + } else { + is_graphics_bound = false; + glUseProgram(program); + } } void ProgramManager::BindGraphicsPipeline() { + if (!use_assembly_programs) { + UpdateSourcePrograms(); + } +} + +void ProgramManager::BindHostPipeline(GLuint pipeline) { + if (use_assembly_programs) { + if (geometry_enabled) { + geometry_enabled = false; + old_state.geometry = 0; +
glDisable(GL_GEOMETRY_PROGRAM_NV); + } + } else { + if (!is_graphics_bound) { + glUseProgram(0); + } + } + glBindProgramPipeline(pipeline); +} + +void ProgramManager::RestoreGuestPipeline() { + if (use_assembly_programs) { + glBindProgramPipeline(0); + } else { + glBindProgramPipeline(graphics_pipeline.handle); + } +} + +void ProgramManager::UseVertexShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled); + } + current_state.vertex = program; +} + +void ProgramManager::UseGeometryShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_GEOMETRY_PROGRAM_NV, program, current_state.geometry, geometry_enabled); + } + current_state.geometry = program; +} + +void ProgramManager::UseFragmentShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_FRAGMENT_PROGRAM_NV, program, current_state.fragment, fragment_enabled); + } + current_state.fragment = program; +} + +void ProgramManager::UpdateSourcePrograms() { if (!is_graphics_bound) { is_graphics_bound = true; glUseProgram(0); } - // Avoid updating the pipeline when values have no changed - if (old_state == current_state) { - return; - } - - // Workaround for AMD bug - static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | - GL_FRAGMENT_SHADER_BIT}; const GLuint handle = graphics_pipeline.handle; - glUseProgramStages(handle, all_used_stages, 0); - glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); - glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); - glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); + const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) { + if (current == old) { + return; + } + glUseProgramStages(handle, stage, current); + }; + update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex); + update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry); + update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment); old_state = current_state; } -void ProgramManager::BindComputeShader(GLuint program) { - is_graphics_bound = false; - glUseProgram(program); -} - void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { const auto& regs = maxwell.regs; @@ -54,4 +131,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f; } -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index d2e47f2a9..950e0dfcb 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -11,7 +11,9 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" -namespace OpenGL::GLShader { +namespace OpenGL { + +class Device; /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned /// @note Always keep a vec4 at the end. 
The GL spec is not clear whether the alignment at @@ -28,50 +30,47 @@ static_assert(sizeof(MaxwellUniformData) < 16384, class ProgramManager { public: - explicit ProgramManager(); + explicit ProgramManager(const Device& device); ~ProgramManager(); - void Create(); + /// Binds a compute program + void BindCompute(GLuint program); - /// Updates the graphics pipeline and binds it. + /// Updates bound programs. void BindGraphicsPipeline(); - /// Binds a compute shader. - void BindComputeShader(GLuint program); - - void UseVertexShader(GLuint program) { - current_state.vertex_shader = program; - } + /// Binds an OpenGL pipeline object unsynchronized with the guest state. + void BindHostPipeline(GLuint pipeline); - void UseGeometryShader(GLuint program) { - current_state.geometry_shader = program; - } + /// Rewinds BindHostPipeline state changes. + void RestoreGuestPipeline(); - void UseFragmentShader(GLuint program) { - current_state.fragment_shader = program; - } + void UseVertexShader(GLuint program); + void UseGeometryShader(GLuint program); + void UseFragmentShader(GLuint program); private: struct PipelineState { - bool operator==(const PipelineState& rhs) const noexcept { - return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && - geometry_shader == rhs.geometry_shader; - } - - bool operator!=(const PipelineState& rhs) const noexcept { - return !operator==(rhs); - } - - GLuint vertex_shader = 0; - GLuint fragment_shader = 0; - GLuint geometry_shader = 0; + GLuint vertex = 0; + GLuint geometry = 0; + GLuint fragment = 0; }; + /// Update GLSL programs. + void UpdateSourcePrograms(); + OGLPipeline graphics_pipeline; - OGLPipeline compute_pipeline; + PipelineState current_state; PipelineState old_state; + + bool use_assembly_programs = false; + bool is_graphics_bound = true; + + bool vertex_enabled = false; + bool geometry_enabled = false; + bool fragment_enabled = false; }; -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 9e74eda0d..4bf0d6090 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
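For context, a minimal sketch of how a caller is expected to drive the reworked ProgramManager declared above. The Device reference and the shader handles are assumed to come from the renderer; the helper name is illustrative only:

    // Hypothetical helper: binds guest graphics shaders through the new interface.
    void BindGuestShaders(const Device& device, GLuint vertex, GLuint fragment) {
        ProgramManager program_manager{device};    // picks GLSL separable or NV assembly path
        program_manager.UseVertexShader(vertex);
        program_manager.UseGeometryShader(0);      // 0 leaves the geometry stage unbound
        program_manager.UseFragmentShader(fragment);
        program_manager.BindGraphicsPipeline();    // flushes pending stage changes on the GLSL path
    }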
+#include <string_view> #include <vector> #include <glad/glad.h> #include "common/assert.h" @@ -11,7 +12,8 @@ namespace OpenGL::GLShader { namespace { -const char* GetStageDebugName(GLenum type) { + +std::string_view StageDebugName(GLenum type) { switch (type) { case GL_VERTEX_SHADER: return "vertex"; @@ -25,12 +27,17 @@ const char* GetStageDebugName(GLenum type) { UNIMPLEMENTED(); return "unknown"; } + } // Anonymous namespace -GLuint LoadShader(const char* source, GLenum type) { - const char* debug_type = GetStageDebugName(type); +GLuint LoadShader(std::string_view source, GLenum type) { + const std::string_view debug_type = StageDebugName(type); const GLuint shader_id = glCreateShader(type); - glShaderSource(shader_id, 1, &source, nullptr); + + const GLchar* source_string = source.data(); + const GLint source_length = static_cast<GLint>(source.size()); + + glShaderSource(shader_id, 1, &source_string, &source_length); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); glCompileShader(shader_id); diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h index 03b7548c2..1b770532e 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.h +++ b/src/video_core/renderer_opengl/gl_shader_util.h @@ -38,7 +38,7 @@ void LogShaderSource(T... shaders) { * @param source String of the GLSL shader program * @param type Type of the shader (GL_VERTEX_SHADER, GL_GEOMETRY_SHADER or GL_FRAGMENT_SHADER) */ -GLuint LoadShader(const char* source, GLenum type); +GLuint LoadShader(std::string_view source, GLenum type); /** * Utility function to create and compile an OpenGL GLSL shader program (vertex + fragment shader) diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index d24fad3de..6bcf831f2 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -214,10 +214,8 @@ void SetupDirtyMisc(Tables& tables) { } // Anonymous namespace -StateTracker::StateTracker(Core::System& system) : system{system} {} - -void StateTracker::Initialize() { - auto& dirty = system.GPU().Maxwell3D().dirty; +StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { + auto& dirty = gpu.Maxwell3D().dirty; auto& tables = dirty.tables; SetupDirtyRenderTargets(tables); SetupDirtyColorMasks(tables); diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 0f823288e..9d127548f 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -13,8 +13,8 @@ #include "video_core/dirty_flags.h" #include "video_core/engines/maxwell_3d.h" -namespace Core { -class System; +namespace Tegra { +class GPU; } namespace OpenGL { @@ -90,9 +90,7 @@ static_assert(Last <= std::numeric_limits<u8>::max()); class StateTracker { public: - explicit StateTracker(Core::System& system); - - void Initialize(); + explicit StateTracker(Tegra::GPU& gpu); void BindIndexBuffer(GLuint new_index_buffer) { if (index_buffer == new_index_buffer) { @@ -103,7 +101,6 @@ public: } void NotifyScreenDrawVertexArray() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::VertexFormats] = true; flags[OpenGL::Dirty::VertexFormat0 + 0] = true; flags[OpenGL::Dirty::VertexFormat0 + 1] = true; @@ -117,98 +114,81 @@ public: } void NotifyPolygonModes() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; 
flags[OpenGL::Dirty::PolygonModes] = true; flags[OpenGL::Dirty::PolygonModeFront] = true; flags[OpenGL::Dirty::PolygonModeBack] = true; } void NotifyViewport0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Viewports] = true; flags[OpenGL::Dirty::Viewport0] = true; } void NotifyScissor0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::Scissors] = true; flags[OpenGL::Dirty::Scissor0] = true; } void NotifyColorMask0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ColorMasks] = true; flags[OpenGL::Dirty::ColorMask0] = true; } void NotifyBlend0() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::BlendStates] = true; flags[OpenGL::Dirty::BlendState0] = true; } void NotifyFramebuffer() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[VideoCommon::Dirty::RenderTargets] = true; } void NotifyFrontFace() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FrontFace] = true; } void NotifyCullTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::CullTest] = true; } void NotifyDepthMask() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthMask] = true; } void NotifyDepthTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::DepthTest] = true; } void NotifyStencilTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::StencilTest] = true; } void NotifyPolygonOffset() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::PolygonOffset] = true; } void NotifyRasterizeEnable() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::RasterizeEnable] = true; } void NotifyFramebufferSRGB() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::FramebufferSRGB] = true; } void NotifyLogicOp() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::LogicOp] = true; } void NotifyClipControl() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::ClipControl] = true; } void NotifyAlphaTest() { - auto& flags = system.GPU().Maxwell3D().dirty.flags; flags[OpenGL::Dirty::AlphaTest] = true; } private: - Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; GLuint index_buffer = 0; }; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 6ec328c53..887995cf4 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -2,11 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
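To make the intent of the StateTracker change above concrete, a short sketch of the new wiring, assuming the frontend supplies the Tegra::GPU reference; the helper name is illustrative only:

    // Hypothetical helper: the tracker caches Maxwell3D's dirty flags at construction,
    // so notifications no longer go through Core::System on every call.
    void MarkScreenDrawDirty(Tegra::GPU& gpu) {
        StateTracker state_tracker{gpu};              // stores gpu.Maxwell3D().dirty.flags
        state_tracker.NotifyScreenDrawVertexArray();  // sets vertex format/buffer dirty bits
        state_tracker.NotifyFramebuffer();            // marks render targets dirty
    }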
-#include <deque> +#include <tuple> #include <vector> + #include "common/alignment.h" #include "common/assert.h" #include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", @@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent, - bool use_persistent) +OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage) : buffer_size(size) { gl_buffer.Create(); @@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p allocate_size *= 2; } - if (use_persistent) { - persistent = true; - coherent = prefer_coherent; - const GLbitfield flags = - GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); - glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); - mapped_ptr = static_cast<u8*>(glMapNamedBufferRange( - gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT))); - } else { - glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW); + static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; + glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); + mapped_ptr = static_cast<u8*>( + glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); + + if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { + glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); } } OGLStreamBuffer::~OGLStreamBuffer() { - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } + glUnmapNamedBuffer(gl_buffer.handle); gl_buffer.Release(); } -GLuint OGLStreamBuffer::GetHandle() const { - return gl_buffer.handle; -} - -GLsizeiptr OGLStreamBuffer::GetSize() const { - return buffer_size; -} - std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { ASSERT(size <= buffer_size); ASSERT(alignment <= buffer_size); @@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a bool invalidate = false; if (buffer_pos + size > buffer_size) { + MICROPROFILE_SCOPE(OpenGL_StreamBuffer); + glInvalidateBufferData(gl_buffer.handle); + buffer_pos = 0; invalidate = true; - - if (persistent) { - glUnmapNamedBuffer(gl_buffer.handle); - } } - if (invalidate || !persistent) { - MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) | - (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) | - (invalidate ? 
GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT); - mapped_ptr = static_cast<u8*>( - glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags)); - mapped_offset = buffer_pos; - } - - return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate); + return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate); } void OGLStreamBuffer::Unmap(GLsizeiptr size) { ASSERT(size <= mapped_size); - if (!coherent && size > 0) { - glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size); - } - - if (!persistent) { - glUnmapNamedBuffer(gl_buffer.handle); + if (size > 0) { + glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size); } buffer_pos += size; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index f8383cbd4..307a67113 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -11,15 +11,13 @@ namespace OpenGL { +class Device; + class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false, - bool use_persistent = true); + explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage); ~OGLStreamBuffer(); - GLuint GetHandle() const; - GLsizeiptr GetSize() const; - /* * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes * and the optional alignment requirement. @@ -32,15 +30,24 @@ public: void Unmap(GLsizeiptr size); + GLuint Handle() const { + return gl_buffer.handle; + } + + u64 Address() const { + return gpu_address; + } + + GLsizeiptr Size() const noexcept { + return buffer_size; + } + private: OGLBuffer gl_buffer; - bool coherent = false; - bool persistent = false; - + GLuint64EXT gpu_address = 0; GLintptr buffer_pos = 0; GLsizeiptr buffer_size = 0; - GLintptr mapped_offset = 0; GLsizeiptr mapped_size = 0; u8* mapped_ptr = nullptr; }; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 2729d1265..a863ef218 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -35,96 +35,109 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy", namespace { struct FormatTuple { - GLint internal_format; + GLenum internal_format; GLenum format = GL_NONE; GLenum type = GL_NONE; }; constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // ABGR8U - {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // ABGR8S - {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE}, // ABGR8UI - {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV}, // B5G6R5U - {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10U - {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5U - {GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8U - {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}, // R8UI - {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT}, // RGBA16F - {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT}, // RGBA16U - {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT}, // RGBA16S - {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT}, // RGBA16UI - {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV}, // R11FG11FB10F - {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT}, // RGBA32UI - {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT}, // DXT1 - 
{GL_COMPRESSED_RGBA_S3TC_DXT3_EXT}, // DXT23 - {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT}, // DXT45 - {GL_COMPRESSED_RED_RGTC1}, // DXN1 - {GL_COMPRESSED_RG_RGTC2}, // DXN2UNORM - {GL_COMPRESSED_SIGNED_RG_RGTC2}, // DXN2SNORM - {GL_COMPRESSED_RGBA_BPTC_UNORM}, // BC7U - {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UF16 - {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SF16 - {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4 - {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE}, // BGRA8 - {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // RGBA32F - {GL_RG32F, GL_RG, GL_FLOAT}, // RG32F - {GL_R32F, GL_RED, GL_FLOAT}, // R32F - {GL_R16F, GL_RED, GL_HALF_FLOAT}, // R16F - {GL_R16, GL_RED, GL_UNSIGNED_SHORT}, // R16U - {GL_R16_SNORM, GL_RED, GL_SHORT}, // R16S - {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}, // R16UI - {GL_R16I, GL_RED_INTEGER, GL_SHORT}, // R16I - {GL_RG16, GL_RG, GL_UNSIGNED_SHORT}, // RG16 - {GL_RG16F, GL_RG, GL_HALF_FLOAT}, // RG16F - {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT}, // RG16UI - {GL_RG16I, GL_RG_INTEGER, GL_SHORT}, // RG16I - {GL_RG16_SNORM, GL_RG, GL_SHORT}, // RG16S - {GL_RGB32F, GL_RGB, GL_FLOAT}, // RGB32F - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // RGBA8_SRGB - {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // RG8U - {GL_RG8_SNORM, GL_RG, GL_BYTE}, // RG8S - {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT}, // RG32UI - {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT}, // RGBX16F - {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}, // R32UI - {GL_R32I, GL_RED_INTEGER, GL_INT}, // R32I - {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8 - {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5 - {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4 - {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // BGRA8 + {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_UNORM + {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // A8B8G8R8_SNORM + {GL_RGBA8I, GL_RGBA_INTEGER, GL_BYTE}, // A8B8G8R8_SINT + {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE}, // A8B8G8R8_UINT + {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5}, // R5G6B5_UNORM + {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV}, // B5G6R5_UNORM + {GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1R5G5B5_UNORM + {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UNORM + {GL_RGB10_A2UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UINT + {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5_UNORM + {GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8_UNORM + {GL_R8_SNORM, GL_RED, GL_BYTE}, // R8_SNORM + {GL_R8I, GL_RED_INTEGER, GL_BYTE}, // R8_SINT + {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE}, // R8_UINT + {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT}, // R16G16B16A16_FLOAT + {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT}, // R16G16B16A16_UNORM + {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT}, // R16G16B16A16_SNORM + {GL_RGBA16I, GL_RGBA_INTEGER, GL_SHORT}, // R16G16B16A16_SINT + {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT}, // R16G16B16A16_UINT + {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV}, // B10G11R11_FLOAT + {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT}, // R32G32B32A32_UINT + {GL_COMPRESSED_RGBA_S3TC_DXT1_EXT}, // BC1_RGBA_UNORM + {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT}, // BC2_UNORM + {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT}, // BC3_UNORM + {GL_COMPRESSED_RED_RGTC1}, // BC4_UNORM + {GL_COMPRESSED_SIGNED_RED_RGTC1}, // BC4_SNORM + {GL_COMPRESSED_RG_RGTC2}, // BC5_UNORM + {GL_COMPRESSED_SIGNED_RG_RGTC2}, // BC5_SNORM + {GL_COMPRESSED_RGBA_BPTC_UNORM}, // BC7_UNORM + {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UFLOAT + 
{GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SFLOAT + {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4_UNORM + {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM + {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // R32G32B32A32_FLOAT + {GL_RGBA32I, GL_RGBA_INTEGER, GL_INT}, // R32G32B32A32_SINT + {GL_RG32F, GL_RG, GL_FLOAT}, // R32G32_FLOAT + {GL_RG32I, GL_RG_INTEGER, GL_INT}, // R32G32_SINT + {GL_R32F, GL_RED, GL_FLOAT}, // R32_FLOAT + {GL_R16F, GL_RED, GL_HALF_FLOAT}, // R16_FLOAT + {GL_R16, GL_RED, GL_UNSIGNED_SHORT}, // R16_UNORM + {GL_R16_SNORM, GL_RED, GL_SHORT}, // R16_SNORM + {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT}, // R16_UINT + {GL_R16I, GL_RED_INTEGER, GL_SHORT}, // R16_SINT + {GL_RG16, GL_RG, GL_UNSIGNED_SHORT}, // R16G16_UNORM + {GL_RG16F, GL_RG, GL_HALF_FLOAT}, // R16G16_FLOAT + {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT}, // R16G16_UINT + {GL_RG16I, GL_RG_INTEGER, GL_SHORT}, // R16G16_SINT + {GL_RG16_SNORM, GL_RG, GL_SHORT}, // R16G16_SNORM + {GL_RGB32F, GL_RGB, GL_FLOAT}, // R32G32B32_FLOAT + {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_SRGB + {GL_RG8, GL_RG, GL_UNSIGNED_BYTE}, // R8G8_UNORM + {GL_RG8_SNORM, GL_RG, GL_BYTE}, // R8G8_SNORM + {GL_RG8I, GL_RG_INTEGER, GL_BYTE}, // R8G8_SINT + {GL_RG8UI, GL_RG_INTEGER, GL_UNSIGNED_BYTE}, // R8G8_UINT + {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT}, // R32G32_UINT + {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT}, // R16G16B16X16_FLOAT + {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT}, // R32_UINT + {GL_R32I, GL_RED_INTEGER, GL_INT}, // R32_SINT + {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8_UNORM + {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM + {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM + {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM // Compressed sRGB formats - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // DXT1_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // DXT23_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // DXT45_SRGB - {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7U_SRGB - {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // R4G4B4A4U + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB + {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7_SRGB + {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // A4B4G4R4_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR}, // ASTC_2D_4X4_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR}, // ASTC_2D_8X8_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR}, // ASTC_2D_8X5_SRGB {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR}, // ASTC_2D_5X4_SRGB - {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5 + {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR}, // ASTC_2D_5X5_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8 + {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR}, // ASTC_2D_10X8_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6 + {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10 + {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB - {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12 + {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB - 
{GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6 + {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR}, // ASTC_2D_8X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5 + {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR}, // ASTC_2D_6X5_SRGB - {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9F + {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT // Depth formats - {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // Z32F - {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // Z16 + {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT + {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM // DepthStencil formats - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // Z24S8 - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8Z24 - {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // Z32FS8 + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM + {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, + GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // D32_FLOAT_S8_UINT }}; const FormatTuple& GetFormatTuple(PixelFormat pixel_format) { @@ -177,10 +190,10 @@ GLint GetSwizzleSource(SwizzleSource source) { GLenum GetComponent(PixelFormat format, bool is_first) { switch (format) { - case PixelFormat::Z24S8: - case PixelFormat::Z32FS8: + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; - case PixelFormat::S8Z24: + case PixelFormat::S8_UINT_D24_UNORM: return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; default: UNREACHABLE(); @@ -237,6 +250,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte return texture; } +constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source, + SwizzleSource w_source) { + return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | + (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +} + } // Anonymous namespace CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params, @@ -256,9 +275,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param target = GetTextureTarget(params.target); texture = CreateTexture(params, target, internal_format, texture_buffer); DecorateSurfaceName(); - main_view = CreateViewInner( - ViewParams(params.target, 0, params.is_layered ? 
params.depth : 1, 0, params.num_levels), - true); + + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + + main_view = + CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true); } CachedSurface::~CachedSurface() = default; @@ -379,8 +403,8 @@ void CachedSurface::DecorateSurfaceName() { LabelGLObject(GL_TEXTURE, texture.handle, GetGpuAddr(), params.TargetName()); } -void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) { - LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix); +void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix) { + LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix); } View CachedSurface::CreateView(const ViewParams& view_key) { @@ -396,32 +420,33 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr } CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, - const bool is_proxy) - : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} { - target = GetTextureTarget(params.target); - format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format; + bool is_proxy) + : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format}, + target{GetTextureTarget(params.target)}, is_proxy{is_proxy} { if (!is_proxy) { - texture_view = CreateTextureView(); + main_view = CreateTextureView(); } - swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A); } CachedSurfaceView::~CachedSurfaceView() = default; -void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { +void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const { ASSERT(params.num_levels == 1); + if (params.target == SurfaceTarget::Texture3D) { + if (params.num_layers > 1) { + ASSERT(params.base_layer == 0); + glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); + } else { + glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, + params.base_level, params.base_layer); + } + return; + } + if (params.num_layers > 1) { - // Layered framebuffer attachments UNIMPLEMENTED_IF(params.base_layer != 0); - - switch (params.target) { - case SurfaceTarget::Texture2DArray: - glFramebufferTexture(target, attachment, GetTexture(), 0); - break; - default: - UNIMPLEMENTED(); - } + glFramebufferTexture(fb_target, attachment, GetTexture(), 0); return; } @@ -429,16 +454,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { const GLuint texture = surface.GetTexture(); switch (surface.GetSurfaceParams().target) { case SurfaceTarget::Texture1D: - glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture2D: - glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); + glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture1DArray: case SurfaceTarget::Texture2DArray: case SurfaceTarget::TextureCubemap: case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(target, attachment, texture, params.base_level, + glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level, params.base_layer); break; default: @@ -446,44 +471,73 @@ void 
CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { } } -void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source, +GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source, SwizzleSource w_source) { - u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (new_swizzle == swizzle) - return; - swizzle = new_swizzle; - const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source), - GetSwizzleSource(z_source), GetSwizzleSource(w_source)}; - const GLuint handle = GetTexture(); - const PixelFormat format = surface.GetSurfaceParams().pixel_format; - switch (format) { - case PixelFormat::Z24S8: - case PixelFormat::Z32FS8: - case PixelFormat::S8Z24: - glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, + if (GetSurfaceParams().IsBuffer()) { + return GetTexture(); + } + const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (current_swizzle == new_swizzle) { + return current_view; + } + current_swizzle = new_swizzle; + + const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); + OGLTextureView& view = entry->second; + if (!is_cache_miss) { + current_view = view.handle; + return view.handle; + } + view = CreateTextureView(); + current_view = view.handle; + + std::array swizzle{x_source, y_source, z_source, w_source}; + + switch (const PixelFormat format = GetSurfaceParams().pixel_format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + case PixelFormat::S8_UINT_D24_UNORM: + UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); + glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE, GetComponent(format, x_source == SwizzleSource::R)); - break; - default: - glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); + + // Make sure we sample the first component + std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) { + return value == SwizzleSource::G ? 
SwizzleSource::R : value; + }); + [[fallthrough]]; + default: { + const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]), + GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])}; + glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); break; } + } + return view.handle; } OGLTextureView CachedSurfaceView::CreateTextureView() const { OGLTextureView texture_view; texture_view.Create(); - glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level, - params.num_levels, params.base_layer, params.num_layers); + if (target == GL_TEXTURE_3D) { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, 0, 1); + } else { + glTextureView(texture_view.handle, target, surface.texture.handle, format, + params.base_level, params.num_levels, params.base_layer, params.num_layers); + } ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); return texture_view; } -TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system, - VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker) - : TextureCacheBase{system, rasterizer, device.HasASTC()}, state_tracker{state_tracker} { +TextureCacheOpenGL::TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const Device& device, + StateTracker& state_tracker_) + : TextureCacheBase{rasterizer, maxwell3d, gpu_memory, device.HasASTC()}, state_tracker{ + state_tracker_} { src_framebuffer.Create(); dst_framebuffer.Create(); } @@ -517,8 +571,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const Tegra::Engines::Fermi2D::Config& copy_config) { const auto& src_params{src_view->GetSurfaceParams()}; const auto& dst_params{dst_view->GetSurfaceParams()}; - UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D); - UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D); + UNIMPLEMENTED_IF(src_params.depth != 1); + UNIMPLEMENTED_IF(dst_params.depth != 1); state_tracker.NotifyScissor0(); state_tracker.NotifyFramebuffer(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 02d9981a1..7787134fc 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -80,15 +80,17 @@ public: explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy); ~CachedSurfaceView(); - /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER - void Attach(GLenum attachment, GLenum target) const; + /// @brief Attaches this texture view to the currently bound fb_target framebuffer + /// @param attachment Attachment to bind textures to + /// @param fb_target Framebuffer target to attach to (e.g. 
DRAW_FRAMEBUFFER) + void Attach(GLenum attachment, GLenum fb_target) const; - void ApplySwizzle(Tegra::Texture::SwizzleSource x_source, + GLuint GetTexture(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source); - void DecorateViewName(GPUVAddr gpu_addr, std::string prefix); + void DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix); void MarkAsModified(u64 tick) { surface.MarkAsModified(true, tick); @@ -98,7 +100,7 @@ public: if (is_proxy) { return surface.GetTexture(); } - return texture_view.handle; + return main_view.handle; } GLenum GetFormat() const { @@ -110,29 +112,27 @@ public: } private: - u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source) const { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); - } - OGLTextureView CreateTextureView() const; CachedSurface& surface; - GLenum target{}; - GLenum format{}; + const GLenum format; + const GLenum target; + const bool is_proxy; + + std::unordered_map<u32, OGLTextureView> view_cache; + OGLTextureView main_view; - OGLTextureView texture_view; - u32 swizzle{}; - bool is_proxy{}; + // Use an invalid default so it always fails the comparison test + u32 current_swizzle = 0xffffffff; + GLuint current_view = 0; }; class TextureCacheOpenGL final : public TextureCacheBase { public: - explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const Device& device, StateTracker& state_tracker); + explicit TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const Device& device, + StateTracker& state_tracker); ~TextureCacheOpenGL(); protected: diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 89f0e04ef..a8be2aa37 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -24,10 +24,11 @@ namespace MaxwellToGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -inline GLenum VertexType(Maxwell::VertexAttribute attrib) { +inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) { switch (attrib.type) { - case Maxwell::VertexAttribute::Type::UnsignedInt: case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::UnsignedInt: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -47,11 +48,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } - case Maxwell::VertexAttribute::Type::SignedInt: + break; case Maxwell::VertexAttribute::Type::SignedNorm: + case Maxwell::VertexAttribute::Type::SignedScaled: + case Maxwell::VertexAttribute::Type::SignedInt: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -71,9 +73,9 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return 
GL_INT_2_10_10_10_REV; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } + break; case Maxwell::VertexAttribute::Type::Float: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_16: @@ -87,45 +89,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; - } - case Maxwell::VertexAttribute::Type::UnsignedScaled: - switch (attrib.size) { - case Maxwell::VertexAttribute::Size::Size_8: - case Maxwell::VertexAttribute::Size::Size_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return GL_UNSIGNED_BYTE; - case Maxwell::VertexAttribute::Size::Size_16: - case Maxwell::VertexAttribute::Size::Size_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return GL_UNSIGNED_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; - } - case Maxwell::VertexAttribute::Type::SignedScaled: - switch (attrib.size) { - case Maxwell::VertexAttribute::Size::Size_8: - case Maxwell::VertexAttribute::Size::Size_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8: - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return GL_BYTE; - case Maxwell::VertexAttribute::Size::Size_16: - case Maxwell::VertexAttribute::Size::Size_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16: - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return GL_SHORT; - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - return {}; + break; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - return {}; + break; } + UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(), + attrib.SizeString()); + return {}; } inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { @@ -137,8 +107,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { case Maxwell::IndexFormat::UnsignedInt: return GL_UNSIGNED_INT; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format)); - UNREACHABLE(); + UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format)); return {}; } @@ -180,31 +149,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { } inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, - Tegra::Texture::TextureMipmapFilter mip_filter_mode) { + Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) { switch (filter_mode) { - case Tegra::Texture::TextureFilter::Linear: { - switch (mip_filter_mode) { + case Tegra::Texture::TextureFilter::Nearest: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: - return GL_LINEAR; + return GL_NEAREST; case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_LINEAR_MIPMAP_NEAREST; + return GL_NEAREST_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_LINEAR_MIPMAP_LINEAR; + return GL_NEAREST_MIPMAP_LINEAR; } - } - case Tegra::Texture::TextureFilter::Nearest: { - switch (mip_filter_mode) { + break; + case Tegra::Texture::TextureFilter::Linear: + switch (mipmap_filter_mode) { case Tegra::Texture::TextureMipmapFilter::None: - return GL_NEAREST; + return GL_LINEAR; case 
Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_NEAREST_MIPMAP_NEAREST; + return GL_LINEAR_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_NEAREST_MIPMAP_LINEAR; + return GL_LINEAR_MIPMAP_LINEAR; } + break; } - } - LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode)); - return GL_LINEAR; + UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}", + static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode)); + return GL_NEAREST; } inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { @@ -227,10 +197,15 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { } else { return GL_MIRROR_CLAMP_TO_EDGE; } - default: - LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); - return GL_REPEAT; + case Tegra::Texture::WrapMode::MirrorOnceClampOGL: + if (GL_EXT_texture_mirror_clamp) { + return GL_MIRROR_CLAMP_EXT; + } else { + return GL_MIRROR_CLAMP_TO_EDGE; + } } + UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); + return GL_REPEAT; } inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { @@ -252,8 +227,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { case Tegra::Texture::DepthCompareFunc::Always: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}", - static_cast<u32>(func)); + UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func)); return GL_GREATER; } @@ -275,7 +249,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) { case Maxwell::Blend::Equation::MaxGL: return GL_MAX; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation)); + UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation)); return GL_FUNC_ADD; } @@ -339,7 +313,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return GL_ONE_MINUS_CONSTANT_ALPHA; } - LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor)); + UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor)); return GL_ZERO; } @@ -359,7 +333,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) { case Tegra::Texture::SwizzleSource::OneFloat: return GL_ONE; } - LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source)); + UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source)); return GL_ZERO; } @@ -390,7 +364,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return GL_ALWAYS; } - LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison)); + UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison)); return GL_ALWAYS; } @@ -421,7 +395,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) { case Maxwell::StencilOp::DecrWrapOGL: return GL_DECR_WRAP; } - LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil)); + UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil)); return GL_KEEP; } @@ -432,7 +406,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) { case Maxwell::FrontFace::CounterClockWise: return GL_CCW; } - LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face)); + UNIMPLEMENTED_MSG("Unimplemented front 
face cull={}", static_cast<u32>(front_face)); return GL_CCW; } @@ -445,7 +419,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) { case Maxwell::CullFace::FrontAndBack: return GL_FRONT_AND_BACK; } - LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face)); + UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); return GL_BACK; } @@ -484,7 +458,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) { case Maxwell::LogicOperation::Set: return GL_SET; } - LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation)); return GL_COPY; } @@ -501,5 +475,10 @@ inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) { return GL_FILL; } +inline GLenum ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) { + // Enumeration order matches register order. We can convert it arithmetically. + return GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV + static_cast<GLenum>(swizzle); +} + } // namespace MaxwellToGL } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index b2a179746..2ccca1993 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -21,6 +21,8 @@ #include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" +#include "video_core/host_shaders/opengl_present_frag.h" +#include "video_core/host_shaders/opengl_present_vert.h" #include "video_core/morton.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" @@ -30,60 +32,6 @@ namespace OpenGL { namespace { -constexpr std::size_t SWAP_CHAIN_SIZE = 3; - -struct Frame { - u32 width{}; /// Width of the frame (to detect resize) - u32 height{}; /// Height of the frame - bool color_reloaded{}; /// Texture attachment was recreated (ie: resized) - OpenGL::OGLRenderbuffer color{}; /// Buffer shared between the render/present FBO - OpenGL::OGLFramebuffer render{}; /// FBO created on the render thread - OpenGL::OGLFramebuffer present{}; /// FBO created on the present thread - GLsync render_fence{}; /// Fence created on the render thread - GLsync present_fence{}; /// Fence created on the presentation thread - bool is_srgb{}; /// Framebuffer is sRGB or RGB -}; - -constexpr char VERTEX_SHADER[] = R"( -#version 430 core - -out gl_PerVertex { - vec4 gl_Position; -}; - -layout (location = 0) in vec2 vert_position; -layout (location = 1) in vec2 vert_tex_coord; -layout (location = 0) out vec2 frag_tex_coord; - -// This is a truncated 3x3 matrix for 2D transformations: -// The upper-left 2x2 submatrix performs scaling/rotation/mirroring. -// The third column performs translation. -// The third row could be used for projection, which we don't need in 2D. It hence is assumed to -// implicitly be [0, 0, 1] -layout (location = 0) uniform mat3x2 modelview_matrix; - -void main() { - // Multiply input position by the rotscale part of the matrix and then manually translate by - // the last column. 
This is equivalent to using a full 3x3 matrix and expanding the vector - // to `vec3(vert_position.xy, 1.0)` - gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0); - frag_tex_coord = vert_tex_coord; -} -)"; - -constexpr char FRAGMENT_SHADER[] = R"( -#version 430 core - -layout (location = 0) in vec2 frag_tex_coord; -layout (location = 0) out vec4 color; - -layout (binding = 0) uniform sampler2D color_texture; - -void main() { - color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f); -} -)"; - constexpr GLint PositionLocation = 0; constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; @@ -96,24 +44,6 @@ struct ScreenRectVertex { std::array<GLfloat, 2> tex_coord; }; -/// Returns true if any debug tool is attached -bool HasDebugTool() { - const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - if (nsight) { - return true; - } - - GLint num_extensions; - glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); - for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { - const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); - if (!std::strcmp(name, "GL_EXT_debug_tool")) { - return true; - } - } - return false; -} - /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left * corner and (width, height) on the lower-bottom. @@ -197,132 +127,15 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } // Anonymous namespace -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: - std::mutex swap_chain_lock; - std::condition_variable present_cv; - std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; - std::queue<Frame*> free_queue; - std::deque<Frame*> present_queue; - Frame* previous_frame{}; - - FrameMailbox() { - for (auto& frame : swap_chain) { - free_queue.push(&frame); - } - } - - ~FrameMailbox() { - // lock the mutex and clear out the present and free_queues and notify any people who are - // blocked to prevent deadlock on shutdown - std::scoped_lock lock{swap_chain_lock}; - std::queue<Frame*>().swap(free_queue); - present_queue.clear(); - present_cv.notify_all(); - } - - void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { - frame->present.Release(); - frame->present.Create(); - GLint previous_draw_fbo{}; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); - frame->color_reloaded = false; - } - - void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { - // Recreate the color texture attachment - frame->color.Release(); - frame->color.Create(); - const GLenum internal_format = frame->is_srgb ? 
GL_SRGB8 : GL_RGB8; - glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); - - // Recreate the FBO for the render target - frame->render.Release(); - frame->render.Create(); - glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); - } - - frame->width = width; - frame->height = height; - frame->color_reloaded = true; - } - - Frame* GetRenderFrame() { - std::unique_lock lock{swap_chain_lock}; - - // If theres no free frames, we will reuse the oldest render frame - if (free_queue.empty()) { - auto frame = present_queue.back(); - present_queue.pop_back(); - return frame; - } - - Frame* frame = free_queue.front(); - free_queue.pop(); - return frame; - } - - void ReleaseRenderFrame(Frame* frame) { - std::unique_lock lock{swap_chain_lock}; - present_queue.push_front(frame); - present_cv.notify_one(); - } - - Frame* TryGetPresentFrame(int timeout_ms) { - std::unique_lock lock{swap_chain_lock}; - // wait for new entries in the present_queue - present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), - [&] { return !present_queue.empty(); }); - if (present_queue.empty()) { - // timed out waiting for a frame to draw so return the previous frame - return previous_frame; - } - - // free the previous frame and add it back to the free queue - if (previous_frame) { - free_queue.push(previous_frame); - } - - // the newest entries are pushed to the front of the queue - Frame* frame = present_queue.front(); - present_queue.pop_front(); - // remove all old entries from the present queue and move them back to the free_queue - for (auto f : present_queue) { - free_queue.push(f); - } - present_queue.clear(); - previous_frame = frame; - return frame; - } -}; - -RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, - Core::Frontend::GraphicsContext& context) - : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context}, - has_debug_tool{HasDebugTool()} {} +RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window_, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context) + : RendererBase{emu_window_, std::move(context)}, telemetry_session{telemetry_session_}, + emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {} RendererOpenGL::~RendererOpenGL() = default; -MICROPROFILE_DEFINE(OpenGL_RenderFrame, "OpenGL", "Render Frame", MP_RGB(128, 128, 64)); -MICROPROFILE_DEFINE(OpenGL_WaitPresent, "OpenGL", "Wait For Present", MP_RGB(128, 128, 128)); - void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { if (!framebuffer) { return; @@ -331,79 +144,34 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PrepareRendertarget(framebuffer); RenderScreenshot(); - Frame* frame; - { - MICROPROFILE_SCOPE(OpenGL_WaitPresent); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + DrawScreen(emu_window.GetFramebufferLayout()); - frame = frame_mailbox->GetRenderFrame(); + ++m_current_frame; - // Clean up sync objects before drawing - - // INTEL driver workaround. 
We can't delete the previous render sync object until we are - // sure that the presentation is done - if (frame->present_fence) { - glClientWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - } - - // delete the draw fence if the frame wasn't presented - if (frame->render_fence) { - glDeleteSync(frame->render_fence); - frame->render_fence = 0; - } - - // wait for the presentation to be done - if (frame->present_fence) { - glWaitSync(frame->present_fence, 0, GL_TIMEOUT_IGNORED); - glDeleteSync(frame->present_fence); - frame->present_fence = 0; - } - } - - { - MICROPROFILE_SCOPE(OpenGL_RenderFrame); - const auto& layout = render_window.GetFramebufferLayout(); - - // Recreate the frame if the size of the window has changed - if (layout.width != frame->width || layout.height != frame->height || - screen_info.display_srgb != frame->is_srgb) { - LOG_DEBUG(Render_OpenGL, "Reloading render frame"); - frame->is_srgb = screen_info.display_srgb; - frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle); - DrawScreen(layout); - // Create a fence for the frontend to wait on and swap this frame to OffTex - frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - frame_mailbox->ReleaseRenderFrame(frame); - m_current_frame++; - rasterizer->TickFrame(); - } + rasterizer->TickFrame(); render_window.PollEvents(); - if (has_debug_tool) { - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); - Present(0); - context.SwapBuffers(); - } + context->SwapBuffers(); } void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) { - if (framebuffer) { - // If framebuffer is provided, reload it from memory to a texture - if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || - screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || - screen_info.texture.pixel_format != framebuffer->pixel_format || - gl_framebuffer_data.empty()) { - // Reallocate texture if the framebuffer size has changed. - // This is expected to not happen very often and hence should not be a - // performance problem. - ConfigureFramebufferTexture(screen_info.texture, *framebuffer); - } - - // Load the framebuffer from memory, draw it to the screen, and swap buffers - LoadFBToScreenInfo(*framebuffer); + if (!framebuffer) { + return; + } + // If framebuffer is provided, reload it from memory to a texture + if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) || + screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) || + screen_info.texture.pixel_format != framebuffer->pixel_format || + gl_framebuffer_data.empty()) { + // Reallocate texture if the framebuffer size has changed. + // This is expected to not happen very often and hence should not be a + // performance problem. 
+ ConfigureFramebufferTexture(screen_info.texture, *framebuffer); } + + // Load the framebuffer from memory, draw it to the screen, and swap buffers + LoadFBToScreenInfo(*framebuffer); } void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) { @@ -423,7 +191,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)}; const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)}; const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel}; - u8* const host_ptr{system.Memory().GetPointer(framebuffer_addr)}; + u8* const host_ptr{cpu_memory.GetPointer(framebuffer_addr)}; rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes); // TODO(Rodrigo): Read this from HLE @@ -453,23 +221,22 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color } void RendererOpenGL::InitOpenGLObjects() { - frame_mailbox = std::make_unique<FrameMailbox>(); - - glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, - 0.0f); + glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(), + Settings::values.bg_blue.GetValue(), 0.0f); // Create shader programs OGLShader vertex_shader; - vertex_shader.Create(VERTEX_SHADER, GL_VERTEX_SHADER); + vertex_shader.Create(HostShaders::OPENGL_PRESENT_VERT, GL_VERTEX_SHADER); OGLShader fragment_shader; - fragment_shader.Create(FRAGMENT_SHADER, GL_FRAGMENT_SHADER); + fragment_shader.Create(HostShaders::OPENGL_PRESENT_FRAG, GL_FRAGMENT_SHADER); vertex_program.Create(true, false, vertex_shader.handle); fragment_program.Create(true, false, fragment_shader.handle); - // Create program pipeline - program_manager.Create(); + pipeline.Create(); + glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle); + glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle); // Generate VBO handle for drawing vertex_buffer.Create(); @@ -487,6 +254,15 @@ void RendererOpenGL::InitOpenGLObjects() { // Clear screen to black LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture); + + // Enable unified vertex attributes and query vertex buffer address when the driver supports it + if (device.HasVertexBufferUnifiedMemory()) { + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + + glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); + glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, + &vertex_buffer_address); + } } void RendererOpenGL::AddTelemetryFields() { @@ -498,18 +274,18 @@ void RendererOpenGL::AddTelemetryFields() { LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor); LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model); - auto& telemetry_session = system.TelemetrySession(); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Vendor", gpu_vendor); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Model", gpu_model); - telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_OpenGL_Version", gl_version); + constexpr auto user_system = Common::Telemetry::FieldType::UserSystem; + telemetry_session.AddField(user_system, "GPU_Vendor", gpu_vendor); + telemetry_session.AddField(user_system, "GPU_Model", gpu_model); + telemetry_session.AddField(user_system, "GPU_OpenGL_Version", gl_version); } void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - rasterizer = 
std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info, - program_manager, state_tracker); + rasterizer = std::make_unique<RasterizerOpenGL>(emu_window, gpu, cpu_memory, device, + screen_info, program_manager, state_tracker); } void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, @@ -525,12 +301,12 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, GLint internal_format; switch (framebuffer.pixel_format) { - case Tegra::FramebufferConfig::PixelFormat::ABGR8: + case Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM: internal_format = GL_RGBA8; texture.gl_format = GL_RGBA; texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; break; - case Tegra::FramebufferConfig::PixelFormat::RGB565: + case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM: internal_format = GL_RGB565; texture.gl_format = GL_RGB; texture.gl_type = GL_UNSIGNED_SHORT_5_6_5; @@ -551,8 +327,8 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { if (renderer_settings.set_background_color) { // Update background color before drawing - glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue, - 0.0f); + glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(), + Settings::values.bg_blue.GetValue(), 0.0f); } // Set projection matrix @@ -620,10 +396,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { state_tracker.NotifyClipControl(); state_tracker.NotifyAlphaTest(); - program_manager.UseVertexShader(vertex_program.handle); - program_manager.UseGeometryShader(0); - program_manager.UseFragmentShader(fragment_program.handle); - program_manager.BindGraphicsPipeline(); + program_manager.BindHostPipeline(pipeline.handle); glEnable(GL_CULL_FACE); if (screen_info.display_srgb) { @@ -658,58 +431,21 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { offsetof(ScreenRectVertex, tex_coord)); glVertexAttribBinding(PositionLocation, 0); glVertexAttribBinding(TexCoordLocation, 0); - glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + if (device.HasVertexBufferUnifiedMemory()) { + glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address, + sizeof(vertices)); + } else { + glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex)); + } glBindTextureUnit(0, screen_info.display_texture); glBindSampler(0, 0); glClear(GL_COLOR_BUFFER_BIT); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); -} -bool RendererOpenGL::TryPresent(int timeout_ms) { - if (has_debug_tool) { - LOG_DEBUG(Render_OpenGL, - "Skipping presentation because we are presenting on the main context"); - return false; - } - return Present(timeout_ms); -} - -bool RendererOpenGL::Present(int timeout_ms) { - const auto& layout = render_window.GetFramebufferLayout(); - auto frame = frame_mailbox->TryGetPresentFrame(timeout_ms); - if (!frame) { - LOG_DEBUG(Render_OpenGL, "TryGetPresentFrame returned no frame to present"); - return false; - } - - // Clearing before a full overwrite of a fbo can signal to drivers that they can avoid a - // readback since we won't be doing any blending - glClear(GL_COLOR_BUFFER_BIT); - - // Recreate the presentation FBO if the color attachment was changed - if (frame->color_reloaded) { - LOG_DEBUG(Render_OpenGL, "Reloading present frame"); - frame_mailbox->ReloadPresentFrame(frame, layout.width, 
layout.height); - } - glWaitSync(frame->render_fence, 0, GL_TIMEOUT_IGNORED); - // INTEL workaround. - // Normally we could just delete the draw fence here, but due to driver bugs, we can just delete - // it on the emulation thread without too much penalty - // glDeleteSync(frame.render_sync); - // frame.render_sync = 0; - - glBindFramebuffer(GL_READ_FRAMEBUFFER, frame->present.handle); - glBlitFramebuffer(0, 0, frame->width, frame->height, 0, 0, layout.width, layout.height, - GL_COLOR_BUFFER_BIT, GL_LINEAR); - - // Insert fence for the main thread to block on - frame->present_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - - glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); - return true; + program_manager.RestoreGuestPipeline(); } void RendererOpenGL::RenderScreenshot() { @@ -726,7 +462,7 @@ void RendererOpenGL::RenderScreenshot() { screenshot_framebuffer.Create(); glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle); - Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; + const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; GLuint renderbuffer; glGenRenderbuffers(1, &renderbuffer); @@ -751,8 +487,9 @@ void RendererOpenGL::RenderScreenshot() { } bool RendererOpenGL::Init() { - if (GLAD_GL_KHR_debug) { + if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { glEnable(GL_DEBUG_OUTPUT); + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); glDebugMessageCallback(DebugHandler, nullptr); } diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 50b647661..9ef181f95 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -9,22 +9,32 @@ #include "common/common_types.h" #include "common/math_util.h" #include "video_core/renderer_base.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" namespace Core { class System; -} +class TelemetrySession; +} // namespace Core namespace Core::Frontend { class EmuWindow; } +namespace Core::Memory { +class Memory; +} + namespace Layout { struct FramebufferLayout; } +namespace Tegra { +class GPU; +} + namespace OpenGL { /// Structure used for storing information about the textures for the Switch screen @@ -45,24 +55,17 @@ struct ScreenInfo { TextureInfo texture; }; -struct PresentationTexture { - u32 width = 0; - u32 height = 0; - OGLTexture texture; -}; - -class FrameMailbox; - class RendererOpenGL final : public VideoCore::RendererBase { public: - explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, - Core::Frontend::GraphicsContext& context); + explicit RendererOpenGL(Core::TelemetrySession& telemetry_session, + Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory, + Tegra::GPU& gpu, + std::unique_ptr<Core::Frontend::GraphicsContext> context); ~RendererOpenGL() override; bool Init() override; void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; private: /// Initializes the OpenGL state and creates persistent objects. 
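Editor's note (not part of the commit): the renderer changes above drop the FrameMailbox and, when the driver exposes GL_NV_vertex_buffer_unified_memory together with the NV_shader_buffer_load residency calls, source the present quad's vertices from a raw GPU address (see the glMakeNamedBufferResidentNV/glBufferAddressRangeNV calls in InitOpenGLObjects and DrawScreen above, and the vertex_buffer_address member added in the header below). The following is a minimal sketch of that pattern under those assumptions; the glad include, the ScreenQuadVertex struct and the function name are illustrative, not taken from the commit.

    #include <glad/glad.h>

    // Illustrative vertex layout standing in for the renderer's ScreenRectVertex.
    struct ScreenQuadVertex {
        float position[2];
        float tex_coord[2];
    };

    // Pins `buffer` in GPU memory, queries its 64-bit GPU address, and sources vertex
    // attribute binding 0 from that address instead of from the buffer object.
    // Assumes a current context where GL_NV_vertex_buffer_unified_memory is available.
    void BindPresentVertexBufferByAddress(GLuint buffer, GLsizeiptr size_in_bytes) {
        GLuint64EXT address = 0;
        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
        glMakeNamedBufferResidentNV(buffer, GL_READ_ONLY);
        glGetNamedBufferParameterui64vNV(buffer, GL_BUFFER_GPU_ADDRESS_NV, &address);

        // The binding point still supplies the stride; the buffer handle is left as 0.
        glBindVertexBuffer(0, 0, 0, sizeof(ScreenQuadVertex));
        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, address, size_in_bytes);
    }

As the DrawScreen hunk above shows, drivers without the extension keep using the plain glBindVertexBuffer path.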
@@ -90,37 +93,36 @@ private: void PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer); - bool Present(int timeout_ms); - + Core::TelemetrySession& telemetry_session; Core::Frontend::EmuWindow& emu_window; - Core::System& system; - Core::Frontend::GraphicsContext& context; + Core::Memory::Memory& cpu_memory; + Tegra::GPU& gpu; - StateTracker state_tracker{system}; + const Device device; + StateTracker state_tracker{gpu}; // OpenGL object IDs OGLBuffer vertex_buffer; OGLProgram vertex_program; OGLProgram fragment_program; + OGLPipeline pipeline; OGLFramebuffer screenshot_framebuffer; + // GPU address of the vertex buffer + GLuint64EXT vertex_buffer_address = 0; + /// Display information for Switch screen ScreenInfo screen_info; /// Global dummy shader pipeline - GLShader::ProgramManager program_manager; + ProgramManager program_manager; /// OpenGL framebuffer data std::vector<u8> gl_framebuffer_data; /// Used for transforming the framebuffer orientation - Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags; + Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags{}; Common::Rectangle<int> framebuffer_crop_rect; - - /// Frame presentation mailbox - std::unique_ptr<FrameMailbox> frame_mailbox; - - bool has_debug_tool = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index b751086fa..6d7bb16b2 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -14,68 +14,6 @@ namespace OpenGL { -struct VertexArrayPushBuffer::Entry { - GLuint binding_index{}; - const GLuint* buffer{}; - GLintptr offset{}; - GLsizei stride{}; -}; - -VertexArrayPushBuffer::VertexArrayPushBuffer(StateTracker& state_tracker) - : state_tracker{state_tracker} {} - -VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; - -void VertexArrayPushBuffer::Setup() { - index_buffer = nullptr; - vertex_buffers.clear(); -} - -void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) { - index_buffer = buffer; -} - -void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer, - GLintptr offset, GLsizei stride) { - vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride}); -} - -void VertexArrayPushBuffer::Bind() { - if (index_buffer) { - state_tracker.BindIndexBuffer(*index_buffer); - } - - for (const auto& entry : vertex_buffers) { - glBindVertexBuffer(entry.binding_index, *entry.buffer, entry.offset, entry.stride); - } -} - -struct BindBuffersRangePushBuffer::Entry { - GLuint binding; - const GLuint* buffer; - GLintptr offset; - GLsizeiptr size; -}; - -BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} - -BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; - -void BindBuffersRangePushBuffer::Setup() { - entries.clear(); -} - -void BindBuffersRangePushBuffer::Push(GLuint binding, const GLuint* buffer, GLintptr offset, - GLsizeiptr size) { - entries.push_back(Entry{binding, buffer, offset, size}); -} - -void BindBuffersRangePushBuffer::Bind() { - for (const Entry& entry : entries) { - glBindBufferRange(target, entry.binding, *entry.buffer, entry.offset, entry.size); - } -} - void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info) { if (!GLAD_GL_KHR_debug) { // We don't need to throw an error as this is just for debugging diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index 
47ee3177b..9c09ee12c 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -11,49 +11,6 @@ namespace OpenGL { -class StateTracker; - -class VertexArrayPushBuffer final { -public: - explicit VertexArrayPushBuffer(StateTracker& state_tracker); - ~VertexArrayPushBuffer(); - - void Setup(); - - void SetIndexBuffer(const GLuint* buffer); - - void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset, - GLsizei stride); - - void Bind(); - -private: - struct Entry; - - StateTracker& state_tracker; - - const GLuint* index_buffer{}; - std::vector<Entry> vertex_buffers; -}; - -class BindBuffersRangePushBuffer final { -public: - explicit BindBuffersRangePushBuffer(GLenum target); - ~BindBuffersRangePushBuffer(); - - void Setup(); - - void Push(GLuint binding, const GLuint* buffer, GLintptr offset, GLsizeiptr size); - - void Bind(); - -private: - struct Entry; - - GLenum target; - std::vector<Entry> entries; -}; - void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info = {}); } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 2bb376555..da5c550ea 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -2,10 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <algorithm> +#include <cstring> #include <tuple> #include <boost/functional/hash.hpp> +#include "common/cityhash.h" #include "common/common_types.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" @@ -13,289 +16,375 @@ namespace Vulkan { namespace { -constexpr FixedPipelineState::DepthStencil GetDepthStencilState(const Maxwell& regs) { - const FixedPipelineState::StencilFace front_stencil( - regs.stencil_front_op_fail, regs.stencil_front_op_zfail, regs.stencil_front_op_zpass, - regs.stencil_front_func_func); - const FixedPipelineState::StencilFace back_stencil = - regs.stencil_two_side_enable - ? FixedPipelineState::StencilFace(regs.stencil_back_op_fail, regs.stencil_back_op_zfail, - regs.stencil_back_op_zpass, - regs.stencil_back_func_func) - : front_stencil; - return FixedPipelineState::DepthStencil( - regs.depth_test_enable == 1, regs.depth_write_enabled == 1, regs.depth_bounds_enable == 1, - regs.stencil_enable == 1, regs.depth_test_func, front_stencil, back_stencil); -} +constexpr std::size_t POINT = 0; +constexpr std::size_t LINE = 1; +constexpr std::size_t POLYGON = 2; +constexpr std::array POLYGON_OFFSET_ENABLE_LUT = { + POINT, // Points + LINE, // Lines + LINE, // LineLoop + LINE, // LineStrip + POLYGON, // Triangles + POLYGON, // TriangleStrip + POLYGON, // TriangleFan + POLYGON, // Quads + POLYGON, // QuadStrip + POLYGON, // Polygon + LINE, // LinesAdjacency + LINE, // LineStripAdjacency + POLYGON, // TrianglesAdjacency + POLYGON, // TriangleStripAdjacency + POLYGON, // Patches +}; -constexpr FixedPipelineState::InputAssembly GetInputAssemblyState(const Maxwell& regs) { - return FixedPipelineState::InputAssembly( - regs.draw.topology, regs.primitive_restart.enabled, - regs.draw.topology == Maxwell::PrimitiveTopology::Points ? regs.point_size : 0.0f); -} +} // Anonymous namespace -constexpr FixedPipelineState::BlendingAttachment GetBlendingAttachmentState( - const Maxwell& regs, std::size_t render_target) { - const auto& mask = regs.color_mask[regs.color_mask_common ? 
0 : render_target]; - const std::array components = {mask.R != 0, mask.G != 0, mask.B != 0, mask.A != 0}; - - const FixedPipelineState::BlendingAttachment default_blending( - false, Maxwell::Blend::Equation::Add, Maxwell::Blend::Factor::One, - Maxwell::Blend::Factor::Zero, Maxwell::Blend::Equation::Add, Maxwell::Blend::Factor::One, - Maxwell::Blend::Factor::Zero, components); - if (render_target >= regs.rt_control.count) { - return default_blending; +void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_state) { + const std::array enabled_lut = {regs.polygon_offset_point_enable, + regs.polygon_offset_line_enable, + regs.polygon_offset_fill_enable}; + const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); + + raw = 0; + primitive_restart_enable.Assign(regs.primitive_restart.enabled != 0 ? 1 : 0); + depth_bias_enable.Assign(enabled_lut[POLYGON_OFFSET_ENABLE_LUT[topology_index]] != 0 ? 1 : 0); + depth_clamp_disabled.Assign(regs.view_volume_clip_control.depth_clamp_disabled.Value()); + ndc_minus_one_to_one.Assign(regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1 : 0); + polygon_mode.Assign(PackPolygonMode(regs.polygon_mode_front)); + patch_control_points_minus_one.Assign(regs.patch_vertices - 1); + tessellation_primitive.Assign(static_cast<u32>(regs.tess_mode.prim.Value())); + tessellation_spacing.Assign(static_cast<u32>(regs.tess_mode.spacing.Value())); + tessellation_clockwise.Assign(regs.tess_mode.cw.Value()); + logic_op_enable.Assign(regs.logic_op.enable != 0 ? 1 : 0); + logic_op.Assign(PackLogicOp(regs.logic_op.operation)); + rasterize_enable.Assign(regs.rasterize_enable != 0 ? 1 : 0); + topology.Assign(regs.draw.topology); + + std::memcpy(&point_size, ®s.point_size, sizeof(point_size)); // TODO: C++20 std::bit_cast + + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + binding_divisors[index] = + regs.instanced_arrays.IsInstancingEnabled(index) ? regs.vertex_array[index].divisor : 0; } - if (!regs.independent_blend_enable) { - const auto& src = regs.blend; - if (!src.enable[render_target]) { - return default_blending; - } - return FixedPipelineState::BlendingAttachment( - true, src.equation_rgb, src.factor_source_rgb, src.factor_dest_rgb, src.equation_a, - src.factor_source_a, src.factor_dest_a, components); + for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) { + const auto& input = regs.vertex_attrib_format[index]; + auto& attribute = attributes[index]; + attribute.raw = 0; + attribute.enabled.Assign(input.IsConstant() ? 
0 : 1); + attribute.buffer.Assign(input.buffer); + attribute.offset.Assign(input.offset); + attribute.type.Assign(static_cast<u32>(input.type.Value())); + attribute.size.Assign(static_cast<u32>(input.size.Value())); } - if (!regs.blend.enable[render_target]) { - return default_blending; + for (std::size_t index = 0; index < std::size(attachments); ++index) { + attachments[index].Fill(regs, index); } - const auto& src = regs.independent_blend[render_target]; - return FixedPipelineState::BlendingAttachment( - true, src.equation_rgb, src.factor_source_rgb, src.factor_dest_rgb, src.equation_a, - src.factor_source_a, src.factor_dest_a, components); -} -constexpr FixedPipelineState::ColorBlending GetColorBlendingState(const Maxwell& regs) { - return FixedPipelineState::ColorBlending( - {regs.blend_color.r, regs.blend_color.g, regs.blend_color.b, regs.blend_color.a}, - regs.rt_control.count, - {GetBlendingAttachmentState(regs, 0), GetBlendingAttachmentState(regs, 1), - GetBlendingAttachmentState(regs, 2), GetBlendingAttachmentState(regs, 3), - GetBlendingAttachmentState(regs, 4), GetBlendingAttachmentState(regs, 5), - GetBlendingAttachmentState(regs, 6), GetBlendingAttachmentState(regs, 7)}); -} + const auto& transform = regs.viewport_transform; + std::transform(transform.begin(), transform.end(), viewport_swizzles.begin(), + [](const auto& viewport) { return static_cast<u16>(viewport.swizzle.raw); }); -constexpr FixedPipelineState::Tessellation GetTessellationState(const Maxwell& regs) { - return FixedPipelineState::Tessellation(regs.patch_vertices, regs.tess_mode.prim, - regs.tess_mode.spacing, regs.tess_mode.cw != 0); + if (!has_extended_dynamic_state) { + no_extended_dynamic_state.Assign(1); + dynamic_state.Fill(regs); + } } -constexpr std::size_t Point = 0; -constexpr std::size_t Line = 1; -constexpr std::size_t Polygon = 2; -constexpr std::array PolygonOffsetEnableLUT = { - Point, // Points - Line, // Lines - Line, // LineLoop - Line, // LineStrip - Polygon, // Triangles - Polygon, // TriangleStrip - Polygon, // TriangleFan - Polygon, // Quads - Polygon, // QuadStrip - Polygon, // Polygon - Line, // LinesAdjacency - Line, // LineStripAdjacency - Polygon, // TrianglesAdjacency - Polygon, // TriangleStripAdjacency - Polygon, // Patches -}; +void FixedPipelineState::BlendingAttachment::Fill(const Maxwell& regs, std::size_t index) { + const auto& mask = regs.color_mask[regs.color_mask_common ? 
0 : index]; -constexpr FixedPipelineState::Rasterizer GetRasterizerState(const Maxwell& regs) { - const std::array enabled_lut = {regs.polygon_offset_point_enable, - regs.polygon_offset_line_enable, - regs.polygon_offset_fill_enable}; - const auto topology = static_cast<std::size_t>(regs.draw.topology.Value()); - const bool depth_bias_enabled = enabled_lut[PolygonOffsetEnableLUT[topology]]; - - const auto& clip = regs.view_volume_clip_control; - const bool depth_clamp_enabled = clip.depth_clamp_near == 1 || clip.depth_clamp_far == 1; - - Maxwell::FrontFace front_face = regs.front_face; - if (regs.screen_y_control.triangle_rast_flip != 0 && - regs.viewport_transform[0].scale_y > 0.0f) { - if (front_face == Maxwell::FrontFace::CounterClockWise) - front_face = Maxwell::FrontFace::ClockWise; - else if (front_face == Maxwell::FrontFace::ClockWise) - front_face = Maxwell::FrontFace::CounterClockWise; - } + raw = 0; + mask_r.Assign(mask.R); + mask_g.Assign(mask.G); + mask_b.Assign(mask.B); + mask_a.Assign(mask.A); - const bool gl_ndc = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne; - return FixedPipelineState::Rasterizer(regs.cull_test_enabled, depth_bias_enabled, - depth_clamp_enabled, gl_ndc, regs.cull_face, front_face); -} + // TODO: C++20 Use templated lambda to deduplicate code -} // Anonymous namespace - -std::size_t FixedPipelineState::VertexBinding::Hash() const noexcept { - return (index << stride) ^ divisor; -} + if (!regs.independent_blend_enable) { + const auto& src = regs.blend; + if (!src.enable[index]) { + return; + } + equation_rgb.Assign(PackBlendEquation(src.equation_rgb)); + equation_a.Assign(PackBlendEquation(src.equation_a)); + factor_source_rgb.Assign(PackBlendFactor(src.factor_source_rgb)); + factor_dest_rgb.Assign(PackBlendFactor(src.factor_dest_rgb)); + factor_source_a.Assign(PackBlendFactor(src.factor_source_a)); + factor_dest_a.Assign(PackBlendFactor(src.factor_dest_a)); + enable.Assign(1); + return; + } -bool FixedPipelineState::VertexBinding::operator==(const VertexBinding& rhs) const noexcept { - return std::tie(index, stride, divisor) == std::tie(rhs.index, rhs.stride, rhs.divisor); + if (!regs.blend.enable[index]) { + return; + } + const auto& src = regs.independent_blend[index]; + equation_rgb.Assign(PackBlendEquation(src.equation_rgb)); + equation_a.Assign(PackBlendEquation(src.equation_a)); + factor_source_rgb.Assign(PackBlendFactor(src.factor_source_rgb)); + factor_dest_rgb.Assign(PackBlendFactor(src.factor_dest_rgb)); + factor_source_a.Assign(PackBlendFactor(src.factor_source_a)); + factor_dest_a.Assign(PackBlendFactor(src.factor_dest_a)); + enable.Assign(1); } -std::size_t FixedPipelineState::VertexAttribute::Hash() const noexcept { - return static_cast<std::size_t>(index) ^ (static_cast<std::size_t>(buffer) << 13) ^ - (static_cast<std::size_t>(type) << 22) ^ (static_cast<std::size_t>(size) << 31) ^ - (static_cast<std::size_t>(offset) << 36); -} +void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { + u32 packed_front_face = PackFrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0) { + // Flip front face + packed_front_face = 1 - packed_front_face; + } -bool FixedPipelineState::VertexAttribute::operator==(const VertexAttribute& rhs) const noexcept { - return std::tie(index, buffer, type, size, offset) == - std::tie(rhs.index, rhs.buffer, rhs.type, rhs.size, rhs.offset); + raw1 = 0; + raw2 = 0; + front.action_stencil_fail.Assign(PackStencilOp(regs.stencil_front_op_fail)); + 
front.action_depth_fail.Assign(PackStencilOp(regs.stencil_front_op_zfail)); + front.action_depth_pass.Assign(PackStencilOp(regs.stencil_front_op_zpass)); + front.test_func.Assign(PackComparisonOp(regs.stencil_front_func_func)); + if (regs.stencil_two_side_enable) { + back.action_stencil_fail.Assign(PackStencilOp(regs.stencil_back_op_fail)); + back.action_depth_fail.Assign(PackStencilOp(regs.stencil_back_op_zfail)); + back.action_depth_pass.Assign(PackStencilOp(regs.stencil_back_op_zpass)); + back.test_func.Assign(PackComparisonOp(regs.stencil_back_func_func)); + } else { + back.action_stencil_fail.Assign(front.action_stencil_fail); + back.action_depth_fail.Assign(front.action_depth_fail); + back.action_depth_pass.Assign(front.action_depth_pass); + back.test_func.Assign(front.test_func); + } + stencil_enable.Assign(regs.stencil_enable); + depth_write_enable.Assign(regs.depth_write_enabled); + depth_bounds_enable.Assign(regs.depth_bounds_enable); + depth_test_enable.Assign(regs.depth_test_enable); + front_face.Assign(packed_front_face); + depth_test_func.Assign(PackComparisonOp(regs.depth_test_func)); + cull_face.Assign(PackCullFace(regs.cull_face)); + cull_enable.Assign(regs.cull_test_enabled != 0 ? 1 : 0); + + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + const auto& input = regs.vertex_array[index]; + VertexBinding& binding = vertex_bindings[index]; + binding.raw = 0; + binding.enabled.Assign(input.IsEnabled() ? 1 : 0); + binding.stride.Assign(static_cast<u16>(input.stride.Value())); + } } -std::size_t FixedPipelineState::StencilFace::Hash() const noexcept { - return static_cast<std::size_t>(action_stencil_fail) ^ - (static_cast<std::size_t>(action_depth_fail) << 4) ^ - (static_cast<std::size_t>(action_depth_fail) << 20) ^ - (static_cast<std::size_t>(action_depth_pass) << 36); +std::size_t FixedPipelineState::Hash() const noexcept { + const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), Size()); + return static_cast<std::size_t>(hash); } -bool FixedPipelineState::StencilFace::operator==(const StencilFace& rhs) const noexcept { - return std::tie(action_stencil_fail, action_depth_fail, action_depth_pass, test_func) == - std::tie(rhs.action_stencil_fail, rhs.action_depth_fail, rhs.action_depth_pass, - rhs.test_func); +bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcept { + return std::memcmp(this, &rhs, Size()) == 0; } -std::size_t FixedPipelineState::BlendingAttachment::Hash() const noexcept { - return static_cast<std::size_t>(enable) ^ (static_cast<std::size_t>(rgb_equation) << 5) ^ - (static_cast<std::size_t>(src_rgb_func) << 10) ^ - (static_cast<std::size_t>(dst_rgb_func) << 15) ^ - (static_cast<std::size_t>(a_equation) << 20) ^ - (static_cast<std::size_t>(src_a_func) << 25) ^ - (static_cast<std::size_t>(dst_a_func) << 30) ^ - (static_cast<std::size_t>(components[0]) << 35) ^ - (static_cast<std::size_t>(components[1]) << 36) ^ - (static_cast<std::size_t>(components[2]) << 37) ^ - (static_cast<std::size_t>(components[3]) << 38); +u32 FixedPipelineState::PackComparisonOp(Maxwell::ComparisonOp op) noexcept { + // OpenGL enums go from 0x200 to 0x207 and the others from 1 to 8 + // If we subtract 0x200 from the OpenGL enums and 1 from the others we get a 0-7 range. + // Perfect for a hash. + const u32 value = static_cast<u32>(op); + return value - (value >= 0x200 ? 
0x200 : 1); } -bool FixedPipelineState::BlendingAttachment::operator==(const BlendingAttachment& rhs) const - noexcept { - return std::tie(enable, rgb_equation, src_rgb_func, dst_rgb_func, a_equation, src_a_func, - dst_a_func, components) == - std::tie(rhs.enable, rhs.rgb_equation, rhs.src_rgb_func, rhs.dst_rgb_func, - rhs.a_equation, rhs.src_a_func, rhs.dst_a_func, rhs.components); +Maxwell::ComparisonOp FixedPipelineState::UnpackComparisonOp(u32 packed) noexcept { + // Read PackComparisonOp for the logic behind this. + return static_cast<Maxwell::ComparisonOp>(packed + 1); } -std::size_t FixedPipelineState::VertexInput::Hash() const noexcept { - std::size_t hash = num_bindings ^ (num_attributes << 32); - for (std::size_t i = 0; i < num_bindings; ++i) { - boost::hash_combine(hash, bindings[i].Hash()); +u32 FixedPipelineState::PackStencilOp(Maxwell::StencilOp op) noexcept { + switch (op) { + case Maxwell::StencilOp::Keep: + case Maxwell::StencilOp::KeepOGL: + return 0; + case Maxwell::StencilOp::Zero: + case Maxwell::StencilOp::ZeroOGL: + return 1; + case Maxwell::StencilOp::Replace: + case Maxwell::StencilOp::ReplaceOGL: + return 2; + case Maxwell::StencilOp::Incr: + case Maxwell::StencilOp::IncrOGL: + return 3; + case Maxwell::StencilOp::Decr: + case Maxwell::StencilOp::DecrOGL: + return 4; + case Maxwell::StencilOp::Invert: + case Maxwell::StencilOp::InvertOGL: + return 5; + case Maxwell::StencilOp::IncrWrap: + case Maxwell::StencilOp::IncrWrapOGL: + return 6; + case Maxwell::StencilOp::DecrWrap: + case Maxwell::StencilOp::DecrWrapOGL: + return 7; } - for (std::size_t i = 0; i < num_attributes; ++i) { - boost::hash_combine(hash, attributes[i].Hash()); - } - return hash; + return 0; } -bool FixedPipelineState::VertexInput::operator==(const VertexInput& rhs) const noexcept { - return std::equal(bindings.begin(), bindings.begin() + num_bindings, rhs.bindings.begin(), - rhs.bindings.begin() + rhs.num_bindings) && - std::equal(attributes.begin(), attributes.begin() + num_attributes, - rhs.attributes.begin(), rhs.attributes.begin() + rhs.num_attributes); +Maxwell::StencilOp FixedPipelineState::UnpackStencilOp(u32 packed) noexcept { + static constexpr std::array LUT = {Maxwell::StencilOp::Keep, Maxwell::StencilOp::Zero, + Maxwell::StencilOp::Replace, Maxwell::StencilOp::Incr, + Maxwell::StencilOp::Decr, Maxwell::StencilOp::Invert, + Maxwell::StencilOp::IncrWrap, Maxwell::StencilOp::DecrWrap}; + return LUT[packed]; } -std::size_t FixedPipelineState::InputAssembly::Hash() const noexcept { - std::size_t point_size_int = 0; - std::memcpy(&point_size_int, &point_size, sizeof(point_size)); - return (static_cast<std::size_t>(topology) << 24) ^ (point_size_int << 32) ^ - static_cast<std::size_t>(primitive_restart_enable); +u32 FixedPipelineState::PackCullFace(Maxwell::CullFace cull) noexcept { + // FrontAndBack is 0x408, by subtracting 0x406 from it we get 2. + // Individual cull faces are in 0x404 and 0x405, subtracting 0x404 we get 0 and 1. + const u32 value = static_cast<u32>(cull); + return value - (value == 0x408 ? 
0x406 : 0x404); } -bool FixedPipelineState::InputAssembly::operator==(const InputAssembly& rhs) const noexcept { - return std::tie(topology, primitive_restart_enable, point_size) == - std::tie(rhs.topology, rhs.primitive_restart_enable, rhs.point_size); +Maxwell::CullFace FixedPipelineState::UnpackCullFace(u32 packed) noexcept { + static constexpr std::array LUT = {Maxwell::CullFace::Front, Maxwell::CullFace::Back, + Maxwell::CullFace::FrontAndBack}; + return LUT[packed]; } -std::size_t FixedPipelineState::Tessellation::Hash() const noexcept { - return static_cast<std::size_t>(patch_control_points) ^ - (static_cast<std::size_t>(primitive) << 6) ^ (static_cast<std::size_t>(spacing) << 8) ^ - (static_cast<std::size_t>(clockwise) << 10); +u32 FixedPipelineState::PackFrontFace(Maxwell::FrontFace face) noexcept { + return static_cast<u32>(face) - 0x900; } -bool FixedPipelineState::Tessellation::operator==(const Tessellation& rhs) const noexcept { - return std::tie(patch_control_points, primitive, spacing, clockwise) == - std::tie(rhs.patch_control_points, rhs.primitive, rhs.spacing, rhs.clockwise); +Maxwell::FrontFace FixedPipelineState::UnpackFrontFace(u32 packed) noexcept { + return static_cast<Maxwell::FrontFace>(packed + 0x900); } -std::size_t FixedPipelineState::Rasterizer::Hash() const noexcept { - return static_cast<std::size_t>(cull_enable) ^ - (static_cast<std::size_t>(depth_bias_enable) << 1) ^ - (static_cast<std::size_t>(depth_clamp_enable) << 2) ^ - (static_cast<std::size_t>(ndc_minus_one_to_one) << 3) ^ - (static_cast<std::size_t>(cull_face) << 24) ^ - (static_cast<std::size_t>(front_face) << 48); +u32 FixedPipelineState::PackPolygonMode(Maxwell::PolygonMode mode) noexcept { + return static_cast<u32>(mode) - 0x1B00; } -bool FixedPipelineState::Rasterizer::operator==(const Rasterizer& rhs) const noexcept { - return std::tie(cull_enable, depth_bias_enable, depth_clamp_enable, ndc_minus_one_to_one, - cull_face, front_face) == - std::tie(rhs.cull_enable, rhs.depth_bias_enable, rhs.depth_clamp_enable, - rhs.ndc_minus_one_to_one, rhs.cull_face, rhs.front_face); +Maxwell::PolygonMode FixedPipelineState::UnpackPolygonMode(u32 packed) noexcept { + return static_cast<Maxwell::PolygonMode>(packed + 0x1B00); } -std::size_t FixedPipelineState::DepthStencil::Hash() const noexcept { - std::size_t hash = static_cast<std::size_t>(depth_test_enable) ^ - (static_cast<std::size_t>(depth_write_enable) << 1) ^ - (static_cast<std::size_t>(depth_bounds_enable) << 2) ^ - (static_cast<std::size_t>(stencil_enable) << 3) ^ - (static_cast<std::size_t>(depth_test_function) << 4); - boost::hash_combine(hash, front_stencil.Hash()); - boost::hash_combine(hash, back_stencil.Hash()); - return hash; +u32 FixedPipelineState::PackLogicOp(Maxwell::LogicOperation op) noexcept { + return static_cast<u32>(op) - 0x1500; } -bool FixedPipelineState::DepthStencil::operator==(const DepthStencil& rhs) const noexcept { - return std::tie(depth_test_enable, depth_write_enable, depth_bounds_enable, depth_test_function, - stencil_enable, front_stencil, back_stencil) == - std::tie(rhs.depth_test_enable, rhs.depth_write_enable, rhs.depth_bounds_enable, - rhs.depth_test_function, rhs.stencil_enable, rhs.front_stencil, - rhs.back_stencil); +Maxwell::LogicOperation FixedPipelineState::UnpackLogicOp(u32 packed) noexcept { + return static_cast<Maxwell::LogicOperation>(packed + 0x1500); } -std::size_t FixedPipelineState::ColorBlending::Hash() const noexcept { - std::size_t hash = attachments_count << 13; - for (std::size_t rt = 0; rt < 
static_cast<std::size_t>(attachments_count); ++rt) { - boost::hash_combine(hash, attachments[rt].Hash()); +u32 FixedPipelineState::PackBlendEquation(Maxwell::Blend::Equation equation) noexcept { + switch (equation) { + case Maxwell::Blend::Equation::Add: + case Maxwell::Blend::Equation::AddGL: + return 0; + case Maxwell::Blend::Equation::Subtract: + case Maxwell::Blend::Equation::SubtractGL: + return 1; + case Maxwell::Blend::Equation::ReverseSubtract: + case Maxwell::Blend::Equation::ReverseSubtractGL: + return 2; + case Maxwell::Blend::Equation::Min: + case Maxwell::Blend::Equation::MinGL: + return 3; + case Maxwell::Blend::Equation::Max: + case Maxwell::Blend::Equation::MaxGL: + return 4; } - return hash; + return 0; } -bool FixedPipelineState::ColorBlending::operator==(const ColorBlending& rhs) const noexcept { - return std::equal(attachments.begin(), attachments.begin() + attachments_count, - rhs.attachments.begin(), rhs.attachments.begin() + rhs.attachments_count); +Maxwell::Blend::Equation FixedPipelineState::UnpackBlendEquation(u32 packed) noexcept { + static constexpr std::array LUT = { + Maxwell::Blend::Equation::Add, Maxwell::Blend::Equation::Subtract, + Maxwell::Blend::Equation::ReverseSubtract, Maxwell::Blend::Equation::Min, + Maxwell::Blend::Equation::Max}; + return LUT[packed]; } -std::size_t FixedPipelineState::Hash() const noexcept { - std::size_t hash = 0; - boost::hash_combine(hash, vertex_input.Hash()); - boost::hash_combine(hash, input_assembly.Hash()); - boost::hash_combine(hash, tessellation.Hash()); - boost::hash_combine(hash, rasterizer.Hash()); - boost::hash_combine(hash, depth_stencil.Hash()); - boost::hash_combine(hash, color_blending.Hash()); - return hash; -} - -bool FixedPipelineState::operator==(const FixedPipelineState& rhs) const noexcept { - return std::tie(vertex_input, input_assembly, tessellation, rasterizer, depth_stencil, - color_blending) == std::tie(rhs.vertex_input, rhs.input_assembly, - rhs.tessellation, rhs.rasterizer, rhs.depth_stencil, - rhs.color_blending); +u32 FixedPipelineState::PackBlendFactor(Maxwell::Blend::Factor factor) noexcept { + switch (factor) { + case Maxwell::Blend::Factor::Zero: + case Maxwell::Blend::Factor::ZeroGL: + return 0; + case Maxwell::Blend::Factor::One: + case Maxwell::Blend::Factor::OneGL: + return 1; + case Maxwell::Blend::Factor::SourceColor: + case Maxwell::Blend::Factor::SourceColorGL: + return 2; + case Maxwell::Blend::Factor::OneMinusSourceColor: + case Maxwell::Blend::Factor::OneMinusSourceColorGL: + return 3; + case Maxwell::Blend::Factor::SourceAlpha: + case Maxwell::Blend::Factor::SourceAlphaGL: + return 4; + case Maxwell::Blend::Factor::OneMinusSourceAlpha: + case Maxwell::Blend::Factor::OneMinusSourceAlphaGL: + return 5; + case Maxwell::Blend::Factor::DestAlpha: + case Maxwell::Blend::Factor::DestAlphaGL: + return 6; + case Maxwell::Blend::Factor::OneMinusDestAlpha: + case Maxwell::Blend::Factor::OneMinusDestAlphaGL: + return 7; + case Maxwell::Blend::Factor::DestColor: + case Maxwell::Blend::Factor::DestColorGL: + return 8; + case Maxwell::Blend::Factor::OneMinusDestColor: + case Maxwell::Blend::Factor::OneMinusDestColorGL: + return 9; + case Maxwell::Blend::Factor::SourceAlphaSaturate: + case Maxwell::Blend::Factor::SourceAlphaSaturateGL: + return 10; + case Maxwell::Blend::Factor::Source1Color: + case Maxwell::Blend::Factor::Source1ColorGL: + return 11; + case Maxwell::Blend::Factor::OneMinusSource1Color: + case Maxwell::Blend::Factor::OneMinusSource1ColorGL: + return 12; + case 
Maxwell::Blend::Factor::Source1Alpha: + case Maxwell::Blend::Factor::Source1AlphaGL: + return 13; + case Maxwell::Blend::Factor::OneMinusSource1Alpha: + case Maxwell::Blend::Factor::OneMinusSource1AlphaGL: + return 14; + case Maxwell::Blend::Factor::ConstantColor: + case Maxwell::Blend::Factor::ConstantColorGL: + return 15; + case Maxwell::Blend::Factor::OneMinusConstantColor: + case Maxwell::Blend::Factor::OneMinusConstantColorGL: + return 16; + case Maxwell::Blend::Factor::ConstantAlpha: + case Maxwell::Blend::Factor::ConstantAlphaGL: + return 17; + case Maxwell::Blend::Factor::OneMinusConstantAlpha: + case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: + return 18; + } + return 0; } -FixedPipelineState GetFixedPipelineState(const Maxwell& regs) { - FixedPipelineState fixed_state; - fixed_state.input_assembly = GetInputAssemblyState(regs); - fixed_state.tessellation = GetTessellationState(regs); - fixed_state.rasterizer = GetRasterizerState(regs); - fixed_state.depth_stencil = GetDepthStencilState(regs); - fixed_state.color_blending = GetColorBlendingState(regs); - return fixed_state; +Maxwell::Blend::Factor FixedPipelineState::UnpackBlendFactor(u32 packed) noexcept { + static constexpr std::array LUT = { + Maxwell::Blend::Factor::Zero, + Maxwell::Blend::Factor::One, + Maxwell::Blend::Factor::SourceColor, + Maxwell::Blend::Factor::OneMinusSourceColor, + Maxwell::Blend::Factor::SourceAlpha, + Maxwell::Blend::Factor::OneMinusSourceAlpha, + Maxwell::Blend::Factor::DestAlpha, + Maxwell::Blend::Factor::OneMinusDestAlpha, + Maxwell::Blend::Factor::DestColor, + Maxwell::Blend::Factor::OneMinusDestColor, + Maxwell::Blend::Factor::SourceAlphaSaturate, + Maxwell::Blend::Factor::Source1Color, + Maxwell::Blend::Factor::OneMinusSource1Color, + Maxwell::Blend::Factor::Source1Alpha, + Maxwell::Blend::Factor::OneMinusSource1Alpha, + Maxwell::Blend::Factor::ConstantColor, + Maxwell::Blend::Factor::OneMinusConstantColor, + Maxwell::Blend::Factor::ConstantAlpha, + Maxwell::Blend::Factor::OneMinusConstantAlpha, + }; + return LUT[packed]; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h index 4c8ba7f90..2c18eeaae 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h @@ -7,6 +7,7 @@ #include <array> #include <type_traits> +#include "common/bit_field.h" #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" @@ -16,230 +17,184 @@ namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -// TODO(Rodrigo): Optimize this structure. 
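The replacement Hash() and operator== above treat the whole structure as a flat run of bytes (Common::CityHash64 over `this` for Size() bytes, std::memcmp for equality). This is only sound because every field is a packed integer with no padding bits, which the header below enforces with std::has_unique_object_representations_v, and because both the GL-style and D3D-style register encodings are normalized by the Pack* helpers before being stored. A minimal, self-contained sketch of the same idea on a toy struct (illustrative only; it uses FNV-1a instead of Common::CityHash64 and invented field names):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <type_traits>

// Toy two-field state packed into one u32, mirroring the packing style above.
struct ToyState {
    std::uint32_t raw;

    void Fill(std::uint32_t cull_face_reg, bool cull_enable) {
        // Normalize the register encoding (0x404/0x405/0x408) to a 0-2 index, as PackCullFace does.
        const std::uint32_t packed_cull = cull_face_reg - (cull_face_reg == 0x408 ? 0x406 : 0x404);
        raw = packed_cull | (static_cast<std::uint32_t>(cull_enable) << 2);
    }

    std::size_t Hash() const noexcept {
        // FNV-1a over the object representation; the real code uses Common::CityHash64.
        std::uint64_t hash = 0xcbf29ce484222325ULL;
        const auto* bytes = reinterpret_cast<const unsigned char*>(this);
        for (std::size_t i = 0; i < sizeof *this; ++i) {
            hash = (hash ^ bytes[i]) * 0x100000001b3ULL;
        }
        return static_cast<std::size_t>(hash);
    }

    bool operator==(const ToyState& rhs) const noexcept {
        return std::memcmp(this, &rhs, sizeof *this) == 0;
    }
};
static_assert(std::has_unique_object_representations_v<ToyState>);

int main() {
    ToyState a{};
    ToyState b{};
    a.Fill(0x404, true); // Front face culling enabled
    b.Fill(0x404, true);
    assert(a == b && a.Hash() == b.Hash()); // identical states collapse to one pipeline key
    b.Fill(0x405, true); // Back face culling enabled
    assert(!(a == b));
    return 0;
}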
- struct FixedPipelineState { - using PixelFormat = VideoCore::Surface::PixelFormat; - - struct VertexBinding { - constexpr VertexBinding(u32 index, u32 stride, u32 divisor) - : index{index}, stride{stride}, divisor{divisor} {} - VertexBinding() = default; - - u32 index; - u32 stride; - u32 divisor; + static u32 PackComparisonOp(Maxwell::ComparisonOp op) noexcept; + static Maxwell::ComparisonOp UnpackComparisonOp(u32 packed) noexcept; - std::size_t Hash() const noexcept; + static u32 PackStencilOp(Maxwell::StencilOp op) noexcept; + static Maxwell::StencilOp UnpackStencilOp(u32 packed) noexcept; - bool operator==(const VertexBinding& rhs) const noexcept; + static u32 PackCullFace(Maxwell::CullFace cull) noexcept; + static Maxwell::CullFace UnpackCullFace(u32 packed) noexcept; - bool operator!=(const VertexBinding& rhs) const noexcept { - return !operator==(rhs); - } - }; + static u32 PackFrontFace(Maxwell::FrontFace face) noexcept; + static Maxwell::FrontFace UnpackFrontFace(u32 packed) noexcept; - struct VertexAttribute { - constexpr VertexAttribute(u32 index, u32 buffer, Maxwell::VertexAttribute::Type type, - Maxwell::VertexAttribute::Size size, u32 offset) - : index{index}, buffer{buffer}, type{type}, size{size}, offset{offset} {} - VertexAttribute() = default; + static u32 PackPolygonMode(Maxwell::PolygonMode mode) noexcept; + static Maxwell::PolygonMode UnpackPolygonMode(u32 packed) noexcept; - u32 index; - u32 buffer; - Maxwell::VertexAttribute::Type type; - Maxwell::VertexAttribute::Size size; - u32 offset; + static u32 PackLogicOp(Maxwell::LogicOperation op) noexcept; + static Maxwell::LogicOperation UnpackLogicOp(u32 packed) noexcept; - std::size_t Hash() const noexcept; + static u32 PackBlendEquation(Maxwell::Blend::Equation equation) noexcept; + static Maxwell::Blend::Equation UnpackBlendEquation(u32 packed) noexcept; - bool operator==(const VertexAttribute& rhs) const noexcept; + static u32 PackBlendFactor(Maxwell::Blend::Factor factor) noexcept; + static Maxwell::Blend::Factor UnpackBlendFactor(u32 packed) noexcept; - bool operator!=(const VertexAttribute& rhs) const noexcept { - return !operator==(rhs); + struct BlendingAttachment { + union { + u32 raw; + BitField<0, 1, u32> mask_r; + BitField<1, 1, u32> mask_g; + BitField<2, 1, u32> mask_b; + BitField<3, 1, u32> mask_a; + BitField<4, 3, u32> equation_rgb; + BitField<7, 3, u32> equation_a; + BitField<10, 5, u32> factor_source_rgb; + BitField<15, 5, u32> factor_dest_rgb; + BitField<20, 5, u32> factor_source_a; + BitField<25, 5, u32> factor_dest_a; + BitField<30, 1, u32> enable; + }; + + void Fill(const Maxwell& regs, std::size_t index); + + constexpr std::array<bool, 4> Mask() const noexcept { + return {mask_r != 0, mask_g != 0, mask_b != 0, mask_a != 0}; } - }; - - struct StencilFace { - constexpr StencilFace(Maxwell::StencilOp action_stencil_fail, - Maxwell::StencilOp action_depth_fail, - Maxwell::StencilOp action_depth_pass, Maxwell::ComparisonOp test_func) - : action_stencil_fail{action_stencil_fail}, action_depth_fail{action_depth_fail}, - action_depth_pass{action_depth_pass}, test_func{test_func} {} - StencilFace() = default; - - Maxwell::StencilOp action_stencil_fail; - Maxwell::StencilOp action_depth_fail; - Maxwell::StencilOp action_depth_pass; - Maxwell::ComparisonOp test_func; - std::size_t Hash() const noexcept; - - bool operator==(const StencilFace& rhs) const noexcept; - - bool operator!=(const StencilFace& rhs) const noexcept { - return !operator==(rhs); + Maxwell::Blend::Equation EquationRGB() const noexcept { + 
return UnpackBlendEquation(equation_rgb.Value()); } - }; - struct BlendingAttachment { - constexpr BlendingAttachment(bool enable, Maxwell::Blend::Equation rgb_equation, - Maxwell::Blend::Factor src_rgb_func, - Maxwell::Blend::Factor dst_rgb_func, - Maxwell::Blend::Equation a_equation, - Maxwell::Blend::Factor src_a_func, - Maxwell::Blend::Factor dst_a_func, - std::array<bool, 4> components) - : enable{enable}, rgb_equation{rgb_equation}, src_rgb_func{src_rgb_func}, - dst_rgb_func{dst_rgb_func}, a_equation{a_equation}, src_a_func{src_a_func}, - dst_a_func{dst_a_func}, components{components} {} - BlendingAttachment() = default; - - bool enable; - Maxwell::Blend::Equation rgb_equation; - Maxwell::Blend::Factor src_rgb_func; - Maxwell::Blend::Factor dst_rgb_func; - Maxwell::Blend::Equation a_equation; - Maxwell::Blend::Factor src_a_func; - Maxwell::Blend::Factor dst_a_func; - std::array<bool, 4> components; - - std::size_t Hash() const noexcept; - - bool operator==(const BlendingAttachment& rhs) const noexcept; - - bool operator!=(const BlendingAttachment& rhs) const noexcept { - return !operator==(rhs); + Maxwell::Blend::Equation EquationAlpha() const noexcept { + return UnpackBlendEquation(equation_a.Value()); } - }; - struct VertexInput { - std::size_t num_bindings = 0; - std::size_t num_attributes = 0; - std::array<VertexBinding, Maxwell::NumVertexArrays> bindings; - std::array<VertexAttribute, Maxwell::NumVertexAttributes> attributes; + Maxwell::Blend::Factor SourceRGBFactor() const noexcept { + return UnpackBlendFactor(factor_source_rgb.Value()); + } - std::size_t Hash() const noexcept; + Maxwell::Blend::Factor DestRGBFactor() const noexcept { + return UnpackBlendFactor(factor_dest_rgb.Value()); + } - bool operator==(const VertexInput& rhs) const noexcept; + Maxwell::Blend::Factor SourceAlphaFactor() const noexcept { + return UnpackBlendFactor(factor_source_a.Value()); + } - bool operator!=(const VertexInput& rhs) const noexcept { - return !operator==(rhs); + Maxwell::Blend::Factor DestAlphaFactor() const noexcept { + return UnpackBlendFactor(factor_dest_a.Value()); } }; - struct InputAssembly { - constexpr InputAssembly(Maxwell::PrimitiveTopology topology, bool primitive_restart_enable, - float point_size) - : topology{topology}, primitive_restart_enable{primitive_restart_enable}, - point_size{point_size} {} - InputAssembly() = default; - - Maxwell::PrimitiveTopology topology; - bool primitive_restart_enable; - float point_size; + union VertexAttribute { + u32 raw; + BitField<0, 1, u32> enabled; + BitField<1, 5, u32> buffer; + BitField<6, 14, u32> offset; + BitField<20, 3, u32> type; + BitField<23, 6, u32> size; - std::size_t Hash() const noexcept; - - bool operator==(const InputAssembly& rhs) const noexcept; + constexpr Maxwell::VertexAttribute::Type Type() const noexcept { + return static_cast<Maxwell::VertexAttribute::Type>(type.Value()); + } - bool operator!=(const InputAssembly& rhs) const noexcept { - return !operator==(rhs); + constexpr Maxwell::VertexAttribute::Size Size() const noexcept { + return static_cast<Maxwell::VertexAttribute::Size>(size.Value()); } }; - struct Tessellation { - constexpr Tessellation(u32 patch_control_points, Maxwell::TessellationPrimitive primitive, - Maxwell::TessellationSpacing spacing, bool clockwise) - : patch_control_points{patch_control_points}, primitive{primitive}, spacing{spacing}, - clockwise{clockwise} {} - Tessellation() = default; + template <std::size_t Position> + union StencilFace { + BitField<Position + 0, 3, u32> action_stencil_fail; 
+ BitField<Position + 3, 3, u32> action_depth_fail; + BitField<Position + 6, 3, u32> action_depth_pass; + BitField<Position + 9, 3, u32> test_func; - u32 patch_control_points; - Maxwell::TessellationPrimitive primitive; - Maxwell::TessellationSpacing spacing; - bool clockwise; + Maxwell::StencilOp ActionStencilFail() const noexcept { + return UnpackStencilOp(action_stencil_fail); + } - std::size_t Hash() const noexcept; + Maxwell::StencilOp ActionDepthFail() const noexcept { + return UnpackStencilOp(action_depth_fail); + } - bool operator==(const Tessellation& rhs) const noexcept; + Maxwell::StencilOp ActionDepthPass() const noexcept { + return UnpackStencilOp(action_depth_pass); + } - bool operator!=(const Tessellation& rhs) const noexcept { - return !operator==(rhs); + Maxwell::ComparisonOp TestFunc() const noexcept { + return UnpackComparisonOp(test_func); } }; - struct Rasterizer { - constexpr Rasterizer(bool cull_enable, bool depth_bias_enable, bool depth_clamp_enable, - bool ndc_minus_one_to_one, Maxwell::CullFace cull_face, - Maxwell::FrontFace front_face) - : cull_enable{cull_enable}, depth_bias_enable{depth_bias_enable}, - depth_clamp_enable{depth_clamp_enable}, ndc_minus_one_to_one{ndc_minus_one_to_one}, - cull_face{cull_face}, front_face{front_face} {} - Rasterizer() = default; - - bool cull_enable; - bool depth_bias_enable; - bool depth_clamp_enable; - bool ndc_minus_one_to_one; - Maxwell::CullFace cull_face; - Maxwell::FrontFace front_face; - - std::size_t Hash() const noexcept; + union VertexBinding { + u16 raw; + BitField<0, 12, u16> stride; + BitField<12, 1, u16> enabled; + }; - bool operator==(const Rasterizer& rhs) const noexcept; + struct DynamicState { + union { + u32 raw1; + StencilFace<0> front; + StencilFace<12> back; + BitField<24, 1, u32> stencil_enable; + BitField<25, 1, u32> depth_write_enable; + BitField<26, 1, u32> depth_bounds_enable; + BitField<27, 1, u32> depth_test_enable; + BitField<28, 1, u32> front_face; + BitField<29, 3, u32> depth_test_func; + }; + union { + u32 raw2; + BitField<0, 2, u32> cull_face; + BitField<2, 1, u32> cull_enable; + }; + std::array<VertexBinding, Maxwell::NumVertexArrays> vertex_bindings; + + void Fill(const Maxwell& regs); + + Maxwell::ComparisonOp DepthTestFunc() const noexcept { + return UnpackComparisonOp(depth_test_func); + } - bool operator!=(const Rasterizer& rhs) const noexcept { - return !operator==(rhs); + Maxwell::CullFace CullFace() const noexcept { + return UnpackCullFace(cull_face.Value()); } - }; - struct DepthStencil { - constexpr DepthStencil(bool depth_test_enable, bool depth_write_enable, - bool depth_bounds_enable, bool stencil_enable, - Maxwell::ComparisonOp depth_test_function, StencilFace front_stencil, - StencilFace back_stencil) - : depth_test_enable{depth_test_enable}, depth_write_enable{depth_write_enable}, - depth_bounds_enable{depth_bounds_enable}, stencil_enable{stencil_enable}, - depth_test_function{depth_test_function}, front_stencil{front_stencil}, - back_stencil{back_stencil} {} - DepthStencil() = default; - - bool depth_test_enable; - bool depth_write_enable; - bool depth_bounds_enable; - bool stencil_enable; - Maxwell::ComparisonOp depth_test_function; - StencilFace front_stencil; - StencilFace back_stencil; - - std::size_t Hash() const noexcept; - - bool operator==(const DepthStencil& rhs) const noexcept; - - bool operator!=(const DepthStencil& rhs) const noexcept { - return !operator==(rhs); + Maxwell::FrontFace FrontFace() const noexcept { + return UnpackFrontFace(front_face.Value()); } }; - 
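DynamicState packs both stencil faces and the depth/stencil enables into a single 32-bit word: StencilFace<0> occupies bits 0-11, StencilFace<12> occupies bits 12-23, and the enable, front-face, and depth-test-function fields fill bits 24-31. A small sketch of the equivalent manual shift-and-mask layout, useful for sanity-checking the bit positions (illustrative only; PackRaw1 is an invented helper and the real code goes through Common::BitField):

#include <cassert>
#include <cstdint>

// Manual layout of DynamicState::raw1, with bit positions copied from the BitField declarations above.
constexpr std::uint32_t PackRaw1(std::uint32_t front12, std::uint32_t back12, bool stencil_enable,
                                 bool depth_write, bool depth_bounds, bool depth_test,
                                 std::uint32_t front_face, std::uint32_t depth_test_func) {
    return (front12 & 0xfff) | ((back12 & 0xfff) << 12) |
           (static_cast<std::uint32_t>(stencil_enable) << 24) |
           (static_cast<std::uint32_t>(depth_write) << 25) |
           (static_cast<std::uint32_t>(depth_bounds) << 26) |
           (static_cast<std::uint32_t>(depth_test) << 27) |
           ((front_face & 1) << 28) | ((depth_test_func & 7) << 29);
}

int main() {
    // A packed comparison op of 7 (what PackComparisonOp above produces for Always) lands in the
    // top field and must not spill outside bits 29-31.
    constexpr std::uint32_t raw1 = PackRaw1(0, 0, true, true, false, true, 0, 7);
    static_assert((raw1 >> 29) == 7);
    assert((raw1 & (1u << 24)) != 0); // stencil_enable landed on bit 24
    return 0;
}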
struct ColorBlending { - constexpr ColorBlending( - std::array<float, 4> blend_constants, std::size_t attachments_count, - std::array<BlendingAttachment, Maxwell::NumRenderTargets> attachments) - : attachments_count{attachments_count}, attachments{attachments} {} - ColorBlending() = default; - - std::size_t attachments_count; - std::array<BlendingAttachment, Maxwell::NumRenderTargets> attachments; - - std::size_t Hash() const noexcept; - - bool operator==(const ColorBlending& rhs) const noexcept; - - bool operator!=(const ColorBlending& rhs) const noexcept { - return !operator==(rhs); - } + union { + u32 raw; + BitField<0, 1, u32> no_extended_dynamic_state; + BitField<2, 1, u32> primitive_restart_enable; + BitField<3, 1, u32> depth_bias_enable; + BitField<4, 1, u32> depth_clamp_disabled; + BitField<5, 1, u32> ndc_minus_one_to_one; + BitField<6, 2, u32> polygon_mode; + BitField<8, 5, u32> patch_control_points_minus_one; + BitField<13, 2, u32> tessellation_primitive; + BitField<15, 2, u32> tessellation_spacing; + BitField<17, 1, u32> tessellation_clockwise; + BitField<18, 1, u32> logic_op_enable; + BitField<19, 4, u32> logic_op; + BitField<23, 1, u32> rasterize_enable; + BitField<24, 4, Maxwell::PrimitiveTopology> topology; }; + u32 point_size; + std::array<u32, Maxwell::NumVertexArrays> binding_divisors; + std::array<VertexAttribute, Maxwell::NumVertexAttributes> attributes; + std::array<BlendingAttachment, Maxwell::NumRenderTargets> attachments; + std::array<u16, Maxwell::NumViewports> viewport_swizzles; + DynamicState dynamic_state; + + void Fill(const Maxwell& regs, bool has_extended_dynamic_state); std::size_t Hash() const noexcept; @@ -249,26 +204,14 @@ struct FixedPipelineState { return !operator==(rhs); } - VertexInput vertex_input; - InputAssembly input_assembly; - Tessellation tessellation; - Rasterizer rasterizer; - DepthStencil depth_stencil; - ColorBlending color_blending; + std::size_t Size() const noexcept { + const std::size_t total_size = sizeof *this; + return total_size - (no_extended_dynamic_state != 0 ? 
0 : sizeof(DynamicState)); + } }; -static_assert(std::is_trivially_copyable_v<FixedPipelineState::VertexBinding>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::VertexAttribute>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::StencilFace>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::BlendingAttachment>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::VertexInput>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::InputAssembly>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::Tessellation>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::Rasterizer>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::DepthStencil>); -static_assert(std::is_trivially_copyable_v<FixedPipelineState::ColorBlending>); +static_assert(std::has_unique_object_representations_v<FixedPipelineState>); static_assert(std::is_trivially_copyable_v<FixedPipelineState>); - -FixedPipelineState GetFixedPipelineState(const Maxwell& regs); +static_assert(std::is_trivially_constructible_v<FixedPipelineState>); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 8681b821f..d22de1d81 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -21,29 +21,29 @@ namespace Sampler { VkFilter Filter(Tegra::Texture::TextureFilter filter) { switch (filter) { - case Tegra::Texture::TextureFilter::Linear: - return VK_FILTER_LINEAR; case Tegra::Texture::TextureFilter::Nearest: return VK_FILTER_NEAREST; + case Tegra::Texture::TextureFilter::Linear: + return VK_FILTER_LINEAR; } - UNIMPLEMENTED_MSG("Unimplemented sampler filter={}", static_cast<u32>(filter)); + UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter)); return {}; } VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter) { switch (mipmap_filter) { case Tegra::Texture::TextureMipmapFilter::None: - // TODO(Rodrigo): None seems to be mapped to OpenGL's mag and min filters without mipmapping - // (e.g. GL_NEAREST and GL_LINEAR). Vulkan doesn't have such a thing, find out if we have to - // use an image view with a single mipmap level to emulate this. - return VK_SAMPLER_MIPMAP_MODE_LINEAR; - ; - case Tegra::Texture::TextureMipmapFilter::Linear: - return VK_SAMPLER_MIPMAP_MODE_LINEAR; + // There are no Vulkan filter modes that directly correspond to OpenGL minification filters + // of GL_LINEAR or GL_NEAREST, but they can be emulated using + // VK_SAMPLER_MIPMAP_MODE_NEAREST, minLod = 0, and maxLod = 0.25, and using minFilter = + // VK_FILTER_LINEAR or minFilter = VK_FILTER_NEAREST, respectively. 
+ return VK_SAMPLER_MIPMAP_MODE_NEAREST; case Tegra::Texture::TextureMipmapFilter::Nearest: return VK_SAMPLER_MIPMAP_MODE_NEAREST; + case Tegra::Texture::TextureMipmapFilter::Linear: + return VK_SAMPLER_MIPMAP_MODE_LINEAR; } - UNIMPLEMENTED_MSG("Unimplemented sampler mipmap mode={}", static_cast<u32>(mipmap_filter)); + UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter)); return {}; } @@ -118,89 +118,101 @@ struct FormatTuple { VkFormat format; ///< Vulkan format int usage = 0; ///< Describes image format usage } constexpr tex_format_tuples[] = { - {VK_FORMAT_A8B8G8R8_UNORM_PACK32, Attachable | Storage}, // ABGR8U - {VK_FORMAT_A8B8G8R8_SNORM_PACK32, Attachable | Storage}, // ABGR8S - {VK_FORMAT_A8B8G8R8_UINT_PACK32, Attachable | Storage}, // ABGR8UI - {VK_FORMAT_B5G6R5_UNORM_PACK16}, // B5G6R5U - {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10U - {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1B5G5R5U (flipped with swizzle) - {VK_FORMAT_R8_UNORM, Attachable | Storage}, // R8U - {VK_FORMAT_R8_UINT, Attachable | Storage}, // R8UI - {VK_FORMAT_R16G16B16A16_SFLOAT, Attachable | Storage}, // RGBA16F - {VK_FORMAT_R16G16B16A16_UNORM, Attachable | Storage}, // RGBA16U - {VK_FORMAT_R16G16B16A16_SNORM, Attachable | Storage}, // RGBA16S - {VK_FORMAT_R16G16B16A16_UINT, Attachable | Storage}, // RGBA16UI - {VK_FORMAT_B10G11R11_UFLOAT_PACK32, Attachable | Storage}, // R11FG11FB10F - {VK_FORMAT_R32G32B32A32_UINT, Attachable | Storage}, // RGBA32UI - {VK_FORMAT_BC1_RGBA_UNORM_BLOCK}, // DXT1 - {VK_FORMAT_BC2_UNORM_BLOCK}, // DXT23 - {VK_FORMAT_BC3_UNORM_BLOCK}, // DXT45 - {VK_FORMAT_BC4_UNORM_BLOCK}, // DXN1 - {VK_FORMAT_BC5_UNORM_BLOCK}, // DXN2UNORM - {VK_FORMAT_BC5_SNORM_BLOCK}, // DXN2SNORM - {VK_FORMAT_BC7_UNORM_BLOCK}, // BC7U - {VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UF16 - {VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SF16 - {VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4 - {VK_FORMAT_B8G8R8A8_UNORM}, // BGRA8 - {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // RGBA32F - {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // RG32F - {VK_FORMAT_R32_SFLOAT, Attachable | Storage}, // R32F - {VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16F - {VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16U - {VK_FORMAT_UNDEFINED}, // R16S - {VK_FORMAT_UNDEFINED}, // R16UI - {VK_FORMAT_UNDEFINED}, // R16I - {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // RG16 - {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // RG16F - {VK_FORMAT_UNDEFINED}, // RG16UI - {VK_FORMAT_UNDEFINED}, // RG16I - {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // RG16S - {VK_FORMAT_UNDEFINED}, // RGB32F - {VK_FORMAT_R8G8B8A8_SRGB, Attachable}, // RGBA8_SRGB - {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // RG8U - {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // RG8S - {VK_FORMAT_R32G32_UINT, Attachable | Storage}, // RG32UI - {VK_FORMAT_UNDEFINED}, // RGBX16F - {VK_FORMAT_R32_UINT, Attachable | Storage}, // R32UI - {VK_FORMAT_R32_SINT, Attachable | Storage}, // R32I - {VK_FORMAT_ASTC_8x8_UNORM_BLOCK}, // ASTC_2D_8X8 - {VK_FORMAT_UNDEFINED}, // ASTC_2D_8X5 - {VK_FORMAT_UNDEFINED}, // ASTC_2D_5X4 - {VK_FORMAT_UNDEFINED}, // BGRA8_SRGB - {VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // DXT1_SRGB - {VK_FORMAT_BC2_SRGB_BLOCK}, // DXT23_SRGB - {VK_FORMAT_BC3_SRGB_BLOCK}, // DXT45_SRGB - {VK_FORMAT_BC7_SRGB_BLOCK}, // BC7U_SRGB - {VK_FORMAT_R4G4B4A4_UNORM_PACK16, Attachable}, // R4G4B4A4U - {VK_FORMAT_ASTC_4x4_SRGB_BLOCK}, // ASTC_2D_4X4_SRGB - {VK_FORMAT_ASTC_8x8_SRGB_BLOCK}, // 
ASTC_2D_8X8_SRGB - {VK_FORMAT_ASTC_8x5_SRGB_BLOCK}, // ASTC_2D_8X5_SRGB - {VK_FORMAT_ASTC_5x4_SRGB_BLOCK}, // ASTC_2D_5X4_SRGB - {VK_FORMAT_ASTC_5x5_UNORM_BLOCK}, // ASTC_2D_5X5 - {VK_FORMAT_ASTC_5x5_SRGB_BLOCK}, // ASTC_2D_5X5_SRGB - {VK_FORMAT_ASTC_10x8_UNORM_BLOCK}, // ASTC_2D_10X8 - {VK_FORMAT_ASTC_10x8_SRGB_BLOCK}, // ASTC_2D_10X8_SRGB - {VK_FORMAT_ASTC_6x6_UNORM_BLOCK}, // ASTC_2D_6X6 - {VK_FORMAT_ASTC_6x6_SRGB_BLOCK}, // ASTC_2D_6X6_SRGB - {VK_FORMAT_ASTC_10x10_UNORM_BLOCK}, // ASTC_2D_10X10 - {VK_FORMAT_ASTC_10x10_SRGB_BLOCK}, // ASTC_2D_10X10_SRGB - {VK_FORMAT_ASTC_12x12_UNORM_BLOCK}, // ASTC_2D_12X12 - {VK_FORMAT_ASTC_12x12_SRGB_BLOCK}, // ASTC_2D_12X12_SRGB - {VK_FORMAT_ASTC_8x6_UNORM_BLOCK}, // ASTC_2D_8X6 - {VK_FORMAT_ASTC_8x6_SRGB_BLOCK}, // ASTC_2D_8X6_SRGB - {VK_FORMAT_ASTC_6x5_UNORM_BLOCK}, // ASTC_2D_6X5 - {VK_FORMAT_ASTC_6x5_SRGB_BLOCK}, // ASTC_2D_6X5_SRGB - {VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}, // E5B9G9R9F + {VK_FORMAT_A8B8G8R8_UNORM_PACK32, Attachable | Storage}, // A8B8G8R8_UNORM + {VK_FORMAT_A8B8G8R8_SNORM_PACK32, Attachable | Storage}, // A8B8G8R8_SNORM + {VK_FORMAT_A8B8G8R8_SINT_PACK32, Attachable | Storage}, // A8B8G8R8_SINT + {VK_FORMAT_A8B8G8R8_UINT_PACK32, Attachable | Storage}, // A8B8G8R8_UINT + {VK_FORMAT_R5G6B5_UNORM_PACK16, Attachable}, // R5G6B5_UNORM + {VK_FORMAT_B5G6R5_UNORM_PACK16, Attachable}, // B5G6R5_UNORM + {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1R5G5B5_UNORM + {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10_UNORM + {VK_FORMAT_A2B10G10R10_UINT_PACK32, Attachable | Storage}, // A2B10G10R10_UINT + {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1B5G5R5_UNORM (flipped with swizzle) + {VK_FORMAT_R8_UNORM, Attachable | Storage}, // R8_UNORM + {VK_FORMAT_R8_SNORM, Attachable | Storage}, // R8_SNORM + {VK_FORMAT_R8_SINT, Attachable | Storage}, // R8_SINT + {VK_FORMAT_R8_UINT, Attachable | Storage}, // R8_UINT + {VK_FORMAT_R16G16B16A16_SFLOAT, Attachable | Storage}, // R16G16B16A16_FLOAT + {VK_FORMAT_R16G16B16A16_UNORM, Attachable | Storage}, // R16G16B16A16_UNORM + {VK_FORMAT_R16G16B16A16_SNORM, Attachable | Storage}, // R16G16B16A16_SNORM + {VK_FORMAT_R16G16B16A16_SINT, Attachable | Storage}, // R16G16B16A16_SINT + {VK_FORMAT_R16G16B16A16_UINT, Attachable | Storage}, // R16G16B16A16_UINT + {VK_FORMAT_B10G11R11_UFLOAT_PACK32, Attachable | Storage}, // B10G11R11_FLOAT + {VK_FORMAT_R32G32B32A32_UINT, Attachable | Storage}, // R32G32B32A32_UINT + {VK_FORMAT_BC1_RGBA_UNORM_BLOCK}, // BC1_RGBA_UNORM + {VK_FORMAT_BC2_UNORM_BLOCK}, // BC2_UNORM + {VK_FORMAT_BC3_UNORM_BLOCK}, // BC3_UNORM + {VK_FORMAT_BC4_UNORM_BLOCK}, // BC4_UNORM + {VK_FORMAT_BC4_SNORM_BLOCK}, // BC4_SNORM + {VK_FORMAT_BC5_UNORM_BLOCK}, // BC5_UNORM + {VK_FORMAT_BC5_SNORM_BLOCK}, // BC5_SNORM + {VK_FORMAT_BC7_UNORM_BLOCK}, // BC7_UNORM + {VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UFLOAT + {VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SFLOAT + {VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4_UNORM + {VK_FORMAT_B8G8R8A8_UNORM, Attachable}, // B8G8R8A8_UNORM + {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // R32G32B32A32_FLOAT + {VK_FORMAT_R32G32B32A32_SINT, Attachable | Storage}, // R32G32B32A32_SINT + {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // R32G32_FLOAT + {VK_FORMAT_R32G32_SINT, Attachable | Storage}, // R32G32_SINT + {VK_FORMAT_R32_SFLOAT, Attachable | Storage}, // R32_FLOAT + {VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16_FLOAT + {VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16_UNORM + {VK_FORMAT_UNDEFINED}, // R16_SNORM + 
{VK_FORMAT_R16_UINT, Attachable | Storage}, // R16_UINT + {VK_FORMAT_UNDEFINED}, // R16_SINT + {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // R16G16_UNORM + {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // R16G16_FLOAT + {VK_FORMAT_UNDEFINED}, // R16G16_UINT + {VK_FORMAT_UNDEFINED}, // R16G16_SINT + {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM + {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT + {VK_FORMAT_R8G8B8A8_SRGB, Attachable}, // A8B8G8R8_SRGB + {VK_FORMAT_R8G8_UNORM, Attachable | Storage}, // R8G8_UNORM + {VK_FORMAT_R8G8_SNORM, Attachable | Storage}, // R8G8_SNORM + {VK_FORMAT_R8G8_SINT, Attachable | Storage}, // R8G8_SINT + {VK_FORMAT_R8G8_UINT, Attachable | Storage}, // R8G8_UINT + {VK_FORMAT_R32G32_UINT, Attachable | Storage}, // R32G32_UINT + {VK_FORMAT_UNDEFINED}, // R16G16B16X16_FLOAT + {VK_FORMAT_R32_UINT, Attachable | Storage}, // R32_UINT + {VK_FORMAT_R32_SINT, Attachable | Storage}, // R32_SINT + {VK_FORMAT_ASTC_8x8_UNORM_BLOCK}, // ASTC_2D_8X8_UNORM + {VK_FORMAT_UNDEFINED}, // ASTC_2D_8X5_UNORM + {VK_FORMAT_UNDEFINED}, // ASTC_2D_5X4_UNORM + {VK_FORMAT_B8G8R8A8_SRGB, Attachable}, // B8G8R8A8_SRGB + {VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // BC1_RGBA_SRGB + {VK_FORMAT_BC2_SRGB_BLOCK}, // BC2_SRGB + {VK_FORMAT_BC3_SRGB_BLOCK}, // BC3_SRGB + {VK_FORMAT_BC7_SRGB_BLOCK}, // BC7_SRGB + {VK_FORMAT_R4G4B4A4_UNORM_PACK16, Attachable}, // A4B4G4R4_UNORM + {VK_FORMAT_ASTC_4x4_SRGB_BLOCK}, // ASTC_2D_4X4_SRGB + {VK_FORMAT_ASTC_8x8_SRGB_BLOCK}, // ASTC_2D_8X8_SRGB + {VK_FORMAT_ASTC_8x5_SRGB_BLOCK}, // ASTC_2D_8X5_SRGB + {VK_FORMAT_ASTC_5x4_SRGB_BLOCK}, // ASTC_2D_5X4_SRGB + {VK_FORMAT_ASTC_5x5_UNORM_BLOCK}, // ASTC_2D_5X5_UNORM + {VK_FORMAT_ASTC_5x5_SRGB_BLOCK}, // ASTC_2D_5X5_SRGB + {VK_FORMAT_ASTC_10x8_UNORM_BLOCK}, // ASTC_2D_10X8_UNORM + {VK_FORMAT_ASTC_10x8_SRGB_BLOCK}, // ASTC_2D_10X8_SRGB + {VK_FORMAT_ASTC_6x6_UNORM_BLOCK}, // ASTC_2D_6X6_UNORM + {VK_FORMAT_ASTC_6x6_SRGB_BLOCK}, // ASTC_2D_6X6_SRGB + {VK_FORMAT_ASTC_10x10_UNORM_BLOCK}, // ASTC_2D_10X10_UNORM + {VK_FORMAT_ASTC_10x10_SRGB_BLOCK}, // ASTC_2D_10X10_SRGB + {VK_FORMAT_ASTC_12x12_UNORM_BLOCK}, // ASTC_2D_12X12_UNORM + {VK_FORMAT_ASTC_12x12_SRGB_BLOCK}, // ASTC_2D_12X12_SRGB + {VK_FORMAT_ASTC_8x6_UNORM_BLOCK}, // ASTC_2D_8X6_UNORM + {VK_FORMAT_ASTC_8x6_SRGB_BLOCK}, // ASTC_2D_8X6_SRGB + {VK_FORMAT_ASTC_6x5_UNORM_BLOCK}, // ASTC_2D_6X5_UNORM + {VK_FORMAT_ASTC_6x5_SRGB_BLOCK}, // ASTC_2D_6X5_SRGB + {VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}, // E5B9G9R9_FLOAT // Depth formats - {VK_FORMAT_D32_SFLOAT, Attachable}, // Z32F - {VK_FORMAT_D16_UNORM, Attachable}, // Z16 + {VK_FORMAT_D32_SFLOAT, Attachable}, // D32_FLOAT + {VK_FORMAT_D16_UNORM, Attachable}, // D16_UNORM // DepthStencil formats - {VK_FORMAT_D24_UNORM_S8_UINT, Attachable}, // Z24S8 - {VK_FORMAT_D24_UNORM_S8_UINT, Attachable}, // S8Z24 (emulated) - {VK_FORMAT_D32_SFLOAT_S8_UINT, Attachable}, // Z32FS8 + {VK_FORMAT_D24_UNORM_S8_UINT, Attachable}, // D24_UNORM_S8_UINT + {VK_FORMAT_D24_UNORM_S8_UINT, Attachable}, // S8_UINT_D24_UNORM (emulated) + {VK_FORMAT_D32_SFLOAT_S8_UINT, Attachable}, // D32_FLOAT_S8_UINT }; static_assert(std::size(tex_format_tuples) == VideoCore::Surface::MaxPixelFormat); @@ -221,7 +233,7 @@ FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFo return {VK_FORMAT_A8B8G8R8_UNORM_PACK32, true, true}; } - // Use ABGR8 on hardware that doesn't support ASTC natively + // Use A8B8G8R8_UNORM on hardware that doesn't support ASTC natively if (!device.IsOptimalAstcSupported() && 
VideoCore::Surface::IsPixelFormatASTC(pixel_format)) { tuple.format = VideoCore::Surface::IsPixelFormatSRGB(pixel_format) ? VK_FORMAT_A8B8G8R8_SRGB_PACK32 @@ -295,6 +307,30 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) { switch (type) { + case Maxwell::VertexAttribute::Type::UnsignedNorm: + switch (size) { + case Maxwell::VertexAttribute::Size::Size_8: + return VK_FORMAT_R8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8: + return VK_FORMAT_R8G8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8_8: + return VK_FORMAT_R8G8B8_UNORM; + case Maxwell::VertexAttribute::Size::Size_8_8_8_8: + return VK_FORMAT_R8G8B8A8_UNORM; + case Maxwell::VertexAttribute::Size::Size_16: + return VK_FORMAT_R16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16: + return VK_FORMAT_R16G16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16_16: + return VK_FORMAT_R16G16B16_UNORM; + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return VK_FORMAT_R16G16B16A16_UNORM; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + default: + break; + } + break; case Maxwell::VertexAttribute::Type::SignedNorm: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: @@ -319,44 +355,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib break; } break; - case Maxwell::VertexAttribute::Type::UnsignedNorm: + case Maxwell::VertexAttribute::Type::UnsignedScaled: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_UNORM; + return VK_FORMAT_R8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_UNORM; + return VK_FORMAT_R8G8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_UNORM; + return VK_FORMAT_R8G8B8_USCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_UNORM; + return VK_FORMAT_R8G8B8A8_USCALED; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_UNORM; + return VK_FORMAT_R16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_UNORM; + return VK_FORMAT_R16G16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_UNORM; + return VK_FORMAT_R16G16B16_USCALED; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_UNORM; + return VK_FORMAT_R16G16B16A16_USCALED; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: - return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + return VK_FORMAT_A2B10G10R10_USCALED_PACK32; default: break; } break; - case Maxwell::VertexAttribute::Type::SignedInt: + case Maxwell::VertexAttribute::Type::SignedScaled: switch (size) { - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SINT; case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_SINT; + return VK_FORMAT_R8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_SINT; + return VK_FORMAT_R8G8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_SINT; + return VK_FORMAT_R8G8B8_SSCALED; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_SINT; - case Maxwell::VertexAttribute::Size::Size_32: - return VK_FORMAT_R32_SINT; + return VK_FORMAT_R8G8B8A8_SSCALED; + case Maxwell::VertexAttribute::Size::Size_16: + return VK_FORMAT_R16_SSCALED; + case 
Maxwell::VertexAttribute::Size::Size_16_16: + return VK_FORMAT_R16G16_SSCALED; + case Maxwell::VertexAttribute::Size::Size_16_16_16: + return VK_FORMAT_R16G16B16_SSCALED; + case Maxwell::VertexAttribute::Size::Size_16_16_16_16: + return VK_FORMAT_R16G16B16A16_SSCALED; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_SSCALED_PACK32; default: break; } @@ -387,56 +429,54 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return VK_FORMAT_R32G32B32_UINT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_UINT; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_UINT_PACK32; default: break; } break; - case Maxwell::VertexAttribute::Type::UnsignedScaled: + case Maxwell::VertexAttribute::Type::SignedInt: switch (size) { case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_USCALED; + return VK_FORMAT_R8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_USCALED; + return VK_FORMAT_R8G8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_USCALED; + return VK_FORMAT_R8G8B8_SINT; case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_USCALED; + return VK_FORMAT_R8G8B8A8_SINT; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_USCALED; + return VK_FORMAT_R16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_USCALED; + return VK_FORMAT_R16G16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_USCALED; + return VK_FORMAT_R16G16B16_SINT; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_USCALED; + return VK_FORMAT_R16G16B16A16_SINT; + case Maxwell::VertexAttribute::Size::Size_32: + return VK_FORMAT_R32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32: + return VK_FORMAT_R32G32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32_32: + return VK_FORMAT_R32G32B32_SINT; + case Maxwell::VertexAttribute::Size::Size_32_32_32_32: + return VK_FORMAT_R32G32B32A32_SINT; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return VK_FORMAT_A2B10G10R10_SINT_PACK32; default: break; } break; - case Maxwell::VertexAttribute::Type::SignedScaled: + case Maxwell::VertexAttribute::Type::Float: switch (size) { - case Maxwell::VertexAttribute::Size::Size_8: - return VK_FORMAT_R8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8: - return VK_FORMAT_R8G8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8_8: - return VK_FORMAT_R8G8B8_SSCALED; - case Maxwell::VertexAttribute::Size::Size_8_8_8_8: - return VK_FORMAT_R8G8B8A8_SSCALED; case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_SSCALED; + return VK_FORMAT_R16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_SSCALED; + return VK_FORMAT_R16G16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_SSCALED; + return VK_FORMAT_R16G16B16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SSCALED; - default: - break; - } - break; - case Maxwell::VertexAttribute::Type::Float: - switch (size) { + return VK_FORMAT_R16G16B16A16_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32: return VK_FORMAT_R32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32: @@ -445,14 +485,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib return 
VK_FORMAT_R32G32B32_SFLOAT; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return VK_FORMAT_R32G32B32A32_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16: - return VK_FORMAT_R16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16: - return VK_FORMAT_R16G16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16_16: - return VK_FORMAT_R16G16B16_SFLOAT; - case Maxwell::VertexAttribute::Size::Size_16_16_16_16: - return VK_FORMAT_R16G16B16A16_SFLOAT; default: break; } @@ -672,4 +704,27 @@ VkComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle) { return {}; } +VkViewportCoordinateSwizzleNV ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) { + switch (swizzle) { + case Maxwell::ViewportSwizzle::PositiveX: + return VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV; + case Maxwell::ViewportSwizzle::NegativeX: + return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_X_NV; + case Maxwell::ViewportSwizzle::PositiveY: + return VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_Y_NV; + case Maxwell::ViewportSwizzle::NegativeY: + return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_Y_NV; + case Maxwell::ViewportSwizzle::PositiveZ: + return VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_Z_NV; + case Maxwell::ViewportSwizzle::NegativeZ: + return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_Z_NV; + case Maxwell::ViewportSwizzle::PositiveW: + return VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_W_NV; + case Maxwell::ViewportSwizzle::NegativeW: + return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV; + } + UNREACHABLE_MSG("Invalid swizzle={}", static_cast<int>(swizzle)); + return {}; +} + } // namespace Vulkan::MaxwellToVK diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 81bce4c6c..7e213452f 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -59,4 +59,6 @@ VkCullModeFlags CullFace(Maxwell::CullFace cull_face); VkComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle); +VkViewportCoordinateSwizzleNV ViewportSwizzle(Maxwell::ViewportSwizzle swizzle); + } // namespace Vulkan::MaxwellToVK diff --git a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp b/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp new file mode 100644 index 000000000..5b01020ec --- /dev/null +++ b/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp @@ -0,0 +1,220 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#ifdef HAS_NSIGHT_AFTERMATH + +#include <mutex> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include <fmt/format.h> + +#define VK_NO_PROTOTYPES +#include <vulkan/vulkan.h> + +#include <GFSDK_Aftermath.h> +#include <GFSDK_Aftermath_Defines.h> +#include <GFSDK_Aftermath_GpuCrashDump.h> +#include <GFSDK_Aftermath_GpuCrashDumpDecoding.h> + +#include "common/common_paths.h" +#include "common/common_types.h" +#include "common/file_util.h" +#include "common/logging/log.h" +#include "common/scope_exit.h" + +#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h" + +namespace Vulkan { + +static constexpr char AFTERMATH_LIB_NAME[] = "GFSDK_Aftermath_Lib.x64.dll"; + +NsightAftermathTracker::NsightAftermathTracker() = default; + +NsightAftermathTracker::~NsightAftermathTracker() { + if (initialized) { + (void)GFSDK_Aftermath_DisableGpuCrashDumps(); + } +} + +bool NsightAftermathTracker::Initialize() { + if (!dl.Open(AFTERMATH_LIB_NAME)) { + LOG_ERROR(Render_Vulkan, "Failed to load Nsight Aftermath DLL"); + return false; + } + + if (!dl.GetSymbol("GFSDK_Aftermath_DisableGpuCrashDumps", + &GFSDK_Aftermath_DisableGpuCrashDumps) || + !dl.GetSymbol("GFSDK_Aftermath_EnableGpuCrashDumps", + &GFSDK_Aftermath_EnableGpuCrashDumps) || + !dl.GetSymbol("GFSDK_Aftermath_GetShaderDebugInfoIdentifier", + &GFSDK_Aftermath_GetShaderDebugInfoIdentifier) || + !dl.GetSymbol("GFSDK_Aftermath_GetShaderHashSpirv", &GFSDK_Aftermath_GetShaderHashSpirv) || + !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_CreateDecoder", + &GFSDK_Aftermath_GpuCrashDump_CreateDecoder) || + !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_DestroyDecoder", + &GFSDK_Aftermath_GpuCrashDump_DestroyDecoder) || + !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_GenerateJSON", + &GFSDK_Aftermath_GpuCrashDump_GenerateJSON) || + !dl.GetSymbol("GFSDK_Aftermath_GpuCrashDump_GetJSON", + &GFSDK_Aftermath_GpuCrashDump_GetJSON)) { + LOG_ERROR(Render_Vulkan, "Failed to load Nsight Aftermath function pointers"); + return false; + } + + dump_dir = Common::FS::GetUserPath(Common::FS::UserPath::LogDir) + "gpucrash"; + + (void)Common::FS::DeleteDirRecursively(dump_dir); + if (!Common::FS::CreateDir(dump_dir)) { + LOG_ERROR(Render_Vulkan, "Failed to create Nsight Aftermath dump directory"); + return false; + } + + if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_EnableGpuCrashDumps( + GFSDK_Aftermath_Version_API, GFSDK_Aftermath_GpuCrashDumpWatchedApiFlags_Vulkan, + GFSDK_Aftermath_GpuCrashDumpFeatureFlags_Default, GpuCrashDumpCallback, + ShaderDebugInfoCallback, CrashDumpDescriptionCallback, this))) { + LOG_ERROR(Render_Vulkan, "GFSDK_Aftermath_EnableGpuCrashDumps failed"); + return false; + } + + LOG_INFO(Render_Vulkan, "Nsight Aftermath dump directory is \"{}\"", dump_dir); + + initialized = true; + return true; +} + +void NsightAftermathTracker::SaveShader(const std::vector<u32>& spirv) const { + if (!initialized) { + return; + } + + std::vector<u32> spirv_copy = spirv; + GFSDK_Aftermath_SpirvCode shader; + shader.pData = spirv_copy.data(); + shader.size = static_cast<u32>(spirv_copy.size() * 4); + + std::scoped_lock lock{mutex}; + + GFSDK_Aftermath_ShaderHash hash; + if (!GFSDK_Aftermath_SUCCEED( + GFSDK_Aftermath_GetShaderHashSpirv(GFSDK_Aftermath_Version_API, &shader, &hash))) { + LOG_ERROR(Render_Vulkan, "Failed to hash SPIR-V module"); + return; + } + + Common::FS::IOFile file(fmt::format("{}/source_{:016x}.spv", dump_dir, hash.hash), "wb"); + if (!file.IsOpen()) { + LOG_ERROR(Render_Vulkan, "Failed to dump SPIR-V module 
with hash={:016x}", hash.hash); + return; + } + if (file.WriteArray(spirv.data(), spirv.size()) != spirv.size()) { + LOG_ERROR(Render_Vulkan, "Failed to write SPIR-V module with hash={:016x}", hash.hash); + return; + } +} + +void NsightAftermathTracker::OnGpuCrashDumpCallback(const void* gpu_crash_dump, + u32 gpu_crash_dump_size) { + std::scoped_lock lock{mutex}; + + LOG_CRITICAL(Render_Vulkan, "called"); + + GFSDK_Aftermath_GpuCrashDump_Decoder decoder; + if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_GpuCrashDump_CreateDecoder( + GFSDK_Aftermath_Version_API, gpu_crash_dump, gpu_crash_dump_size, &decoder))) { + LOG_ERROR(Render_Vulkan, "Failed to create decoder"); + return; + } + SCOPE_EXIT({ GFSDK_Aftermath_GpuCrashDump_DestroyDecoder(decoder); }); + + u32 json_size = 0; + if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_GpuCrashDump_GenerateJSON( + decoder, GFSDK_Aftermath_GpuCrashDumpDecoderFlags_ALL_INFO, + GFSDK_Aftermath_GpuCrashDumpFormatterFlags_NONE, nullptr, nullptr, nullptr, nullptr, + this, &json_size))) { + LOG_ERROR(Render_Vulkan, "Failed to generate JSON"); + return; + } + std::vector<char> json(json_size); + if (!GFSDK_Aftermath_SUCCEED( + GFSDK_Aftermath_GpuCrashDump_GetJSON(decoder, json_size, json.data()))) { + LOG_ERROR(Render_Vulkan, "Failed to query JSON"); + return; + } + + const std::string base_name = [this] { + const int id = dump_id++; + if (id == 0) { + return fmt::format("{}/crash.nv-gpudmp", dump_dir); + } else { + return fmt::format("{}/crash_{}.nv-gpudmp", dump_dir, id); + } + }(); + + std::string_view dump_view(static_cast<const char*>(gpu_crash_dump), gpu_crash_dump_size); + if (Common::FS::WriteStringToFile(false, base_name, dump_view) != gpu_crash_dump_size) { + LOG_ERROR(Render_Vulkan, "Failed to write dump file"); + return; + } + const std::string_view json_view(json.data(), json.size()); + if (Common::FS::WriteStringToFile(true, base_name + ".json", json_view) != json.size()) { + LOG_ERROR(Render_Vulkan, "Failed to write JSON"); + return; + } +} + +void NsightAftermathTracker::OnShaderDebugInfoCallback(const void* shader_debug_info, + u32 shader_debug_info_size) { + std::scoped_lock lock{mutex}; + + GFSDK_Aftermath_ShaderDebugInfoIdentifier identifier; + if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_GetShaderDebugInfoIdentifier( + GFSDK_Aftermath_Version_API, shader_debug_info, shader_debug_info_size, &identifier))) { + LOG_ERROR(Render_Vulkan, "GFSDK_Aftermath_GetShaderDebugInfoIdentifier failed"); + return; + } + + const std::string path = + fmt::format("{}/shader_{:016x}{:016x}.nvdbg", dump_dir, identifier.id[0], identifier.id[1]); + Common::FS::IOFile file(path, "wb"); + if (!file.IsOpen()) { + LOG_ERROR(Render_Vulkan, "Failed to create file {}", path); + return; + } + if (file.WriteBytes(static_cast<const u8*>(shader_debug_info), shader_debug_info_size) != + shader_debug_info_size) { + LOG_ERROR(Render_Vulkan, "Failed to write file {}", path); + return; + } +} + +void NsightAftermathTracker::OnCrashDumpDescriptionCallback( + PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription add_description) { + add_description(GFSDK_Aftermath_GpuCrashDumpDescriptionKey_ApplicationName, "yuzu"); +} + +void NsightAftermathTracker::GpuCrashDumpCallback(const void* gpu_crash_dump, + u32 gpu_crash_dump_size, void* user_data) { + static_cast<NsightAftermathTracker*>(user_data)->OnGpuCrashDumpCallback(gpu_crash_dump, + gpu_crash_dump_size); +} + +void NsightAftermathTracker::ShaderDebugInfoCallback(const void* shader_debug_info, + u32 shader_debug_info_size, void* user_data) { + 
static_cast<NsightAftermathTracker*>(user_data)->OnShaderDebugInfoCallback( + shader_debug_info, shader_debug_info_size); +} + +void NsightAftermathTracker::CrashDumpDescriptionCallback( + PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription add_description, void* user_data) { + static_cast<NsightAftermathTracker*>(user_data)->OnCrashDumpDescriptionCallback( + add_description); +} + +} // namespace Vulkan + +#endif // HAS_NSIGHT_AFTERMATH diff --git a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h b/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h new file mode 100644 index 000000000..afe7ae99e --- /dev/null +++ b/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h @@ -0,0 +1,87 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <mutex> +#include <string> +#include <vector> + +#define VK_NO_PROTOTYPES +#include <vulkan/vulkan.h> + +#ifdef HAS_NSIGHT_AFTERMATH +#include <GFSDK_Aftermath_Defines.h> +#include <GFSDK_Aftermath_GpuCrashDump.h> +#include <GFSDK_Aftermath_GpuCrashDumpDecoding.h> +#endif + +#include "common/common_types.h" +#include "common/dynamic_library.h" + +namespace Vulkan { + +class NsightAftermathTracker { +public: + NsightAftermathTracker(); + ~NsightAftermathTracker(); + + NsightAftermathTracker(const NsightAftermathTracker&) = delete; + NsightAftermathTracker& operator=(const NsightAftermathTracker&) = delete; + + // Delete move semantics because Aftermath initialization uses a pointer to this. + NsightAftermathTracker(NsightAftermathTracker&&) = delete; + NsightAftermathTracker& operator=(NsightAftermathTracker&&) = delete; + + bool Initialize(); + + void SaveShader(const std::vector<u32>& spirv) const; + +private: +#ifdef HAS_NSIGHT_AFTERMATH + static void GpuCrashDumpCallback(const void* gpu_crash_dump, u32 gpu_crash_dump_size, + void* user_data); + + static void ShaderDebugInfoCallback(const void* shader_debug_info, u32 shader_debug_info_size, + void* user_data); + + static void CrashDumpDescriptionCallback( + PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription add_description, void* user_data); + + void OnGpuCrashDumpCallback(const void* gpu_crash_dump, u32 gpu_crash_dump_size); + + void OnShaderDebugInfoCallback(const void* shader_debug_info, u32 shader_debug_info_size); + + void OnCrashDumpDescriptionCallback( + PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription add_description); + + mutable std::mutex mutex; + + std::string dump_dir; + int dump_id = 0; + + bool initialized = false; + + Common::DynamicLibrary dl; + PFN_GFSDK_Aftermath_DisableGpuCrashDumps GFSDK_Aftermath_DisableGpuCrashDumps; + PFN_GFSDK_Aftermath_EnableGpuCrashDumps GFSDK_Aftermath_EnableGpuCrashDumps; + PFN_GFSDK_Aftermath_GetShaderDebugInfoIdentifier GFSDK_Aftermath_GetShaderDebugInfoIdentifier; + PFN_GFSDK_Aftermath_GetShaderHashSpirv GFSDK_Aftermath_GetShaderHashSpirv; + PFN_GFSDK_Aftermath_GpuCrashDump_CreateDecoder GFSDK_Aftermath_GpuCrashDump_CreateDecoder; + PFN_GFSDK_Aftermath_GpuCrashDump_DestroyDecoder GFSDK_Aftermath_GpuCrashDump_DestroyDecoder; + PFN_GFSDK_Aftermath_GpuCrashDump_GenerateJSON GFSDK_Aftermath_GpuCrashDump_GenerateJSON; + PFN_GFSDK_Aftermath_GpuCrashDump_GetJSON GFSDK_Aftermath_GpuCrashDump_GetJSON; +#endif +}; + +#ifndef HAS_NSIGHT_AFTERMATH +inline NsightAftermathTracker::NsightAftermathTracker() = default; +inline NsightAftermathTracker::~NsightAftermathTracker() = default; +inline bool NsightAftermathTracker::Initialize() { + return false; +} 
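As the inline stubs here show, the tracker degrades to a no-op when HAS_NSIGHT_AFTERMATH is not defined, so call sites can use it unconditionally. A hypothetical caller (not taken from the yuzu sources) could look roughly like this, relying only on the Initialize()/SaveShader() interface declared above:

#include <vector>

#include "common/common_types.h"
#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h"

// Hypothetical helper: register a freshly built SPIR-V module with the crash tracker so it can be
// matched against a GPU crash dump later.
void RegisterModuleForCrashDumps(Vulkan::NsightAftermathTracker& tracker,
                                 const std::vector<u32>& spirv) {
    // SaveShader() does nothing unless Initialize() succeeded earlier, and it compiles to an empty
    // inline function when the Aftermath SDK is unavailable, so no #ifdef is needed here.
    tracker.SaveShader(spirv);
}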
+inline void NsightAftermathTracker::SaveShader(const std::vector<u32>&) const {} +#endif + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index dd590c38b..f2610868e 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -12,24 +12,22 @@ #include <fmt/format.h> -#include "common/assert.h" #include "common/dynamic_library.h" +#include "common/file_util.h" #include "common/logging/log.h" #include "common/telemetry.h" #include "core/core.h" #include "core/core_timing.h" #include "core/frontend/emu_window.h" -#include "core/memory.h" -#include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" #include "video_core/gpu.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_swapchain.h" @@ -42,7 +40,7 @@ #include <vulkan/vulkan_win32.h> #endif -#ifdef __linux__ +#if !defined(_WIN32) && !defined(__APPLE__) #include <X11/Xlib.h> #include <vulkan/vulkan_wayland.h> #include <vulkan/vulkan_xlib.h> @@ -58,7 +56,7 @@ VkBool32 DebugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, VkDebugUtilsMessageTypeFlagsEXT type, const VkDebugUtilsMessengerCallbackDataEXT* data, [[maybe_unused]] void* user_data) { - const char* message{data->pMessage}; + const char* const message{data->pMessage}; if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { LOG_CRITICAL(Render_Vulkan, "{}", message); @@ -79,7 +77,8 @@ Common::DynamicLibrary OpenVulkanLibrary() { char* libvulkan_env = getenv("LIBVULKAN_PATH"); if (!libvulkan_env || !library.Open(libvulkan_env)) { // Use the libvulkan.dylib from the application bundle. - std::string filename = File::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; + const std::string filename = + Common::FS::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; library.Open(filename.c_str()); } #else @@ -87,15 +86,15 @@ Common::DynamicLibrary OpenVulkanLibrary() { if (!library.Open(filename.c_str())) { // Android devices may not have libvulkan.so.1, only libvulkan.so. 
filename = Common::DynamicLibrary::GetVersionedFilename("vulkan"); - library.Open(filename.c_str()); + (void)library.Open(filename.c_str()); } #endif return library; } -vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatch& dld, - WindowSystemType window_type = WindowSystemType::Headless, - bool enable_layers = false) { +std::pair<vk::Instance, u32> CreateInstance( + Common::DynamicLibrary& library, vk::InstanceDispatch& dld, + WindowSystemType window_type = WindowSystemType::Headless, bool enable_layers = false) { if (!library.IsOpen()) { LOG_ERROR(Render_Vulkan, "Vulkan library not available"); return {}; @@ -119,7 +118,7 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc extensions.push_back(VK_KHR_WIN32_SURFACE_EXTENSION_NAME); break; #endif -#ifdef __linux__ +#if !defined(_WIN32) && !defined(__APPLE__) case Core::Frontend::WindowSystemType::X11: extensions.push_back(VK_KHR_XLIB_SURFACE_EXTENSION_NAME); break; @@ -156,12 +155,35 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc } } - static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"}; - vk::Span<const char*> layers = layers_data; - if (!enable_layers) { - layers = {}; + std::vector<const char*> layers; + layers.reserve(1); + if (enable_layers) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + + const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld); + if (!layer_properties) { + LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers"); + layers.clear(); + } + + for (auto layer_it = layers.begin(); layer_it != layers.end();) { + const char* const layer = *layer_it; + const auto it = std::find_if( + layer_properties->begin(), layer_properties->end(), + [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); }); + if (it == layer_properties->end()) { + LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer); + layer_it = layers.erase(layer_it); + } else { + ++layer_it; + } } - vk::Instance instance = vk::Instance::Create(layers, extensions, dld); + + // Limit the maximum version of Vulkan to avoid using an untested version. 
+ const u32 version = std::min(vk::AvailableVersion(dld), static_cast<u32>(VK_API_VERSION_1_1)); + + vk::Instance instance = vk::Instance::Create(version, layers, extensions, dld); if (!instance) { LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance"); return {}; @@ -169,7 +191,7 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc if (!vk::Load(*instance, dld)) { LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); } - return instance; + return std::make_pair(std::move(instance), version); } std::string GetReadableVersion(u32 version) { @@ -218,8 +240,12 @@ std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_ext } // Anonymous namespace -RendererVulkan::RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system) - : RendererBase(window), system{system} {} +RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, + Core::Frontend::EmuWindow& emu_window, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context) + : RendererBase{emu_window, std::move(context)}, telemetry_session{telemetry_session_}, + cpu_memory{cpu_memory_}, gpu{gpu_} {} RendererVulkan::~RendererVulkan() { ShutDown(); @@ -246,11 +272,11 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { scheduler->WaitWorker(); swapchain->AcquireNextImage(); - const auto [fence, render_semaphore] = blit_screen->Draw(*framebuffer, use_accelerated); + const VkSemaphore render_semaphore = blit_screen->Draw(*framebuffer, use_accelerated); - scheduler->Flush(false, render_semaphore); + scheduler->Flush(render_semaphore); - if (swapchain->Present(render_semaphore, fence)) { + if (swapchain->Present(render_semaphore)) { blit_screen->Recreate(); } @@ -260,15 +286,10 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { render_window.PollEvents(); } -bool RendererVulkan::TryPresent(int /*timeout_ms*/) { - // TODO (bunnei): ImplementMe - return true; -} - bool RendererVulkan::Init() { library = OpenVulkanLibrary(); - instance = CreateInstance(library, dld, render_window.GetWindowInfo().type, - Settings::values.renderer_debug); + std::tie(instance, instance_version) = CreateInstance( + library, dld, render_window.GetWindowInfo().type, Settings::values.renderer_debug); if (!instance || !CreateDebugCallback() || !CreateSurface() || !PickDevices()) { return false; } @@ -277,23 +298,21 @@ bool RendererVulkan::Init() { memory_manager = std::make_unique<VKMemoryManager>(*device); - resource_manager = std::make_unique<VKResourceManager>(*device); + state_tracker = std::make_unique<StateTracker>(gpu); + + scheduler = std::make_unique<VKScheduler>(*device, *state_tracker); const auto& framebuffer = render_window.GetFramebufferLayout(); - swapchain = std::make_unique<VKSwapchain>(*surface, *device); + swapchain = std::make_unique<VKSwapchain>(*surface, *device, *scheduler); swapchain->Create(framebuffer.width, framebuffer.height, false); - state_tracker = std::make_unique<StateTracker>(system); - - scheduler = std::make_unique<VKScheduler>(*device, *resource_manager, *state_tracker); - - rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device, - *resource_manager, *memory_manager, - *state_tracker, *scheduler); + rasterizer = std::make_unique<RasterizerVulkan>(render_window, gpu, gpu.MemoryManager(), + cpu_memory, screen_info, *device, + *memory_manager, *state_tracker, *scheduler); - 
blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device, - *resource_manager, *memory_manager, *swapchain, - *scheduler, screen_info); + blit_screen = + std::make_unique<VKBlitScreen>(cpu_memory, render_window, *rasterizer, *device, + *memory_manager, *swapchain, *scheduler, screen_info); return true; } @@ -311,7 +330,6 @@ void RendererVulkan::ShutDown() { scheduler.reset(); swapchain.reset(); memory_manager.reset(); - resource_manager.reset(); device.reset(); } @@ -345,7 +363,7 @@ bool RendererVulkan::CreateSurface() { } } #endif -#ifdef __linux__ +#if !defined(_WIN32) && !defined(__APPLE__) if (window_info.type == Core::Frontend::WindowSystemType::X11) { const VkXlibSurfaceCreateInfoKHR xlib_ci{ VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, nullptr, 0, @@ -390,7 +408,7 @@ bool RendererVulkan::PickDevices() { return false; } - const s32 device_index = Settings::values.vulkan_device; + const s32 device_index = Settings::values.vulkan_device.GetValue(); if (device_index < 0 || device_index >= static_cast<s32>(devices->size())) { LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); return false; @@ -401,7 +419,8 @@ bool RendererVulkan::PickDevices() { return false; } - device = std::make_unique<VKDevice>(*instance, physical_device, *surface, dld); + device = + std::make_unique<VKDevice>(*instance, instance_version, physical_device, *surface, dld); return device->Create(); } @@ -411,7 +430,7 @@ void RendererVulkan::Report() const { const std::string driver_version = GetDriverVersion(*device); const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); - const std::string api_version = GetReadableVersion(device->GetApiVersion()); + const std::string api_version = GetReadableVersion(device->ApiVersion()); const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); @@ -419,8 +438,7 @@ void RendererVulkan::Report() const { LOG_INFO(Render_Vulkan, "Device: {}", model_name); LOG_INFO(Render_Vulkan, "Vulkan: {}", api_version); - auto& telemetry_session = system.TelemetrySession(); - constexpr auto field = Telemetry::FieldType::UserSystem; + static constexpr auto field = Common::Telemetry::FieldType::UserSystem; telemetry_session.AddField(field, "GPU_Vendor", vendor_name); telemetry_session.AddField(field, "GPU_Model", model_name); telemetry_session.AddField(field, "GPU_Vulkan_Driver", driver_name); @@ -431,7 +449,7 @@ void RendererVulkan::Report() const { std::vector<std::string> RendererVulkan::EnumerateDevices() { vk::InstanceDispatch dld; Common::DynamicLibrary library = OpenVulkanLibrary(); - vk::Instance instance = CreateInstance(library, dld); + vk::Instance instance = CreateInstance(library, dld).first; if (!instance) { return {}; } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 18270909b..1044ca124 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -5,7 +5,6 @@ #pragma once #include <memory> -#include <optional> #include <string> #include <vector> @@ -15,7 +14,15 @@ #include "video_core/renderer_vulkan/wrapper.h" namespace Core { -class System; +class TelemetrySession; +} + +namespace Core::Memory { +class Memory; +} + +namespace Tegra { +class GPU; } namespace Vulkan { @@ -23,9 +30,7 @@ namespace Vulkan { class StateTracker; class VKBlitScreen; class VKDevice; -class VKFence; class VKMemoryManager; -class VKResourceManager; class VKSwapchain; 
class VKScheduler; class VKImage; @@ -39,13 +44,15 @@ struct VKScreenInfo { class RendererVulkan final : public VideoCore::RendererBase { public: - explicit RendererVulkan(Core::Frontend::EmuWindow& window, Core::System& system); + explicit RendererVulkan(Core::TelemetrySession& telemetry_session, + Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory, + Tegra::GPU& gpu, + std::unique_ptr<Core::Frontend::GraphicsContext> context); ~RendererVulkan() override; bool Init() override; void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - bool TryPresent(int timeout_ms) override; static std::vector<std::string> EnumerateDevices(); @@ -58,23 +65,26 @@ private: void Report() const; - Core::System& system; + Core::TelemetrySession& telemetry_session; + Core::Memory::Memory& cpu_memory; + Tegra::GPU& gpu; Common::DynamicLibrary library; vk::InstanceDispatch dld; vk::Instance instance; + u32 instance_version{}; + vk::SurfaceKHR surface; VKScreenInfo screen_info; vk::DebugCallback debug_callback; std::unique_ptr<VKDevice> device; - std::unique_ptr<VKSwapchain> swapchain; std::unique_ptr<VKMemoryManager> memory_manager; - std::unique_ptr<VKResourceManager> resource_manager; std::unique_ptr<StateTracker> state_tracker; std::unique_ptr<VKScheduler> scheduler; + std::unique_ptr<VKSwapchain> swapchain; std::unique_ptr<VKBlitScreen> blit_screen; }; diff --git a/src/video_core/renderer_vulkan/shaders/quad_indexed.comp b/src/video_core/renderer_vulkan/shaders/quad_indexed.comp new file mode 100644 index 000000000..5a472ba9b --- /dev/null +++ b/src/video_core/renderer_vulkan/shaders/quad_indexed.comp @@ -0,0 +1,50 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +/* + * Build instructions: + * $ glslangValidator -V quad_indexed.comp -o output.spv + * $ spirv-opt -O --strip-debug output.spv -o optimized.spv + * $ xxd -i optimized.spv + * + * Then copy that bytecode to the C++ file + */ + +#version 460 core + +layout (local_size_x = 1024) in; + +layout (std430, set = 0, binding = 0) readonly buffer InputBuffer { + uint input_indexes[]; +}; + +layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer { + uint output_indexes[]; +}; + +layout (push_constant) uniform PushConstants { + uint base_vertex; + int index_shift; // 0: uint8, 1: uint16, 2: uint32 +}; + +void main() { + int primitive = int(gl_GlobalInvocationID.x); + if (primitive * 6 >= output_indexes.length()) { + return; + } + + int index_size = 8 << index_shift; + int flipped_shift = 2 - index_shift; + int mask = (1 << flipped_shift) - 1; + + const int quad_swizzle[6] = int[](0, 1, 2, 0, 2, 3); + for (uint vertex = 0; vertex < 6; ++vertex) { + int offset = primitive * 4 + quad_swizzle[vertex]; + int int_offset = offset >> flipped_shift; + int bit_offset = (offset & mask) * index_size; + uint packed_input = input_indexes[int_offset]; + uint index = bitfieldExtract(packed_input, bit_offset, index_size); + output_indexes[primitive * 6 + vertex] = index + base_vertex; + } +} diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index fbd406f2b..b5b60309e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -12,11 +12,9 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/math_util.h" - #include "core/core.h" #include "core/frontend/emu_window.h" #include "core/memory.h" - #include "video_core/gpu.h" #include "video_core/morton.h" #include "video_core/rasterizer_interface.h" @@ -24,8 +22,8 @@ #include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_image.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/renderer_vulkan/vk_swapchain.h" @@ -141,24 +139,28 @@ struct ScreenRectVertex { std::array<f32, 2> tex_coord; static VkVertexInputBindingDescription GetDescription() { - VkVertexInputBindingDescription description; - description.binding = 0; - description.stride = sizeof(ScreenRectVertex); - description.inputRate = VK_VERTEX_INPUT_RATE_VERTEX; - return description; + return { + .binding = 0, + .stride = sizeof(ScreenRectVertex), + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }; } static std::array<VkVertexInputAttributeDescription, 2> GetAttributes() { - std::array<VkVertexInputAttributeDescription, 2> attributes; - attributes[0].location = 0; - attributes[0].binding = 0; - attributes[0].format = VK_FORMAT_R32G32_SFLOAT; - attributes[0].offset = offsetof(ScreenRectVertex, position); - attributes[1].location = 1; - attributes[1].binding = 0; - attributes[1].format = VK_FORMAT_R32G32_SFLOAT; - attributes[1].offset = offsetof(ScreenRectVertex, tex_coord); - return attributes; + return {{ + { + .location = 0, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + .offset = offsetof(ScreenRectVertex, position), + }, + { + .location = 1, + .binding = 0, + .format = VK_FORMAT_R32G32_SFLOAT, + 
.offset = offsetof(ScreenRectVertex, tex_coord), + }, + }}; } }; @@ -183,9 +185,9 @@ std::size_t GetSizeInBytes(const Tegra::FramebufferConfig& framebuffer) { VkFormat GetFormat(const Tegra::FramebufferConfig& framebuffer) { switch (framebuffer.pixel_format) { - case Tegra::FramebufferConfig::PixelFormat::ABGR8: + case Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM: return VK_FORMAT_A8B8G8R8_UNORM_PACK32; - case Tegra::FramebufferConfig::PixelFormat::RGB565: + case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM: return VK_FORMAT_R5G6B5_UNORM_PACK16; default: UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}", @@ -206,17 +208,15 @@ struct VKBlitScreen::BufferData { // Unaligned image data goes here }; -VKBlitScreen::VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, - VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKSwapchain& swapchain, VKScheduler& scheduler, - const VKScreenInfo& screen_info) - : system{system}, render_window{render_window}, rasterizer{rasterizer}, device{device}, - resource_manager{resource_manager}, memory_manager{memory_manager}, swapchain{swapchain}, - scheduler{scheduler}, image_count{swapchain.GetImageCount()}, screen_info{screen_info} { - watches.resize(image_count); - std::generate(watches.begin(), watches.end(), - []() { return std::make_unique<VKFenceWatch>(); }); +VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_, + Core::Frontend::EmuWindow& render_window_, + VideoCore::RasterizerInterface& rasterizer_, const VKDevice& device_, + VKMemoryManager& memory_manager_, VKSwapchain& swapchain_, + VKScheduler& scheduler_, const VKScreenInfo& screen_info_) + : cpu_memory{cpu_memory_}, render_window{render_window_}, rasterizer{rasterizer_}, + device{device_}, memory_manager{memory_manager_}, swapchain{swapchain_}, + scheduler{scheduler_}, image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { + resource_ticks.resize(image_count); CreateStaticResources(); CreateDynamicResources(); @@ -228,15 +228,16 @@ void VKBlitScreen::Recreate() { CreateDynamicResources(); } -std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated) { +VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool use_accelerated) { RefreshResources(framebuffer); // Finish any pending renderpass scheduler.RequestOutsideRenderPassOperationContext(); const std::size_t image_index = swapchain.GetImageIndex(); - watches[image_index]->Watch(scheduler.GetFence()); + + scheduler.Wait(resource_ticks[image_index]); + resource_ticks[image_index] = scheduler.CurrentTick(); VKImage* blit_image = use_accelerated ? 
screen_info.image : raw_images[image_index].get(); @@ -255,7 +256,7 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon const auto pixel_format = VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format); const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; - const auto host_ptr = system.Memory().GetPointer(framebuffer_addr); + const auto host_ptr = cpu_memory.GetPointer(framebuffer_addr); rasterizer.FlushRegion(ToCacheAddr(host_ptr), GetSizeInBytes(framebuffer)); // TODO(Rodrigo): Read this from HLE @@ -267,20 +268,25 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon blit_image->Transition(0, 1, 0, 1, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - VkBufferImageCopy copy; - copy.bufferOffset = image_offset; - copy.bufferRowLength = 0; - copy.bufferImageHeight = 0; - copy.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - copy.imageSubresource.mipLevel = 0; - copy.imageSubresource.baseArrayLayer = 0; - copy.imageSubresource.layerCount = 1; - copy.imageOffset.x = 0; - copy.imageOffset.y = 0; - copy.imageOffset.z = 0; - copy.imageExtent.width = framebuffer.width; - copy.imageExtent.height = framebuffer.height; - copy.imageExtent.depth = 1; + const VkBufferImageCopy copy{ + .bufferOffset = image_offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .imageOffset = {.x = 0, .y = 0, .z = 0}, + .imageExtent = + { + .width = framebuffer.width, + .height = framebuffer.height, + .depth = 1, + }, + }; scheduler.Record( [buffer = *buffer, image = *blit_image->GetHandle(), copy](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); @@ -295,11 +301,9 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon descriptor_set = descriptor_sets[image_index], buffer = *buffer, size = swapchain.GetSize(), pipeline = *pipeline, layout = *pipeline_layout](vk::CommandBuffer cmdbuf) { - VkClearValue clear_color; - clear_color.color.float32[0] = 0.0f; - clear_color.color.float32[1] = 0.0f; - clear_color.color.float32[2] = 0.0f; - clear_color.color.float32[3] = 0.0f; + const VkClearValue clear_color{ + .color = {.float32 = {0.0f, 0.0f, 0.0f, 0.0f}}, + }; VkRenderPassBeginInfo renderpass_bi; renderpass_bi.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; @@ -336,7 +340,7 @@ std::tuple<VKFence&, VkSemaphore> VKBlitScreen::Draw(const Tegra::FramebufferCon cmdbuf.EndRenderPass(); }); - return {scheduler.GetFence(), *semaphores[image_index]}; + return *semaphores[image_index]; } void VKBlitScreen::CreateStaticResources() { @@ -379,93 +383,109 @@ void VKBlitScreen::CreateSemaphores() { } void VKBlitScreen::CreateDescriptorPool() { - std::array<VkDescriptorPoolSize, 2> pool_sizes; - pool_sizes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - pool_sizes[0].descriptorCount = static_cast<u32>(image_count); - pool_sizes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - pool_sizes[1].descriptorCount = static_cast<u32>(image_count); - - VkDescriptorPoolCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; - ci.maxSets = static_cast<u32>(image_count); - ci.poolSizeCount = static_cast<u32>(pool_sizes.size()); - ci.pPoolSizes = 
pool_sizes.data(); + const std::array<VkDescriptorPoolSize, 2> pool_sizes{{ + { + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = static_cast<u32>(image_count), + }, + { + .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = static_cast<u32>(image_count), + }, + }}; + + const VkDescriptorPoolCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = static_cast<u32>(image_count), + .poolSizeCount = static_cast<u32>(pool_sizes.size()), + .pPoolSizes = pool_sizes.data(), + }; descriptor_pool = device.GetLogical().CreateDescriptorPool(ci); } void VKBlitScreen::CreateRenderPass() { - VkAttachmentDescription color_attachment; - color_attachment.flags = 0; - color_attachment.format = swapchain.GetImageFormat(); - color_attachment.samples = VK_SAMPLE_COUNT_1_BIT; - color_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; - color_attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - color_attachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - color_attachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - color_attachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - color_attachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - - VkAttachmentReference color_attachment_ref; - color_attachment_ref.attachment = 0; - color_attachment_ref.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - - VkSubpassDescription subpass_description; - subpass_description.flags = 0; - subpass_description.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass_description.inputAttachmentCount = 0; - subpass_description.pInputAttachments = nullptr; - subpass_description.colorAttachmentCount = 1; - subpass_description.pColorAttachments = &color_attachment_ref; - subpass_description.pResolveAttachments = nullptr; - subpass_description.pDepthStencilAttachment = nullptr; - subpass_description.preserveAttachmentCount = 0; - subpass_description.pPreserveAttachments = nullptr; - - VkSubpassDependency dependency; - dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - dependency.dstSubpass = 0; - dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dependency.srcAccessMask = 0; - dependency.dstAccessMask = - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - dependency.dependencyFlags = 0; - - VkRenderPassCreateInfo renderpass_ci; - renderpass_ci.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - renderpass_ci.pNext = nullptr; - renderpass_ci.flags = 0; - renderpass_ci.attachmentCount = 1; - renderpass_ci.pAttachments = &color_attachment; - renderpass_ci.subpassCount = 1; - renderpass_ci.pSubpasses = &subpass_description; - renderpass_ci.dependencyCount = 1; - renderpass_ci.pDependencies = &dependency; + const VkAttachmentDescription color_attachment{ + .flags = 0, + .format = swapchain.GetImageFormat(), + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + }; + + const VkAttachmentReference color_attachment_ref{ + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }; + + const VkSubpassDescription subpass_description{ + .flags = 0, + .pipelineBindPoint = 
VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .pInputAttachments = nullptr, + .colorAttachmentCount = 1, + .pColorAttachments = &color_attachment_ref, + .pResolveAttachments = nullptr, + .pDepthStencilAttachment = nullptr, + .preserveAttachmentCount = 0, + .pPreserveAttachments = nullptr, + }; + + const VkSubpassDependency dependency{ + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + .dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + .dependencyFlags = 0, + }; + + const VkRenderPassCreateInfo renderpass_ci{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .attachmentCount = 1, + .pAttachments = &color_attachment, + .subpassCount = 1, + .pSubpasses = &subpass_description, + .dependencyCount = 1, + .pDependencies = &dependency, + }; renderpass = device.GetLogical().CreateRenderPass(renderpass_ci); } void VKBlitScreen::CreateDescriptorSetLayout() { - std::array<VkDescriptorSetLayoutBinding, 2> layout_bindings; - layout_bindings[0].binding = 0; - layout_bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - layout_bindings[0].descriptorCount = 1; - layout_bindings[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - layout_bindings[0].pImmutableSamplers = nullptr; - layout_bindings[1].binding = 1; - layout_bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - layout_bindings[1].descriptorCount = 1; - layout_bindings[1].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; - layout_bindings[1].pImmutableSamplers = nullptr; - - VkDescriptorSetLayoutCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.bindingCount = static_cast<u32>(layout_bindings.size()); - ci.pBindings = layout_bindings.data(); + const std::array<VkDescriptorSetLayoutBinding, 2> layout_bindings{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_VERTEX_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT, + .pImmutableSamplers = nullptr, + }, + }}; + + const VkDescriptorSetLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast<u32>(layout_bindings.size()), + .pBindings = layout_bindings.data(), + }; descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(ci); } @@ -473,175 +493,192 @@ void VKBlitScreen::CreateDescriptorSetLayout() { void VKBlitScreen::CreateDescriptorSets() { const std::vector layouts(image_count, *descriptor_set_layout); - VkDescriptorSetAllocateInfo ai; - ai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - ai.pNext = nullptr; - ai.descriptorPool = *descriptor_pool; - ai.descriptorSetCount = static_cast<u32>(image_count); - ai.pSetLayouts = layouts.data(); + const VkDescriptorSetAllocateInfo ai{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .pNext = nullptr, + .descriptorPool = *descriptor_pool, + .descriptorSetCount = static_cast<u32>(image_count), + .pSetLayouts = layouts.data(), + }; + descriptor_sets = descriptor_pool.Allocate(ai); } void VKBlitScreen::CreatePipelineLayout() { - VkPipelineLayoutCreateInfo ci; - 
ci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.setLayoutCount = 1; - ci.pSetLayouts = descriptor_set_layout.address(); - ci.pushConstantRangeCount = 0; - ci.pPushConstantRanges = nullptr; + const VkPipelineLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }; pipeline_layout = device.GetLogical().CreatePipelineLayout(ci); } void VKBlitScreen::CreateGraphicsPipeline() { - std::array<VkPipelineShaderStageCreateInfo, 2> shader_stages; - shader_stages[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stages[0].pNext = nullptr; - shader_stages[0].flags = 0; - shader_stages[0].stage = VK_SHADER_STAGE_VERTEX_BIT; - shader_stages[0].module = *vertex_shader; - shader_stages[0].pName = "main"; - shader_stages[0].pSpecializationInfo = nullptr; - shader_stages[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shader_stages[1].pNext = nullptr; - shader_stages[1].flags = 0; - shader_stages[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; - shader_stages[1].module = *fragment_shader; - shader_stages[1].pName = "main"; - shader_stages[1].pSpecializationInfo = nullptr; + const std::array<VkPipelineShaderStageCreateInfo, 2> shader_stages{{ + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = *vertex_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = *fragment_shader, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + }}; const auto vertex_binding_description = ScreenRectVertex::GetDescription(); const auto vertex_attrs_description = ScreenRectVertex::GetAttributes(); - VkPipelineVertexInputStateCreateInfo vertex_input_ci; - vertex_input_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertex_input_ci.pNext = nullptr; - vertex_input_ci.flags = 0; - vertex_input_ci.vertexBindingDescriptionCount = 1; - vertex_input_ci.pVertexBindingDescriptions = &vertex_binding_description; - vertex_input_ci.vertexAttributeDescriptionCount = u32{vertex_attrs_description.size()}; - vertex_input_ci.pVertexAttributeDescriptions = vertex_attrs_description.data(); - - VkPipelineInputAssemblyStateCreateInfo input_assembly_ci; - input_assembly_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - input_assembly_ci.pNext = nullptr; - input_assembly_ci.flags = 0; - input_assembly_ci.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP; - input_assembly_ci.primitiveRestartEnable = VK_FALSE; - - VkPipelineViewportStateCreateInfo viewport_state_ci; - viewport_state_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewport_state_ci.pNext = nullptr; - viewport_state_ci.flags = 0; - viewport_state_ci.viewportCount = 1; - viewport_state_ci.pViewports = nullptr; - viewport_state_ci.scissorCount = 1; - viewport_state_ci.pScissors = nullptr; - - VkPipelineRasterizationStateCreateInfo rasterization_ci; - rasterization_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterization_ci.pNext = nullptr; - rasterization_ci.flags = 0; - rasterization_ci.depthClampEnable = VK_FALSE; - 
rasterization_ci.rasterizerDiscardEnable = VK_FALSE; - rasterization_ci.polygonMode = VK_POLYGON_MODE_FILL; - rasterization_ci.cullMode = VK_CULL_MODE_NONE; - rasterization_ci.frontFace = VK_FRONT_FACE_CLOCKWISE; - rasterization_ci.depthBiasEnable = VK_FALSE; - rasterization_ci.depthBiasConstantFactor = 0.0f; - rasterization_ci.depthBiasClamp = 0.0f; - rasterization_ci.depthBiasSlopeFactor = 0.0f; - rasterization_ci.lineWidth = 1.0f; - - VkPipelineMultisampleStateCreateInfo multisampling_ci; - multisampling_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisampling_ci.pNext = nullptr; - multisampling_ci.flags = 0; - multisampling_ci.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisampling_ci.sampleShadingEnable = VK_FALSE; - multisampling_ci.minSampleShading = 0.0f; - multisampling_ci.pSampleMask = nullptr; - multisampling_ci.alphaToCoverageEnable = VK_FALSE; - multisampling_ci.alphaToOneEnable = VK_FALSE; - - VkPipelineColorBlendAttachmentState color_blend_attachment; - color_blend_attachment.blendEnable = VK_FALSE; - color_blend_attachment.srcColorBlendFactor = VK_BLEND_FACTOR_ZERO; - color_blend_attachment.dstColorBlendFactor = VK_BLEND_FACTOR_ZERO; - color_blend_attachment.colorBlendOp = VK_BLEND_OP_ADD; - color_blend_attachment.srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; - color_blend_attachment.dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; - color_blend_attachment.alphaBlendOp = VK_BLEND_OP_ADD; - color_blend_attachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | - VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; - - VkPipelineColorBlendStateCreateInfo color_blend_ci; - color_blend_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - color_blend_ci.flags = 0; - color_blend_ci.pNext = nullptr; - color_blend_ci.logicOpEnable = VK_FALSE; - color_blend_ci.logicOp = VK_LOGIC_OP_COPY; - color_blend_ci.attachmentCount = 1; - color_blend_ci.pAttachments = &color_blend_attachment; - color_blend_ci.blendConstants[0] = 0.0f; - color_blend_ci.blendConstants[1] = 0.0f; - color_blend_ci.blendConstants[2] = 0.0f; - color_blend_ci.blendConstants[3] = 0.0f; - - static constexpr std::array dynamic_states = {VK_DYNAMIC_STATE_VIEWPORT, - VK_DYNAMIC_STATE_SCISSOR}; - VkPipelineDynamicStateCreateInfo dynamic_state_ci; - dynamic_state_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; - dynamic_state_ci.pNext = nullptr; - dynamic_state_ci.flags = 0; - dynamic_state_ci.dynamicStateCount = static_cast<u32>(dynamic_states.size()); - dynamic_state_ci.pDynamicStates = dynamic_states.data(); - - VkGraphicsPipelineCreateInfo pipeline_ci; - pipeline_ci.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - pipeline_ci.pNext = nullptr; - pipeline_ci.flags = 0; - pipeline_ci.stageCount = static_cast<u32>(shader_stages.size()); - pipeline_ci.pStages = shader_stages.data(); - pipeline_ci.pVertexInputState = &vertex_input_ci; - pipeline_ci.pInputAssemblyState = &input_assembly_ci; - pipeline_ci.pTessellationState = nullptr; - pipeline_ci.pViewportState = &viewport_state_ci; - pipeline_ci.pRasterizationState = &rasterization_ci; - pipeline_ci.pMultisampleState = &multisampling_ci; - pipeline_ci.pDepthStencilState = nullptr; - pipeline_ci.pColorBlendState = &color_blend_ci; - pipeline_ci.pDynamicState = &dynamic_state_ci; - pipeline_ci.layout = *pipeline_layout; - pipeline_ci.renderPass = *renderpass; - pipeline_ci.subpass = 0; - pipeline_ci.basePipelineHandle = 0; - pipeline_ci.basePipelineIndex = 0; + const 
VkPipelineVertexInputStateCreateInfo vertex_input_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &vertex_binding_description, + .vertexAttributeDescriptionCount = u32{vertex_attrs_description.size()}, + .pVertexAttributeDescriptions = vertex_attrs_description.data(), + }; + + const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + .primitiveRestartEnable = VK_FALSE, + }; + + const VkPipelineViewportStateCreateInfo viewport_state_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .viewportCount = 1, + .pViewports = nullptr, + .scissorCount = 1, + .pScissors = nullptr, + }; + + const VkPipelineRasterizationStateCreateInfo rasterization_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .frontFace = VK_FRONT_FACE_CLOCKWISE, + .depthBiasEnable = VK_FALSE, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f, + }; + + const VkPipelineMultisampleStateCreateInfo multisampling_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 0.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_FALSE, + .alphaToOneEnable = VK_FALSE, + }; + + const VkPipelineColorBlendAttachmentState color_blend_attachment{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT, + }; + + const VkPipelineColorBlendStateCreateInfo color_blend_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .logicOpEnable = VK_FALSE, + .logicOp = VK_LOGIC_OP_COPY, + .attachmentCount = 1, + .pAttachments = &color_blend_attachment, + .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}, + }; + + static constexpr std::array dynamic_states{ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }; + const VkPipelineDynamicStateCreateInfo dynamic_state_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .dynamicStateCount = static_cast<u32>(dynamic_states.size()), + .pDynamicStates = dynamic_states.data(), + }; + + const VkGraphicsPipelineCreateInfo pipeline_ci{ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast<u32>(shader_stages.size()), + .pStages = shader_stages.data(), + .pVertexInputState = &vertex_input_ci, + .pInputAssemblyState = &input_assembly_ci, + .pTessellationState = nullptr, + .pViewportState = &viewport_state_ci, + .pRasterizationState = &rasterization_ci, + .pMultisampleState = &multisampling_ci, + .pDepthStencilState = nullptr, + 
.pColorBlendState = &color_blend_ci, + .pDynamicState = &dynamic_state_ci, + .layout = *pipeline_layout, + .renderPass = *renderpass, + .subpass = 0, + .basePipelineHandle = 0, + .basePipelineIndex = 0, + }; pipeline = device.GetLogical().CreateGraphicsPipeline(pipeline_ci); } void VKBlitScreen::CreateSampler() { - VkSamplerCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.magFilter = VK_FILTER_LINEAR; - ci.minFilter = VK_FILTER_NEAREST; - ci.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; - ci.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - ci.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - ci.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - ci.mipLodBias = 0.0f; - ci.anisotropyEnable = VK_FALSE; - ci.maxAnisotropy = 0.0f; - ci.compareEnable = VK_FALSE; - ci.compareOp = VK_COMPARE_OP_NEVER; - ci.minLod = 0.0f; - ci.maxLod = 0.0f; - ci.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; - ci.unnormalizedCoordinates = VK_FALSE; + const VkSamplerCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .magFilter = VK_FILTER_LINEAR, + .minFilter = VK_FILTER_NEAREST, + .mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR, + .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER, + .mipLodBias = 0.0f, + .anisotropyEnable = VK_FALSE, + .maxAnisotropy = 0.0f, + .compareEnable = VK_FALSE, + .compareOp = VK_COMPARE_OP_NEVER, + .minLod = 0.0f, + .maxLod = 0.0f, + .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK, + .unnormalizedCoordinates = VK_FALSE, + }; sampler = device.GetLogical().CreateSampler(ci); } @@ -650,15 +687,17 @@ void VKBlitScreen::CreateFramebuffers() { const VkExtent2D size{swapchain.GetSize()}; framebuffers.resize(image_count); - VkFramebufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.renderPass = *renderpass; - ci.attachmentCount = 1; - ci.width = size.width; - ci.height = size.height; - ci.layers = 1; + VkFramebufferCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .renderPass = *renderpass, + .attachmentCount = 1, + .pAttachments = nullptr, + .width = size.width, + .height = size.height, + .layers = 1, + }; for (std::size_t i = 0; i < image_count; ++i) { const VkImageView image_view{swapchain.GetImageViewIndex(i)}; @@ -669,7 +708,7 @@ void VKBlitScreen::CreateFramebuffers() { void VKBlitScreen::ReleaseRawImages() { for (std::size_t i = 0; i < raw_images.size(); ++i) { - watches[i]->Wait(); + scheduler.Wait(resource_ticks.at(i)); } raw_images.clear(); raw_buffer_commits.clear(); @@ -678,16 +717,17 @@ void VKBlitScreen::ReleaseRawImages() { } void VKBlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer) { - VkBufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.size = CalculateBufferSize(framebuffer); - ci.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; + const VkBufferCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = CalculateBufferSize(framebuffer), + .usage = 
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; buffer = device.GetLogical().CreateBuffer(ci); buffer_commit = memory_manager.Commit(buffer, true); @@ -697,24 +737,28 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) raw_images.resize(image_count); raw_buffer_commits.resize(image_count); - VkImageCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.imageType = VK_IMAGE_TYPE_2D; - ci.format = GetFormat(framebuffer); - ci.extent.width = framebuffer.width; - ci.extent.height = framebuffer.height; - ci.extent.depth = 1; - ci.mipLevels = 1; - ci.arrayLayers = 1; - ci.samples = VK_SAMPLE_COUNT_1_BIT; - ci.tiling = VK_IMAGE_TILING_LINEAR; - ci.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; - ci.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + const VkImageCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .imageType = VK_IMAGE_TYPE_2D, + .format = GetFormat(framebuffer), + .extent = + { + .width = framebuffer.width, + .height = framebuffer.height, + .depth = 1, + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; for (std::size_t i = 0; i < image_count; ++i) { raw_images[i] = std::make_unique<VKImage>(device, scheduler, ci, VK_IMAGE_ASPECT_COLOR_BIT); @@ -723,39 +767,43 @@ void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) } void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const { - VkDescriptorBufferInfo buffer_info; - buffer_info.buffer = *buffer; - buffer_info.offset = offsetof(BufferData, uniform); - buffer_info.range = sizeof(BufferData::uniform); - - VkWriteDescriptorSet ubo_write; - ubo_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - ubo_write.pNext = nullptr; - ubo_write.dstSet = descriptor_sets[image_index]; - ubo_write.dstBinding = 0; - ubo_write.dstArrayElement = 0; - ubo_write.descriptorCount = 1; - ubo_write.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - ubo_write.pImageInfo = nullptr; - ubo_write.pBufferInfo = &buffer_info; - ubo_write.pTexelBufferView = nullptr; - - VkDescriptorImageInfo image_info; - image_info.sampler = *sampler; - image_info.imageView = image_view; - image_info.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - - VkWriteDescriptorSet sampler_write; - sampler_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - sampler_write.pNext = nullptr; - sampler_write.dstSet = descriptor_sets[image_index]; - sampler_write.dstBinding = 1; - sampler_write.dstArrayElement = 0; - sampler_write.descriptorCount = 1; - sampler_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - sampler_write.pImageInfo = &image_info; - sampler_write.pBufferInfo = nullptr; - sampler_write.pTexelBufferView = nullptr; + const VkDescriptorBufferInfo buffer_info{ + .buffer = *buffer, + .offset = offsetof(BufferData, uniform), + .range = 
sizeof(BufferData::uniform), + }; + + const VkWriteDescriptorSet ubo_write{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_sets[image_index], + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .pImageInfo = nullptr, + .pBufferInfo = &buffer_info, + .pTexelBufferView = nullptr, + }; + + const VkDescriptorImageInfo image_info{ + .sampler = *sampler, + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + }; + + const VkWriteDescriptorSet sampler_write{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = nullptr, + .dstSet = descriptor_sets[image_index], + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + .pImageInfo = &image_info, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; device.GetLogical().UpdateDescriptorSets(std::array{ubo_write, sampler_write}, {}); } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 5eb544aea..8f2839214 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -4,18 +4,19 @@ #pragma once -#include <array> #include <memory> -#include <tuple> #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Core { class System; } +namespace Core::Memory { +class Memory; +} + namespace Core::Frontend { class EmuWindow; } @@ -31,26 +32,26 @@ class RasterizerInterface; namespace Vulkan { struct ScreenInfo; + class RasterizerVulkan; class VKDevice; -class VKFence; class VKImage; class VKScheduler; class VKSwapchain; class VKBlitScreen final { public: - explicit VKBlitScreen(Core::System& system, Core::Frontend::EmuWindow& render_window, + explicit VKBlitScreen(Core::Memory::Memory& cpu_memory, + Core::Frontend::EmuWindow& render_window, VideoCore::RasterizerInterface& rasterizer, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - VKSwapchain& swapchain, VKScheduler& scheduler, - const VKScreenInfo& screen_info); + VKMemoryManager& memory_manager, VKSwapchain& swapchain, + VKScheduler& scheduler, const VKScreenInfo& screen_info); ~VKBlitScreen(); void Recreate(); - std::tuple<VKFence&, VkSemaphore> Draw(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated); + [[nodiscard]] VkSemaphore Draw(const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated); private: struct BufferData; @@ -82,11 +83,10 @@ private: u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, std::size_t image_index) const; - Core::System& system; + Core::Memory::Memory& cpu_memory; Core::Frontend::EmuWindow& render_window; VideoCore::RasterizerInterface& rasterizer; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; VKSwapchain& swapchain; VKScheduler& scheduler; @@ -107,7 +107,7 @@ private: vk::Buffer buffer; VKMemoryCommit buffer_commit; - std::vector<std::unique_ptr<VKFenceWatch>> watches; + std::vector<u64> resource_ticks; std::vector<vk::Semaphore> semaphores; std::vector<std::unique_ptr<VKImage>> raw_images; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 0d167afbd..d9d3da9ea 100644 --- 
a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -5,12 +5,9 @@ #include <algorithm> #include <cstring> #include <memory> -#include <optional> -#include <tuple> -#include "common/assert.h" -#include "common/bit_util.h" #include "core/core.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -40,112 +37,88 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch } // Anonymous namespace -CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, - VAddr cpu_addr, std::size_t size) - : VideoCommon::BufferBlock{cpu_addr, size} { - VkBufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.size = static_cast<VkDeviceSize>(size); - ci.usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; +Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_, + VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size) + : BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} { + const VkBufferCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = static_cast<VkDeviceSize>(size), + .usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; buffer.handle = device.GetLogical().CreateBuffer(ci); buffer.commit = memory_manager.Commit(buffer.handle, false); } -CachedBufferBlock::~CachedBufferBlock() = default; +Buffer::~Buffer() = default; -VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, - const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool) - : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system, - CreateStreamBuffer(device, - scheduler)}, - device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{ - staging_pool} {} - -VKBufferCache::~VKBufferCache() = default; - -Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size); -} - -const VkBuffer* VKBufferCache::ToHandle(const Buffer& buffer) { - return buffer->GetHandle(); -} - -const VkBuffer* VKBufferCache::GetEmptyBuffer(std::size_t size) { - size = std::max(size, std::size_t(4)); - const auto& empty = staging_pool.GetUnusedBuffer(size, false); - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, size, 0); - }); - return empty.handle.address(); -} - -void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) { +void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { const auto& staging = staging_pool.GetUnusedBuffer(size, true); std::memcpy(staging.commit->Map(size), data, size); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = 
*staging.handle, buffer = *buffer->GetHandle(), offset, - size](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size}); - - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = offset; - barrier.size = size; + + const VkBuffer handle = Handle(); + scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { + cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size}); + + const VkBufferMemoryBarrier barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = UPLOAD_ACCESS_BARRIERS, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = handle, + .offset = offset, + .size = size, + }; cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {}, barrier, {}); }); } -void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) { +void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { const auto& staging = staging_pool.GetUnusedBuffer(size, true); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([staging = *staging.handle, buffer = *buffer->GetHandle(), offset, - size](vk::CommandBuffer cmdbuf) { - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = offset; - barrier.size = size; + + const VkBuffer handle = Handle(); + scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) { + const VkBufferMemoryBarrier barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = handle, + .offset = offset, + .size = size, + }; cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {}); - cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size}); + cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size}); }); scheduler.Finish(); std::memcpy(data, staging.commit->Map(size), size); } -void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) { +void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size) { scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer = *src->GetHandle(), dst_buffer = *dst->GetHandle(), src_offset, - dst_offset, size](vk::CommandBuffer cmdbuf) { + + const VkBuffer dst_buffer = Handle(); + scheduler.Record([src_buffer = src.Handle(), 
dst_buffer, src_offset, dst_offset, + size](vk::CommandBuffer cmdbuf) { cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size}); std::array<VkBufferMemoryBarrier, 2> barriers; @@ -172,4 +145,31 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t }); } +VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, + const VKDevice& device_, VKMemoryManager& memory_manager_, + VKScheduler& scheduler_, VKStagingBufferPool& staging_pool_) + : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, gpu_memory, cpu_memory, + CreateStreamBuffer(device_, + scheduler_)}, + device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_}, staging_pool{ + staging_pool_} {} + +VKBufferCache::~VKBufferCache() = default; + +std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { + return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr, + size); +} + +VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { + size = std::max(size, std::size_t(4)); + const auto& empty = staging_pool.GetUnusedBuffer(size, false); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, size, 0); + }); + return {*empty.handle, 0, 0}; +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index d3c23da98..7fb5ceedf 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -5,68 +5,60 @@ #pragma once #include <memory> -#include <unordered_map> -#include <vector> #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/renderer_vulkan/wrapper.h" -namespace Core { -class System; -} - namespace Vulkan { class VKDevice; class VKMemoryManager; class VKScheduler; -class CachedBufferBlock final : public VideoCommon::BufferBlock { +class Buffer final : public VideoCommon::BufferBlock { public: - explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager, - VAddr cpu_addr, std::size_t size); - ~CachedBufferBlock(); + explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, + VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size); + ~Buffer(); + + void Upload(std::size_t offset, std::size_t size, const u8* data); + + void Download(std::size_t offset, std::size_t size, u8* data); + + void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, + std::size_t size); - const VkBuffer* GetHandle() const { - return buffer.handle.address(); + VkBuffer Handle() const { + return *buffer.handle; + } + + u64 Address() const { + return 0; } private: + VKScheduler& scheduler; + VKStagingBufferPool& staging_pool; + VKBuffer buffer; }; -using Buffer = std::shared_ptr<CachedBufferBlock>; - class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> { public: - explicit 
VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system, + explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, VKStagingBufferPool& staging_pool); ~VKBufferCache(); - const VkBuffer* GetEmptyBuffer(std::size_t size) override; + BufferInfo GetEmptyBuffer(std::size_t size) override; protected: - void WriteBarrier() override {} - - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; - - const VkBuffer* ToHandle(const Buffer& buffer) override; - - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - const u8* data) override; - - void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, - u8* data) override; - - void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset, - std::size_t dst_offset, std::size_t size) override; + std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; private: const VKDevice& device; diff --git a/src/video_core/renderer_vulkan/vk_command_pool.cpp b/src/video_core/renderer_vulkan/vk_command_pool.cpp new file mode 100644 index 000000000..6339f4fe0 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_command_pool.cpp @@ -0,0 +1,46 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstddef> + +#include "video_core/renderer_vulkan/vk_command_pool.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +constexpr size_t COMMAND_BUFFER_POOL_SIZE = 0x1000; + +struct CommandPool::Pool { + vk::CommandPool handle; + vk::CommandBuffers cmdbufs; +}; + +CommandPool::CommandPool(MasterSemaphore& master_semaphore, const VKDevice& device) + : ResourcePool(master_semaphore, COMMAND_BUFFER_POOL_SIZE), device{device} {} + +CommandPool::~CommandPool() = default; + +void CommandPool::Allocate(size_t begin, size_t end) { + // Command buffers are going to be commited, recorded, executed every single usage cycle. + // They are also going to be reseted when commited. + Pool& pool = pools.emplace_back(); + pool.handle = device.GetLogical().CreateCommandPool({ + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = device.GetGraphicsFamily(), + }); + pool.cmdbufs = pool.handle.Allocate(COMMAND_BUFFER_POOL_SIZE); +} + +VkCommandBuffer CommandPool::Commit() { + const size_t index = CommitResource(); + const auto pool_index = index / COMMAND_BUFFER_POOL_SIZE; + const auto sub_index = index % COMMAND_BUFFER_POOL_SIZE; + return pools[pool_index].cmdbufs[sub_index]; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_command_pool.h b/src/video_core/renderer_vulkan/vk_command_pool.h new file mode 100644 index 000000000..b9cb3fb5d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_command_pool.h @@ -0,0 +1,34 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
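The new CommandPool above grows in fixed chunks of COMMAND_BUFFER_POOL_SIZE command buffers and maps the flat index returned by CommitResource() to a (pool, buffer) pair with a divide and a modulo. Below is a minimal standalone sketch of that chunked index bookkeeping; the class and member names are invented, slots are never recycled, and the MasterSemaphore synchronization that the real ResourcePool performs before reusing a slot is deliberately omitted.

// Standalone sketch of the chunked index math used by CommandPool::Commit above.
// Names are invented; the real ResourcePool also waits on the MasterSemaphore
// before a slot may be reused, which is not modelled here.
#include <cstddef>
#include <utility>

class ChunkedPoolSketch {
public:
    explicit ChunkedPoolSketch(std::size_t chunk_size_) : chunk_size{chunk_size_} {}

    // Hands out a flat index, growing by one chunk of `chunk_size` slots when exhausted.
    std::size_t Commit() {
        if (next_free == allocated) {
            Allocate(allocated, allocated + chunk_size); // mirrors ResourcePool::Allocate(begin, end)
            allocated += chunk_size;
        }
        return next_free++;
    }

    // Maps a flat index to (chunk, slot), exactly like index / COMMAND_BUFFER_POOL_SIZE
    // and index % COMMAND_BUFFER_POOL_SIZE in CommandPool::Commit.
    std::pair<std::size_t, std::size_t> Locate(std::size_t index) const {
        return {index / chunk_size, index % chunk_size};
    }

private:
    // In CommandPool, Allocate creates one VkCommandPool and allocates chunk_size
    // command buffers from it; here it is a no-op placeholder.
    void Allocate(std::size_t /*begin*/, std::size_t /*end*/) {}

    std::size_t chunk_size;
    std::size_t allocated = 0;
    std::size_t next_free = 0;
};

In CommandPool terms, Locate(Commit()) yields the pool index and the command buffer index within that pool, which is what Commit() returns as a VkCommandBuffer handle.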
+ +#pragma once + +#include <cstddef> +#include <vector> + +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +class MasterSemaphore; +class VKDevice; + +class CommandPool final : public ResourcePool { +public: + explicit CommandPool(MasterSemaphore& master_semaphore, const VKDevice& device); + ~CommandPool() override; + + void Allocate(size_t begin, size_t end) override; + + VkCommandBuffer Commit(); + +private: + struct Pool; + + const VKDevice& device; + std::vector<Pool> pools; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 9d92305f4..9637c6059 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -6,7 +6,7 @@ #include <memory> #include <optional> #include <utility> -#include <vector> + #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" @@ -112,35 +112,36 @@ constexpr u8 quad_array[] = { 0xf9, 0x00, 0x02, 0x00, 0x21, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x23, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() { - VkDescriptorSetLayoutBinding binding; - binding.binding = 0; - binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - binding.descriptorCount = 1; - binding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - binding.pImmutableSamplers = nullptr; - return binding; + return { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }; } VkDescriptorUpdateTemplateEntryKHR BuildQuadArrayPassDescriptorUpdateTemplateEntry() { - VkDescriptorUpdateTemplateEntryKHR entry; - entry.dstBinding = 0; - entry.dstArrayElement = 0; - entry.descriptorCount = 1; - entry.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - entry.offset = 0; - entry.stride = sizeof(DescriptorUpdateEntry); - return entry; + return { + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), + }; } -VkPushConstantRange BuildQuadArrayPassPushConstantRange() { - VkPushConstantRange range; - range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - range.offset = 0; - range.size = sizeof(u32); - return range; +VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { + return { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = static_cast<u32>(size), + }; } // Uint8 SPIR-V module. Generated from the "shaders/" directory. 
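The quad-indexed SPIR-V module added in the hunk below expands every group of 4 quad indices into the 6 indices of two triangles, which is why the staging size changes to (num_vertices / 4) * 6 further down. The following CPU reference shows the same expansion for orientation only: it is an illustrative sketch, not the shader's code, and it simplifies the index_shift push constant (8/16/32-bit source indices) by assuming already-widened 32-bit indices; treating base_vertex as a plain addition to each fetched index is likewise an assumption here.

// CPU reference for the quad-to-triangle index expansion performed on the GPU by the
// quad-indexed compute pass. Illustrative sketch only; real input index width handling
// (index_shift) is simplified to pre-widened 32-bit indices.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint32_t> ExpandQuadIndices(const std::vector<uint32_t>& quad_indices,
                                        uint32_t base_vertex) {
    // Every 4 input indices (one quad) become 6 output indices (two triangles),
    // matching the (num_vertices / 4) * 6 sizing used by the pass.
    static constexpr uint32_t kQuadToTriangles[6] = {0, 1, 2, 0, 2, 3};
    std::vector<uint32_t> out;
    out.reserve((quad_indices.size() / 4) * 6);
    for (std::size_t quad = 0; quad + 4 <= quad_indices.size(); quad += 4) {
        for (const uint32_t offset : kQuadToTriangles) {
            out.push_back(quad_indices[quad + offset] + base_vertex);
        }
    }
    return out;
}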
@@ -218,32 +219,161 @@ constexpr u8 uint8_pass[] = { 0x2a, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; - -std::array<VkDescriptorSetLayoutBinding, 2> BuildUint8PassDescriptorSetBindings() { - std::array<VkDescriptorSetLayoutBinding, 2> bindings; - bindings[0].binding = 0; - bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - bindings[0].descriptorCount = 1; - bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - bindings[0].pImmutableSamplers = nullptr; - bindings[1].binding = 1; - bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - bindings[1].descriptorCount = 1; - bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - bindings[1].pImmutableSamplers = nullptr; - return bindings; + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; + +// Quad indexed SPIR-V module. Generated from the "shaders/" directory. +constexpr u8 QUAD_INDEXED_SPV[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x07, 0x00, 0x08, 0x00, 0x7c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x22, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x59, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x59, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 
0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x09, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x56, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x56, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x69, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x00, 0x04, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 
0x09, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x75, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x75, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x05, 0x00, 0x09, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x05, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x73, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0xc4, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x31, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x82, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x09, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x37, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x36, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 
0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0xc3, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0xcb, 0x00, 0x06, 0x00, 0x09, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, + 0x5d, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x69, 0x00, 0x00, 0x00, 0x6a, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x6a, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x09, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x6d, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x35, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x73, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x76, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x74, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x73, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, +}; + +std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBindings() { + return {{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + }}; } -VkDescriptorUpdateTemplateEntryKHR BuildUint8PassDescriptorUpdateTemplateEntry() { - VkDescriptorUpdateTemplateEntryKHR entry; - entry.dstBinding = 0; - entry.dstArrayElement = 0; - entry.descriptorCount = 2; - entry.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - entry.offset = 0; - entry.stride = sizeof(DescriptorUpdateEntry); - return entry; +VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { + return { + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), + }; } } // Anonymous namespace @@ -253,37 +383,37 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto vk::Span<VkDescriptorUpdateTemplateEntryKHR> templates, vk::Span<VkPushConstantRange> 
push_constants, std::size_t code_size, const u8* code) { - VkDescriptorSetLayoutCreateInfo descriptor_layout_ci; - descriptor_layout_ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - descriptor_layout_ci.pNext = nullptr; - descriptor_layout_ci.flags = 0; - descriptor_layout_ci.bindingCount = bindings.size(); - descriptor_layout_ci.pBindings = bindings.data(); - descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout(descriptor_layout_ci); - - VkPipelineLayoutCreateInfo pipeline_layout_ci; - pipeline_layout_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipeline_layout_ci.pNext = nullptr; - pipeline_layout_ci.flags = 0; - pipeline_layout_ci.setLayoutCount = 1; - pipeline_layout_ci.pSetLayouts = descriptor_set_layout.address(); - pipeline_layout_ci.pushConstantRangeCount = push_constants.size(); - pipeline_layout_ci.pPushConstantRanges = push_constants.data(); - layout = device.GetLogical().CreatePipelineLayout(pipeline_layout_ci); + descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = bindings.size(), + .pBindings = bindings.data(), + }); + + layout = device.GetLogical().CreatePipelineLayout({ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = push_constants.size(), + .pPushConstantRanges = push_constants.data(), + }); if (!templates.empty()) { - VkDescriptorUpdateTemplateCreateInfoKHR template_ci; - template_ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; - template_ci.pNext = nullptr; - template_ci.flags = 0; - template_ci.descriptorUpdateEntryCount = templates.size(); - template_ci.pDescriptorUpdateEntries = templates.data(); - template_ci.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; - template_ci.descriptorSetLayout = *descriptor_set_layout; - template_ci.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - template_ci.pipelineLayout = *layout; - template_ci.set = 0; - descriptor_template = device.GetLogical().CreateDescriptorUpdateTemplateKHR(template_ci); + descriptor_template = device.GetLogical().CreateDescriptorUpdateTemplateKHR({ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .descriptorUpdateEntryCount = templates.size(), + .pDescriptorUpdateEntries = templates.data(), + .templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, + .descriptorSetLayout = *descriptor_set_layout, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .pipelineLayout = *layout, + .set = 0, + }); descriptor_allocator.emplace(descriptor_pool, *descriptor_set_layout); } @@ -291,42 +421,42 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto auto code_copy = std::make_unique<u32[]>(code_size / sizeof(u32) + 1); std::memcpy(code_copy.get(), code, code_size); - VkShaderModuleCreateInfo module_ci; - module_ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - module_ci.pNext = nullptr; - module_ci.flags = 0; - module_ci.codeSize = code_size; - module_ci.pCode = code_copy.get(); - module = device.GetLogical().CreateShaderModule(module_ci); - - VkComputePipelineCreateInfo pipeline_ci; - pipeline_ci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; - pipeline_ci.pNext = nullptr; - pipeline_ci.flags = 0; - pipeline_ci.layout = 
*layout; - pipeline_ci.basePipelineHandle = nullptr; - pipeline_ci.basePipelineIndex = 0; - - VkPipelineShaderStageCreateInfo& stage_ci = pipeline_ci.stage; - stage_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - stage_ci.pNext = nullptr; - stage_ci.flags = 0; - stage_ci.stage = VK_SHADER_STAGE_COMPUTE_BIT; - stage_ci.module = *module; - stage_ci.pName = "main"; - stage_ci.pSpecializationInfo = nullptr; - - pipeline = device.GetLogical().CreateComputePipeline(pipeline_ci); + module = device.GetLogical().CreateShaderModule({ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = code_size, + .pCode = code_copy.get(), + }); + + pipeline = device.GetLogical().CreateComputePipeline({ + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = *module, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + .layout = *layout, + .basePipelineHandle = nullptr, + .basePipelineIndex = 0, + }); } VKComputePass::~VKComputePass() = default; -VkDescriptorSet VKComputePass::CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue, - VKFence& fence) { +VkDescriptorSet VKComputePass::CommitDescriptorSet( + VKUpdateDescriptorQueue& update_descriptor_queue) { if (!descriptor_template) { return nullptr; } - const auto set = descriptor_allocator->Commit(fence); + const VkDescriptorSet set = descriptor_allocator->Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } @@ -337,20 +467,20 @@ QuadArrayPass::QuadArrayPass(const VKDevice& device, VKScheduler& scheduler, VKUpdateDescriptorQueue& update_descriptor_queue) : VKComputePass(device, descriptor_pool, BuildQuadArrayPassDescriptorSetLayoutBinding(), BuildQuadArrayPassDescriptorUpdateTemplateEntry(), - BuildQuadArrayPassPushConstantRange(), std::size(quad_array), quad_array), + BuildComputePushConstantRange(sizeof(u32)), std::size(quad_array), quad_array), scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, update_descriptor_queue{update_descriptor_queue} {} QuadArrayPass::~QuadArrayPass() = default; -std::pair<const VkBuffer*, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) { - const u32 num_triangle_vertices = num_vertices * 6 / 4; +std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) { + const u32 num_triangle_vertices = (num_vertices / 4) * 6; const std::size_t staging_size = num_triangle_vertices * sizeof(u32); auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); update_descriptor_queue.Acquire(); - update_descriptor_queue.AddBuffer(buffer.handle.address(), 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); @@ -377,29 +507,29 @@ std::pair<const VkBuffer*, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertice cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, {barrier}, {}); }); - return {buffer.handle.address(), 0}; + return {*buffer.handle, 0}; } Uint8Pass::Uint8Pass(const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, VKStagingBufferPool& 
staging_buffer_pool, VKUpdateDescriptorQueue& update_descriptor_queue) - : VKComputePass(device, descriptor_pool, BuildUint8PassDescriptorSetBindings(), - BuildUint8PassDescriptorUpdateTemplateEntry(), {}, std::size(uint8_pass), + : VKComputePass(device, descriptor_pool, BuildInputOutputDescriptorSetBindings(), + BuildInputOutputDescriptorUpdateTemplate(), {}, std::size(uint8_pass), uint8_pass), scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, update_descriptor_queue{update_descriptor_queue} {} Uint8Pass::~Uint8Pass() = default; -std::pair<const VkBuffer*, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, - u64 src_offset) { - const auto staging_size = static_cast<u32>(num_vertices * sizeof(u16)); +std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, + u64 src_offset) { + const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16)); auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); update_descriptor_queue.Acquire(); - update_descriptor_queue.AddBuffer(&src_buffer, src_offset, num_vertices); - update_descriptor_queue.AddBuffer(buffer.handle.address(), 0, staging_size); - const auto set = CommitDescriptorSet(update_descriptor_queue, scheduler.GetFence()); + update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); + update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); + const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, set, @@ -422,7 +552,73 @@ std::pair<const VkBuffer*, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer s cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); }); - return {buffer.handle.address(), 0}; + return {*buffer.handle, 0}; +} + +QuadIndexedPass::QuadIndexedPass(const VKDevice& device, VKScheduler& scheduler, + VKDescriptorPool& descriptor_pool, + VKStagingBufferPool& staging_buffer_pool, + VKUpdateDescriptorQueue& update_descriptor_queue) + : VKComputePass(device, descriptor_pool, BuildInputOutputDescriptorSetBindings(), + BuildInputOutputDescriptorUpdateTemplate(), + BuildComputePushConstantRange(sizeof(u32) * 2), std::size(QUAD_INDEXED_SPV), + QUAD_INDEXED_SPV), + scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, + update_descriptor_queue{update_descriptor_queue} {} + +QuadIndexedPass::~QuadIndexedPass() = default; + +std::pair<VkBuffer, u64> QuadIndexedPass::Assemble( + Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, + VkBuffer src_buffer, u64 src_offset) { + const u32 index_shift = [index_format] { + switch (index_format) { + case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte: + return 0; + case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedShort: + return 1; + case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedInt: + return 2; + } + UNREACHABLE(); + return 2; + }(); + const u32 input_size = num_vertices << index_shift; + const u32 num_tri_vertices = (num_vertices / 4) * 6; + + const std::size_t staging_size = num_tri_vertices * sizeof(u32); + auto& buffer = staging_buffer_pool.GetUnusedBuffer(staging_size, false); + + update_descriptor_queue.Acquire(); + update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); + update_descriptor_queue.AddBuffer(*buffer.handle, 0, staging_size); + const VkDescriptorSet set = 
CommitDescriptorSet(update_descriptor_queue); + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = *buffer.handle, set, + num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) { + static constexpr u32 dispatch_size = 1024; + const std::array push_constants = {base_vertex, index_shift}; + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); + cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), + &push_constants); + cmdbuf.Dispatch(Common::AlignUp(num_tri_vertices, dispatch_size) / dispatch_size, 1, 1); + + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = nullptr; + barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = buffer; + barrier.offset = 0; + barrier.size = static_cast<VkDeviceSize>(num_tri_vertices * sizeof(u32)); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); + }); + return {*buffer.handle, 0}; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index c62516bff..acc94f27e 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -6,15 +6,15 @@ #include <optional> #include <utility> -#include <vector> + #include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { class VKDevice; -class VKFence; class VKScheduler; class VKStagingBufferPool; class VKUpdateDescriptorQueue; @@ -29,8 +29,7 @@ public: ~VKComputePass(); protected: - VkDescriptorSet CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue, - VKFence& fence); + VkDescriptorSet CommitDescriptorSet(VKUpdateDescriptorQueue& update_descriptor_queue); vk::DescriptorUpdateTemplateKHR descriptor_template; vk::PipelineLayout layout; @@ -50,7 +49,7 @@ public: VKUpdateDescriptorQueue& update_descriptor_queue); ~QuadArrayPass(); - std::pair<const VkBuffer*, VkDeviceSize> Assemble(u32 num_vertices, u32 first); + std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, u32 first); private: VKScheduler& scheduler; @@ -65,7 +64,25 @@ public: VKUpdateDescriptorQueue& update_descriptor_queue); ~Uint8Pass(); - std::pair<const VkBuffer*, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); + std::pair<VkBuffer, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); + +private: + VKScheduler& scheduler; + VKStagingBufferPool& staging_buffer_pool; + VKUpdateDescriptorQueue& update_descriptor_queue; +}; + +class QuadIndexedPass final : public VKComputePass { +public: + explicit QuadIndexedPass(const VKDevice& device, VKScheduler& scheduler, + VKDescriptorPool& descriptor_pool, + VKStagingBufferPool& staging_buffer_pool, + VKUpdateDescriptorQueue& update_descriptor_queue); + ~QuadIndexedPass(); + + std::pair<VkBuffer, u64> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, + u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, + u64 src_offset); private: 
VKScheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 23beafa4f..9be72dc9b 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -2,14 +2,12 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <memory> #include <vector> #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" @@ -34,7 +32,7 @@ VkDescriptorSet VKComputePipeline::CommitDescriptorSet() { if (!descriptor_template) { return {}; } - const auto set = descriptor_allocator.Commit(scheduler.GetFence()); + const VkDescriptorSet set = descriptor_allocator.Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } @@ -45,39 +43,41 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const { const auto add_bindings = [&](VkDescriptorType descriptor_type, std::size_t num_entries) { // TODO(Rodrigo): Maybe make individual bindings here? for (u32 bindpoint = 0; bindpoint < static_cast<u32>(num_entries); ++bindpoint) { - VkDescriptorSetLayoutBinding& entry = bindings.emplace_back(); - entry.binding = binding++; - entry.descriptorType = descriptor_type; - entry.descriptorCount = 1; - entry.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - entry.pImmutableSamplers = nullptr; + bindings.push_back({ + .binding = binding++, + .descriptorType = descriptor_type, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }); } }; add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size()); add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size()); - add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size()); + add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size()); add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size()); + add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size()); add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size()); - VkDescriptorSetLayoutCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.bindingCount = static_cast<u32>(bindings.size()); - ci.pBindings = bindings.data(); - return device.GetLogical().CreateDescriptorSetLayout(ci); + return device.GetLogical().CreateDescriptorSetLayout({ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast<u32>(bindings.size()), + .pBindings = bindings.data(), + }); } vk::PipelineLayout VKComputePipeline::CreatePipelineLayout() const { - VkPipelineLayoutCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.setLayoutCount = 1; - ci.pSetLayouts = descriptor_set_layout.address(); - ci.pushConstantRangeCount = 0; - ci.pPushConstantRanges = nullptr; - return device.GetLogical().CreatePipelineLayout(ci); + return 
device.GetLogical().CreatePipelineLayout({ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }); } vk::DescriptorUpdateTemplateKHR VKComputePipeline::CreateDescriptorUpdateTemplate() const { @@ -90,57 +90,63 @@ vk::DescriptorUpdateTemplateKHR VKComputePipeline::CreateDescriptorUpdateTemplat return {}; } - VkDescriptorUpdateTemplateCreateInfoKHR ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; - ci.pNext = nullptr; - ci.flags = 0; - ci.descriptorUpdateEntryCount = static_cast<u32>(template_entries.size()); - ci.pDescriptorUpdateEntries = template_entries.data(); - ci.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; - ci.descriptorSetLayout = *descriptor_set_layout; - ci.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - ci.pipelineLayout = *layout; - ci.set = DESCRIPTOR_SET; - return device.GetLogical().CreateDescriptorUpdateTemplateKHR(ci); + return device.GetLogical().CreateDescriptorUpdateTemplateKHR({ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .descriptorUpdateEntryCount = static_cast<u32>(template_entries.size()), + .pDescriptorUpdateEntries = template_entries.data(), + .templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, + .descriptorSetLayout = *descriptor_set_layout, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .pipelineLayout = *layout, + .set = DESCRIPTOR_SET, + }); } vk::ShaderModule VKComputePipeline::CreateShaderModule(const std::vector<u32>& code) const { - VkShaderModuleCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.codeSize = code.size() * sizeof(u32); - ci.pCode = code.data(); - return device.GetLogical().CreateShaderModule(ci); + device.SaveShader(code); + + return device.GetLogical().CreateShaderModule({ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = code.size() * sizeof(u32), + .pCode = code.data(), + }); } vk::Pipeline VKComputePipeline::CreatePipeline() const { - VkComputePipelineCreateInfo ci; - VkPipelineShaderStageCreateInfo& stage_ci = ci.stage; - stage_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - stage_ci.pNext = nullptr; - stage_ci.flags = 0; - stage_ci.stage = VK_SHADER_STAGE_COMPUTE_BIT; - stage_ci.module = *shader_module; - stage_ci.pName = "main"; - stage_ci.pSpecializationInfo = nullptr; - - VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci; - subgroup_size_ci.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT; - subgroup_size_ci.pNext = nullptr; - subgroup_size_ci.requiredSubgroupSize = GuestWarpSize; + + VkComputePipelineCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = *shader_module, + .pName = "main", + .pSpecializationInfo = nullptr, + }, + .layout = *layout, + .basePipelineHandle = nullptr, + .basePipelineIndex = 0, + }; + + const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = 
nullptr, + .requiredSubgroupSize = GuestWarpSize, + }; if (entries.uses_warps && device.IsGuestWarpSizeSupported(VK_SHADER_STAGE_COMPUTE_BIT)) { - stage_ci.pNext = &subgroup_size_ci; + ci.stage.pNext = &subgroup_size_ci; } - ci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.layout = *layout; - ci.basePipelineHandle = nullptr; - ci.basePipelineIndex = 0; return device.GetLogical().CreateComputePipeline(ci); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 33b9af29e..6e2f22a4a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -4,8 +4,6 @@ #pragma once -#include <memory> - #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index e9d528aa6..f38e089d5 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -2,13 +2,13 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <memory> #include <vector> #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { @@ -16,14 +16,15 @@ namespace Vulkan { // Prefer small grow rates to avoid saturating the descriptor pool with barely used pipelines. 
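The descriptor pool code in the hunks below keeps one active VkDescriptorPool and, when an allocation fails because the pool is exhausted, creates a fresh pool and retries. A rough sketch of that strategy using raw Vulkan calls instead of yuzu's vk:: wrappers follows; the pool sizes and the 0x400 set count are placeholders, and all error handling beyond the out-of-pool-memory retry is omitted.

// Illustrative sketch of the allocate-and-grow strategy in vk_descriptor_pool.cpp:
// allocate from the active pool, and on VK_ERROR_OUT_OF_POOL_MEMORY create a new pool
// and retry, as VKDescriptorPool::AllocateDescriptors does.
#include <vector>
#include <vulkan/vulkan.h>

VkDescriptorPool CreatePoolSketch(VkDevice device, uint32_t max_sets) {
    const VkDescriptorPoolSize pool_sizes[]{
        {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, max_sets * 90},  // placeholder ratios
        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, max_sets * 60},
    };
    const VkDescriptorPoolCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
        .pNext = nullptr,
        .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
        .maxSets = max_sets,
        .poolSizeCount = static_cast<uint32_t>(std::size(pool_sizes)),
        .pPoolSizes = pool_sizes,
    };
    VkDescriptorPool pool = VK_NULL_HANDLE;
    vkCreateDescriptorPool(device, &ci, nullptr, &pool);
    return pool;
}

std::vector<VkDescriptorSet> AllocateSetsSketch(VkDevice device, VkDescriptorPool& active_pool,
                                                VkDescriptorSetLayout layout, uint32_t count) {
    const std::vector<VkDescriptorSetLayout> layouts(count, layout);
    std::vector<VkDescriptorSet> sets(count);
    VkDescriptorSetAllocateInfo ai{
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
        .pNext = nullptr,
        .descriptorPool = active_pool,
        .descriptorSetCount = count,
        .pSetLayouts = layouts.data(),
    };
    if (vkAllocateDescriptorSets(device, &ai, sets.data()) == VK_ERROR_OUT_OF_POOL_MEMORY) {
        // The active pool is exhausted: switch to a new pool and allocate again.
        active_pool = CreatePoolSketch(device, 0x400); // 0x400 is a placeholder set count
        ai.descriptorPool = active_pool;
        vkAllocateDescriptorSets(device, &ai, sets.data());
    }
    return sets;
}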
constexpr std::size_t SETS_GROW_RATE = 0x20; -DescriptorAllocator::DescriptorAllocator(VKDescriptorPool& descriptor_pool, - VkDescriptorSetLayout layout) - : VKFencedPool{SETS_GROW_RATE}, descriptor_pool{descriptor_pool}, layout{layout} {} +DescriptorAllocator::DescriptorAllocator(VKDescriptorPool& descriptor_pool_, + VkDescriptorSetLayout layout_) + : ResourcePool(descriptor_pool_.master_semaphore, SETS_GROW_RATE), + descriptor_pool{descriptor_pool_}, layout{layout_} {} DescriptorAllocator::~DescriptorAllocator() = default; -VkDescriptorSet DescriptorAllocator::Commit(VKFence& fence) { - const std::size_t index = CommitResource(fence); +VkDescriptorSet DescriptorAllocator::Commit() { + const std::size_t index = CommitResource(); return descriptors_allocations[index / SETS_GROW_RATE][index % SETS_GROW_RATE]; } @@ -31,8 +32,9 @@ void DescriptorAllocator::Allocate(std::size_t begin, std::size_t end) { descriptors_allocations.push_back(descriptor_pool.AllocateDescriptors(layout, end - begin)); } -VKDescriptorPool::VKDescriptorPool(const VKDevice& device) - : device{device}, active_pool{AllocateNewPool()} {} +VKDescriptorPool::VKDescriptorPool(const VKDevice& device_, VKScheduler& scheduler) + : device{device_}, master_semaphore{scheduler.GetMasterSemaphore()}, active_pool{ + AllocateNewPool()} {} VKDescriptorPool::~VKDescriptorPool() = default; @@ -43,27 +45,31 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() { {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60}, {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64}, {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64}, - {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}}; - - VkDescriptorPoolCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; - ci.maxSets = num_sets; - ci.poolSizeCount = static_cast<u32>(std::size(pool_sizes)); - ci.pPoolSizes = std::data(pool_sizes); + {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64}, + {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}, + }; + + const VkDescriptorPoolCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, + .maxSets = num_sets, + .poolSizeCount = static_cast<u32>(std::size(pool_sizes)), + .pPoolSizes = std::data(pool_sizes), + }; return &pools.emplace_back(device.GetLogical().CreateDescriptorPool(ci)); } vk::DescriptorSets VKDescriptorPool::AllocateDescriptors(VkDescriptorSetLayout layout, std::size_t count) { const std::vector layout_copies(count, layout); - VkDescriptorSetAllocateInfo ai; - ai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - ai.pNext = nullptr; - ai.descriptorPool = **active_pool; - ai.descriptorSetCount = static_cast<u32>(count); - ai.pSetLayouts = layout_copies.data(); + VkDescriptorSetAllocateInfo ai{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .pNext = nullptr, + .descriptorPool = **active_pool, + .descriptorSetCount = static_cast<u32>(count), + .pSetLayouts = layout_copies.data(), + }; vk::DescriptorSets sets = active_pool->Allocate(ai); if (!sets.IsOutOfPoolMemory()) { diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.h b/src/video_core/renderer_vulkan/vk_descriptor_pool.h index ab40c70f0..544f32a20 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.h +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.h @@ -4,25 +4,26 @@ #pragma once -#include <memory> #include <vector> -#include 
"common/common_types.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { +class VKDevice; class VKDescriptorPool; +class VKScheduler; -class DescriptorAllocator final : public VKFencedPool { +class DescriptorAllocator final : public ResourcePool { public: explicit DescriptorAllocator(VKDescriptorPool& descriptor_pool, VkDescriptorSetLayout layout); ~DescriptorAllocator() override; + DescriptorAllocator& operator=(const DescriptorAllocator&) = delete; DescriptorAllocator(const DescriptorAllocator&) = delete; - VkDescriptorSet Commit(VKFence& fence); + VkDescriptorSet Commit(); protected: void Allocate(std::size_t begin, std::size_t end) override; @@ -38,15 +39,19 @@ class VKDescriptorPool final { friend DescriptorAllocator; public: - explicit VKDescriptorPool(const VKDevice& device); + explicit VKDescriptorPool(const VKDevice& device, VKScheduler& scheduler); ~VKDescriptorPool(); + VKDescriptorPool(const VKDescriptorPool&) = delete; + VKDescriptorPool& operator=(const VKDescriptorPool&) = delete; + private: vk::DescriptorPool* AllocateNewPool(); vk::DescriptorSets AllocateDescriptors(VkDescriptorSetLayout layout, std::size_t count); const VKDevice& device; + MasterSemaphore& master_semaphore; std::vector<vk::DescriptorPool> pools; vk::DescriptorPool* active_pool; diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 52d29e49d..f34ed6735 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -4,11 +4,11 @@ #include <bitset> #include <chrono> -#include <cstdlib> #include <optional> #include <string_view> #include <thread> #include <unordered_set> +#include <utility> #include <vector> #include "common/assert.h" @@ -22,19 +22,30 @@ namespace { namespace Alternatives { -constexpr std::array Depth24UnormS8_UINT = {VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_D16_UNORM_S8_UINT, VkFormat{}}; -constexpr std::array Depth16UnormS8_UINT = {VK_FORMAT_D24_UNORM_S8_UINT, - VK_FORMAT_D32_SFLOAT_S8_UINT, VkFormat{}}; +constexpr std::array Depth24UnormS8_UINT{ + VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_D16_UNORM_S8_UINT, + VkFormat{}, +}; + +constexpr std::array Depth16UnormS8_UINT{ + VK_FORMAT_D24_UNORM_S8_UINT, + VK_FORMAT_D32_SFLOAT_S8_UINT, + VkFormat{}, +}; } // namespace Alternatives -constexpr std::array REQUIRED_EXTENSIONS = { +constexpr std::array REQUIRED_EXTENSIONS{ VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_KHR_MAINTENANCE1_EXTENSION_NAME, + VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME, + VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, VK_KHR_16BIT_STORAGE_EXTENSION_NAME, VK_KHR_8BIT_STORAGE_EXTENSION_NAME, VK_KHR_DRIVER_PROPERTIES_EXTENSION_NAME, VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME, + VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME, VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, @@ -71,76 +82,105 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType } } +[[nodiscard]] bool IsRDNA(std::string_view device_name, VkDriverIdKHR driver_id) { + static constexpr std::array RDNA_DEVICES{ + "5700", + "5600", + "5500", + "5300", + }; + if (driver_id != VK_DRIVER_ID_AMD_PROPRIETARY_KHR) { + return false; + } + return std::any_of(RDNA_DEVICES.begin(), RDNA_DEVICES.end(), [device_name](const char* name) { + return device_name.find(name) != 
std::string_view::npos; + }); +} + std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) { - static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32, - VK_FORMAT_A8B8G8R8_UINT_PACK32, - VK_FORMAT_A8B8G8R8_SNORM_PACK32, - VK_FORMAT_A8B8G8R8_SRGB_PACK32, - VK_FORMAT_B5G6R5_UNORM_PACK16, - VK_FORMAT_A2B10G10R10_UNORM_PACK32, - VK_FORMAT_A1R5G5B5_UNORM_PACK16, - VK_FORMAT_R32G32B32A32_SFLOAT, - VK_FORMAT_R32G32B32A32_UINT, - VK_FORMAT_R32G32_SFLOAT, - VK_FORMAT_R32G32_UINT, - VK_FORMAT_R16G16B16A16_UINT, - VK_FORMAT_R16G16B16A16_SNORM, - VK_FORMAT_R16G16B16A16_UNORM, - VK_FORMAT_R16G16_UNORM, - VK_FORMAT_R16G16_SNORM, - VK_FORMAT_R16G16_SFLOAT, - VK_FORMAT_R16_UNORM, - VK_FORMAT_R8G8B8A8_SRGB, - VK_FORMAT_R8G8_UNORM, - VK_FORMAT_R8G8_SNORM, - VK_FORMAT_R8_UNORM, - VK_FORMAT_R8_UINT, - VK_FORMAT_B10G11R11_UFLOAT_PACK32, - VK_FORMAT_R32_SFLOAT, - VK_FORMAT_R32_UINT, - VK_FORMAT_R32_SINT, - VK_FORMAT_R16_SFLOAT, - VK_FORMAT_R16G16B16A16_SFLOAT, - VK_FORMAT_B8G8R8A8_UNORM, - VK_FORMAT_R4G4B4A4_UNORM_PACK16, - VK_FORMAT_D32_SFLOAT, - VK_FORMAT_D16_UNORM, - VK_FORMAT_D16_UNORM_S8_UINT, - VK_FORMAT_D24_UNORM_S8_UINT, - VK_FORMAT_D32_SFLOAT_S8_UINT, - VK_FORMAT_BC1_RGBA_UNORM_BLOCK, - VK_FORMAT_BC2_UNORM_BLOCK, - VK_FORMAT_BC3_UNORM_BLOCK, - VK_FORMAT_BC4_UNORM_BLOCK, - VK_FORMAT_BC5_UNORM_BLOCK, - VK_FORMAT_BC5_SNORM_BLOCK, - VK_FORMAT_BC7_UNORM_BLOCK, - VK_FORMAT_BC6H_UFLOAT_BLOCK, - VK_FORMAT_BC6H_SFLOAT_BLOCK, - VK_FORMAT_BC1_RGBA_SRGB_BLOCK, - VK_FORMAT_BC2_SRGB_BLOCK, - VK_FORMAT_BC3_SRGB_BLOCK, - VK_FORMAT_BC7_SRGB_BLOCK, - VK_FORMAT_ASTC_4x4_SRGB_BLOCK, - VK_FORMAT_ASTC_8x8_SRGB_BLOCK, - VK_FORMAT_ASTC_8x5_SRGB_BLOCK, - VK_FORMAT_ASTC_5x4_SRGB_BLOCK, - VK_FORMAT_ASTC_5x5_UNORM_BLOCK, - VK_FORMAT_ASTC_5x5_SRGB_BLOCK, - VK_FORMAT_ASTC_10x8_UNORM_BLOCK, - VK_FORMAT_ASTC_10x8_SRGB_BLOCK, - VK_FORMAT_ASTC_6x6_UNORM_BLOCK, - VK_FORMAT_ASTC_6x6_SRGB_BLOCK, - VK_FORMAT_ASTC_10x10_UNORM_BLOCK, - VK_FORMAT_ASTC_10x10_SRGB_BLOCK, - VK_FORMAT_ASTC_12x12_UNORM_BLOCK, - VK_FORMAT_ASTC_12x12_SRGB_BLOCK, - VK_FORMAT_ASTC_8x6_UNORM_BLOCK, - VK_FORMAT_ASTC_8x6_SRGB_BLOCK, - VK_FORMAT_ASTC_6x5_UNORM_BLOCK, - VK_FORMAT_ASTC_6x5_SRGB_BLOCK, - VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}; + static constexpr std::array formats{ + VK_FORMAT_A8B8G8R8_UNORM_PACK32, + VK_FORMAT_A8B8G8R8_UINT_PACK32, + VK_FORMAT_A8B8G8R8_SNORM_PACK32, + VK_FORMAT_A8B8G8R8_SINT_PACK32, + VK_FORMAT_A8B8G8R8_SRGB_PACK32, + VK_FORMAT_B5G6R5_UNORM_PACK16, + VK_FORMAT_A2B10G10R10_UNORM_PACK32, + VK_FORMAT_A2B10G10R10_UINT_PACK32, + VK_FORMAT_A1R5G5B5_UNORM_PACK16, + VK_FORMAT_R32G32B32A32_SFLOAT, + VK_FORMAT_R32G32B32A32_SINT, + VK_FORMAT_R32G32B32A32_UINT, + VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R32G32_SINT, + VK_FORMAT_R32G32_UINT, + VK_FORMAT_R16G16B16A16_SINT, + VK_FORMAT_R16G16B16A16_UINT, + VK_FORMAT_R16G16B16A16_SNORM, + VK_FORMAT_R16G16B16A16_UNORM, + VK_FORMAT_R16G16_UNORM, + VK_FORMAT_R16G16_SNORM, + VK_FORMAT_R16G16_SFLOAT, + VK_FORMAT_R16_UNORM, + VK_FORMAT_R16_UINT, + VK_FORMAT_R8G8B8A8_SRGB, + VK_FORMAT_R8G8_UNORM, + VK_FORMAT_R8G8_SNORM, + VK_FORMAT_R8G8_SINT, + VK_FORMAT_R8G8_UINT, + VK_FORMAT_R8_UNORM, + VK_FORMAT_R8_SNORM, + VK_FORMAT_R8_SINT, + VK_FORMAT_R8_UINT, + VK_FORMAT_B10G11R11_UFLOAT_PACK32, + VK_FORMAT_R32_SFLOAT, + VK_FORMAT_R32_UINT, + VK_FORMAT_R32_SINT, + VK_FORMAT_R16_SFLOAT, + VK_FORMAT_R16G16B16A16_SFLOAT, + VK_FORMAT_B8G8R8A8_UNORM, + VK_FORMAT_B8G8R8A8_SRGB, + VK_FORMAT_R4G4B4A4_UNORM_PACK16, + VK_FORMAT_D32_SFLOAT, + 
VK_FORMAT_D16_UNORM, + VK_FORMAT_D16_UNORM_S8_UINT, + VK_FORMAT_D24_UNORM_S8_UINT, + VK_FORMAT_D32_SFLOAT_S8_UINT, + VK_FORMAT_BC1_RGBA_UNORM_BLOCK, + VK_FORMAT_BC2_UNORM_BLOCK, + VK_FORMAT_BC3_UNORM_BLOCK, + VK_FORMAT_BC4_UNORM_BLOCK, + VK_FORMAT_BC4_SNORM_BLOCK, + VK_FORMAT_BC5_UNORM_BLOCK, + VK_FORMAT_BC5_SNORM_BLOCK, + VK_FORMAT_BC7_UNORM_BLOCK, + VK_FORMAT_BC6H_UFLOAT_BLOCK, + VK_FORMAT_BC6H_SFLOAT_BLOCK, + VK_FORMAT_BC1_RGBA_SRGB_BLOCK, + VK_FORMAT_BC2_SRGB_BLOCK, + VK_FORMAT_BC3_SRGB_BLOCK, + VK_FORMAT_BC7_SRGB_BLOCK, + VK_FORMAT_ASTC_4x4_SRGB_BLOCK, + VK_FORMAT_ASTC_8x8_SRGB_BLOCK, + VK_FORMAT_ASTC_8x5_SRGB_BLOCK, + VK_FORMAT_ASTC_5x4_SRGB_BLOCK, + VK_FORMAT_ASTC_5x5_UNORM_BLOCK, + VK_FORMAT_ASTC_5x5_SRGB_BLOCK, + VK_FORMAT_ASTC_10x8_UNORM_BLOCK, + VK_FORMAT_ASTC_10x8_SRGB_BLOCK, + VK_FORMAT_ASTC_6x6_UNORM_BLOCK, + VK_FORMAT_ASTC_6x6_SRGB_BLOCK, + VK_FORMAT_ASTC_10x10_UNORM_BLOCK, + VK_FORMAT_ASTC_10x10_SRGB_BLOCK, + VK_FORMAT_ASTC_12x12_UNORM_BLOCK, + VK_FORMAT_ASTC_12x12_SRGB_BLOCK, + VK_FORMAT_ASTC_8x6_UNORM_BLOCK, + VK_FORMAT_ASTC_8x6_SRGB_BLOCK, + VK_FORMAT_ASTC_6x5_UNORM_BLOCK, + VK_FORMAT_ASTC_6x5_SRGB_BLOCK, + VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, + }; std::unordered_map<VkFormat, VkFormatProperties> format_properties; for (const auto format : formats) { format_properties.emplace(format, physical.GetFormatProperties(format)); @@ -150,10 +190,10 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( } // Anonymous namespace -VKDevice::VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld) - : dld{dld}, physical{physical}, properties{physical.GetProperties()}, - format_properties{GetFormatProperties(physical, dld)} { +VKDevice::VKDevice(VkInstance instance_, u32 instance_version_, vk::PhysicalDevice physical_, + VkSurfaceKHR surface, const vk::InstanceDispatch& dld_) + : dld{dld_}, physical{physical_}, properties{physical.GetProperties()}, + instance_version{instance_version_}, format_properties{GetFormatProperties(physical, dld)} { SetupFamilies(surface); SetupFeatures(); } @@ -164,107 +204,127 @@ bool VKDevice::Create() { const auto queue_cis = GetDeviceQueueCreateInfos(); const std::vector extensions = LoadExtensions(); - VkPhysicalDeviceFeatures2 features2; - features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; - features2.pNext = nullptr; + VkPhysicalDeviceFeatures2 features2{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = nullptr, + }; + const void* first_next = &features2; void** next = &features2.pNext; - auto& features = features2.features; - features.robustBufferAccess = false; - features.fullDrawIndexUint32 = false; - features.imageCubeArray = false; - features.independentBlend = true; - features.geometryShader = true; - features.tessellationShader = true; - features.sampleRateShading = false; - features.dualSrcBlend = false; - features.logicOp = false; - features.multiDrawIndirect = false; - features.drawIndirectFirstInstance = false; - features.depthClamp = true; - features.depthBiasClamp = true; - features.fillModeNonSolid = false; - features.depthBounds = false; - features.wideLines = false; - features.largePoints = true; - features.alphaToOne = false; - features.multiViewport = true; - features.samplerAnisotropy = true; - features.textureCompressionETC2 = false; - features.textureCompressionASTC_LDR = is_optimal_astc_supported; - features.textureCompressionBC = false; - features.occlusionQueryPrecise = true; - features.pipelineStatisticsQuery = false; - 
features.vertexPipelineStoresAndAtomics = true; - features.fragmentStoresAndAtomics = true; - features.shaderTessellationAndGeometryPointSize = false; - features.shaderImageGatherExtended = true; - features.shaderStorageImageExtendedFormats = false; - features.shaderStorageImageMultisample = false; - features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported; - features.shaderStorageImageWriteWithoutFormat = true; - features.shaderUniformBufferArrayDynamicIndexing = false; - features.shaderSampledImageArrayDynamicIndexing = false; - features.shaderStorageBufferArrayDynamicIndexing = false; - features.shaderStorageImageArrayDynamicIndexing = false; - features.shaderClipDistance = false; - features.shaderCullDistance = false; - features.shaderFloat64 = false; - features.shaderInt64 = false; - features.shaderInt16 = false; - features.shaderResourceResidency = false; - features.shaderResourceMinLod = false; - features.sparseBinding = false; - features.sparseResidencyBuffer = false; - features.sparseResidencyImage2D = false; - features.sparseResidencyImage3D = false; - features.sparseResidency2Samples = false; - features.sparseResidency4Samples = false; - features.sparseResidency8Samples = false; - features.sparseResidency16Samples = false; - features.sparseResidencyAliased = false; - features.variableMultisampleRate = false; - features.inheritedQueries = false; - - VkPhysicalDevice16BitStorageFeaturesKHR bit16_storage; - bit16_storage.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; - bit16_storage.pNext = nullptr; - bit16_storage.storageBuffer16BitAccess = false; - bit16_storage.uniformAndStorageBuffer16BitAccess = true; - bit16_storage.storagePushConstant16 = false; - bit16_storage.storageInputOutput16 = false; + features2.features = { + .robustBufferAccess = false, + .fullDrawIndexUint32 = false, + .imageCubeArray = false, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = false, + .dualSrcBlend = false, + .logicOp = false, + .multiDrawIndirect = false, + .drawIndirectFirstInstance = false, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = false, + .depthBounds = false, + .wideLines = false, + .largePoints = true, + .alphaToOne = false, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = false, + .textureCompressionASTC_LDR = is_optimal_astc_supported, + .textureCompressionBC = false, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = false, + .vertexPipelineStoresAndAtomics = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = false, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = false, + .shaderStorageImageMultisample = false, + .shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = false, + .shaderSampledImageArrayDynamicIndexing = false, + .shaderStorageBufferArrayDynamicIndexing = false, + .shaderStorageImageArrayDynamicIndexing = false, + .shaderClipDistance = false, + .shaderCullDistance = false, + .shaderFloat64 = false, + .shaderInt64 = false, + .shaderInt16 = false, + .shaderResourceResidency = false, + .shaderResourceMinLod = false, + .sparseBinding = false, + .sparseResidencyBuffer = false, + .sparseResidencyImage2D = false, + .sparseResidencyImage3D = false, + .sparseResidency2Samples = false, + .sparseResidency4Samples = false, + 
.sparseResidency8Samples = false, + .sparseResidency16Samples = false, + .sparseResidencyAliased = false, + .variableMultisampleRate = false, + .inheritedQueries = false, + }; + + VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR, + .pNext = nullptr, + .timelineSemaphore = true, + }; + SetNext(next, timeline_semaphore); + + VkPhysicalDevice16BitStorageFeaturesKHR bit16_storage{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR, + .pNext = nullptr, + .storageBuffer16BitAccess = false, + .uniformAndStorageBuffer16BitAccess = true, + .storagePushConstant16 = false, + .storageInputOutput16 = false, + }; SetNext(next, bit16_storage); - VkPhysicalDevice8BitStorageFeaturesKHR bit8_storage; - bit8_storage.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; - bit8_storage.pNext = nullptr; - bit8_storage.storageBuffer8BitAccess = false; - bit8_storage.uniformAndStorageBuffer8BitAccess = true; - bit8_storage.storagePushConstant8 = false; + VkPhysicalDevice8BitStorageFeaturesKHR bit8_storage{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR, + .pNext = nullptr, + .storageBuffer8BitAccess = false, + .uniformAndStorageBuffer8BitAccess = true, + .storagePushConstant8 = false, + }; SetNext(next, bit8_storage); - VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset; - host_query_reset.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT; - host_query_reset.hostQueryReset = true; + VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT, + .hostQueryReset = true, + }; SetNext(next, host_query_reset); VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8; if (is_float16_supported) { - float16_int8.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; - float16_int8.pNext = nullptr; - float16_int8.shaderFloat16 = true; - float16_int8.shaderInt8 = false; + float16_int8 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR, + .pNext = nullptr, + .shaderFloat16 = true, + .shaderInt8 = false, + }; SetNext(next, float16_int8); } else { LOG_INFO(Render_Vulkan, "Device doesn't support float16 natively"); } + if (!nv_viewport_swizzle) { + LOG_INFO(Render_Vulkan, "Device doesn't support viewport swizzles"); + } + VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR std430_layout; if (khr_uniform_buffer_standard_layout) { - std430_layout.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR; - std430_layout.pNext = nullptr; - std430_layout.uniformBufferStandardLayout = true; + std430_layout = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR, + .pNext = nullptr, + .uniformBufferStandardLayout = true, + }; SetNext(next, std430_layout); } else { LOG_INFO(Render_Vulkan, "Device doesn't support packed UBOs"); @@ -272,9 +332,11 @@ bool VKDevice::Create() { VkPhysicalDeviceIndexTypeUint8FeaturesEXT index_type_uint8; if (ext_index_type_uint8) { - index_type_uint8.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT; - index_type_uint8.pNext = nullptr; - index_type_uint8.indexTypeUint8 = true; + index_type_uint8 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT, + .pNext = nullptr, + .indexTypeUint8 = true, + }; SetNext(next, index_type_uint8); } else { LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes"); 
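
The block above grows the VkPhysicalDeviceFeatures2 pNext chain one feature struct at a time through SetNext, and only chains a struct when the matching extension was detected (the unsupported branches just log). SetNext itself is a small helper defined outside this hunk; a minimal sketch of the idiom it implements, for orientation only:

    // Append `data` to the tail of a Vulkan pNext chain and advance the tail pointer,
    // so that the next SetNext call chains after this struct.
    template <typename T>
    void SetNext(void**& next, T& data) {
        *next = &data;
        next = &data.pNext;
    }

This is also why Create() keeps both first_next (the chain head later handed to vk::Device::Create) and next (the moving tail): an extra structure such as the Nsight diagnostics config can still be placed in front of features2 without disturbing the feature chain built here.
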
@@ -282,21 +344,61 @@ bool VKDevice::Create() { VkPhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback; if (ext_transform_feedback) { - transform_feedback.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT; - transform_feedback.pNext = nullptr; - transform_feedback.transformFeedback = true; - transform_feedback.geometryStreams = true; + transform_feedback = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT, + .pNext = nullptr, + .transformFeedback = true, + .geometryStreams = true, + }; SetNext(next, transform_feedback); } else { LOG_INFO(Render_Vulkan, "Device doesn't support transform feedbacks"); } + VkPhysicalDeviceCustomBorderColorFeaturesEXT custom_border; + if (ext_custom_border_color) { + custom_border = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT, + .pNext = nullptr, + .customBorderColors = VK_TRUE, + .customBorderColorWithoutFormat = VK_TRUE, + }; + SetNext(next, custom_border); + } else { + LOG_INFO(Render_Vulkan, "Device doesn't support custom border colors"); + } + + VkPhysicalDeviceExtendedDynamicStateFeaturesEXT dynamic_state; + if (ext_extended_dynamic_state) { + dynamic_state = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT, + .pNext = nullptr, + .extendedDynamicState = VK_TRUE, + }; + SetNext(next, dynamic_state); + } else { + LOG_INFO(Render_Vulkan, "Device doesn't support extended dynamic state"); + } + if (!ext_depth_range_unrestricted) { LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted"); } - logical = vk::Device::Create(physical, queue_cis, extensions, features2, dld); + VkDeviceDiagnosticsConfigCreateInfoNV diagnostics_nv; + if (nv_device_diagnostics_config) { + nsight_aftermath_tracker.Initialize(); + + diagnostics_nv = { + .sType = VK_STRUCTURE_TYPE_DEVICE_DIAGNOSTICS_CONFIG_CREATE_INFO_NV, + .pNext = &features2, + .flags = VK_DEVICE_DIAGNOSTICS_CONFIG_ENABLE_SHADER_DEBUG_INFO_BIT_NV | + VK_DEVICE_DIAGNOSTICS_CONFIG_ENABLE_RESOURCE_TRACKING_BIT_NV | + VK_DEVICE_DIAGNOSTICS_CONFIG_ENABLE_AUTOMATIC_CHECKPOINTS_BIT_NV, + }; + first_next = &diagnostics_nv; + } + + logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld); if (!logical) { LOG_ERROR(Render_Vulkan, "Failed to create logical device"); return false; @@ -304,8 +406,19 @@ bool VKDevice::Create() { CollectTelemetryParameters(); + if (ext_extended_dynamic_state && IsRDNA(properties.deviceName, driver_id)) { + // AMD's proprietary driver supports VK_EXT_extended_dynamic_state but on RDNA devices it + // seems to cause stability issues + LOG_WARNING( + Render_Vulkan, + "Blacklisting AMD proprietary on RDNA devices from VK_EXT_extended_dynamic_state"); + ext_extended_dynamic_state = false; + } + graphics_queue = logical.GetQueue(graphics_family); present_queue = logical.GetQueue(present_family); + + use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); return true; } @@ -344,17 +457,12 @@ VkFormat VKDevice::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFla void VKDevice::ReportLoss() const { LOG_CRITICAL(Render_Vulkan, "Device loss occured!"); - // Wait some time to let the log flush - std::this_thread::sleep_for(std::chrono::seconds{1}); - - if (!nv_device_diagnostic_checkpoints) { - return; - } + // Wait for the log to flush and for Nsight Aftermath to dump the results + std::this_thread::sleep_for(std::chrono::seconds{3}); +} - [[maybe_unused]] const std::vector data = 
graphics_queue.GetCheckpointDataNV(dld); - // Catch here in debug builds (or with optimizations disabled) the last graphics pipeline to be - // executed. It can be done on a debugger by evaluating the expression: - // *(VKGraphicsPipeline*)data[0] +void VKDevice::SaveShader(const std::vector<u32>& spirv) const { + nsight_aftermath_tracker.SaveShader(spirv); } bool VKDevice::IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) const { @@ -492,43 +600,44 @@ bool VKDevice::IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface) { std::vector<const char*> VKDevice::LoadExtensions() { std::vector<const char*> extensions; - const auto Test = [&](const VkExtensionProperties& extension, - std::optional<std::reference_wrapper<bool>> status, const char* name, - bool push) { - if (extension.extensionName != std::string_view(name)) { - return; - } - if (push) { - extensions.push_back(name); - } - if (status) { - status->get() = true; - } - }; - extensions.reserve(7 + REQUIRED_EXTENSIONS.size()); extensions.insert(extensions.begin(), REQUIRED_EXTENSIONS.begin(), REQUIRED_EXTENSIONS.end()); bool has_khr_shader_float16_int8{}; bool has_ext_subgroup_size_control{}; bool has_ext_transform_feedback{}; - for (const auto& extension : physical.EnumerateDeviceExtensionProperties()) { - Test(extension, khr_uniform_buffer_standard_layout, + bool has_ext_custom_border_color{}; + bool has_ext_extended_dynamic_state{}; + for (const VkExtensionProperties& extension : physical.EnumerateDeviceExtensionProperties()) { + const auto test = [&](std::optional<std::reference_wrapper<bool>> status, const char* name, + bool push) { + if (extension.extensionName != std::string_view(name)) { + return; + } + if (push) { + extensions.push_back(name); + } + if (status) { + status->get() = true; + } + }; + test(nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); + test(khr_uniform_buffer_standard_layout, VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); - Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, - false); - Test(extension, ext_depth_range_unrestricted, - VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); - Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); - Test(extension, ext_shader_viewport_index_layer, - VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); - Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, - false); - Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, - false); + test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); + test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); + test(ext_shader_viewport_index_layer, VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, + true); + test(has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false); + test(has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, false); + test(has_ext_extended_dynamic_state, VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); + if (instance_version >= VK_API_VERSION_1_1) { + test(has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); + } if (Settings::values.renderer_debug) { - Test(extension, nv_device_diagnostic_checkpoints, - VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME, true); + test(nv_device_diagnostics_config, 
VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, + true); } } @@ -598,6 +707,32 @@ std::vector<const char*> VKDevice::LoadExtensions() { } } + if (has_ext_custom_border_color) { + VkPhysicalDeviceCustomBorderColorFeaturesEXT border_features; + border_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT; + border_features.pNext = nullptr; + features.pNext = &border_features; + physical.GetFeatures2KHR(features); + + if (border_features.customBorderColors && border_features.customBorderColorWithoutFormat) { + extensions.push_back(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); + ext_custom_border_color = true; + } + } + + if (has_ext_extended_dynamic_state) { + VkPhysicalDeviceExtendedDynamicStateFeaturesEXT dynamic_state; + dynamic_state.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT; + dynamic_state.pNext = nullptr; + features.pNext = &dynamic_state; + physical.GetFeatures2KHR(features); + + if (dynamic_state.extendedDynamicState) { + extensions.push_back(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); + ext_extended_dynamic_state = true; + } + } + return extensions; } @@ -633,14 +768,21 @@ void VKDevice::SetupFeatures() { } void VKDevice::CollectTelemetryParameters() { - VkPhysicalDeviceDriverPropertiesKHR driver; - driver.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR; - driver.pNext = nullptr; + VkPhysicalDeviceDriverPropertiesKHR driver{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR, + .pNext = nullptr, + .driverID = {}, + .driverName = {}, + .driverInfo = {}, + .conformanceVersion = {}, + }; - VkPhysicalDeviceProperties2KHR properties; - properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; - properties.pNext = &driver; - physical.GetProperties2KHR(properties); + VkPhysicalDeviceProperties2KHR device_properties{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &driver, + .properties = {}, + }; + physical.GetProperties2KHR(device_properties); driver_id = driver.driverID; vendor_name = driver.driverName; @@ -648,23 +790,26 @@ void VKDevice::CollectTelemetryParameters() { const std::vector extensions = physical.EnumerateDeviceExtensionProperties(); reported_extensions.reserve(std::size(extensions)); for (const auto& extension : extensions) { - reported_extensions.push_back(extension.extensionName); + reported_extensions.emplace_back(extension.extensionName); } } std::vector<VkDeviceQueueCreateInfo> VKDevice::GetDeviceQueueCreateInfos() const { static constexpr float QUEUE_PRIORITY = 1.0f; - std::unordered_set<u32> unique_queue_families = {graphics_family, present_family}; + std::unordered_set<u32> unique_queue_families{graphics_family, present_family}; std::vector<VkDeviceQueueCreateInfo> queue_cis; + queue_cis.reserve(unique_queue_families.size()); for (const u32 queue_family : unique_queue_families) { - VkDeviceQueueCreateInfo& ci = queue_cis.emplace_back(); - ci.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.queueFamilyIndex = queue_family; - ci.queueCount = 1; + auto& ci = queue_cis.emplace_back(VkDeviceQueueCreateInfo{ + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queueFamilyIndex = queue_family, + .queueCount = 1, + .pQueuePriorities = nullptr, + }); ci.pQueuePriorities = &QUEUE_PRIORITY; } diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 60d64572a..4286673d9 100644 --- 
a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -10,6 +10,7 @@ #include <vector> #include "common/common_types.h" +#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { @@ -23,8 +24,8 @@ const u32 GuestWarpSize = 32; /// Handles data specific to a physical device. class VKDevice final { public: - explicit VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld); + explicit VKDevice(VkInstance instance, u32 instance_version, vk::PhysicalDevice physical, + VkSurfaceKHR surface, const vk::InstanceDispatch& dld); ~VKDevice(); /// Initializes the device. Returns true on success. @@ -43,6 +44,9 @@ public: /// Reports a device loss. void ReportLoss() const; + /// Reports a shader to Nsight Aftermath. + void SaveShader(const std::vector<u32>& spirv) const; + /// Returns the dispatch loader with direct function pointers of the device. const vk::DeviceDispatch& GetDispatchLoader() const { return dld; @@ -78,13 +82,13 @@ public: return present_family; } - /// Returns true if the device is integrated with the host CPU. - bool IsIntegrated() const { - return properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU; + /// Returns the current instance Vulkan API version in Vulkan-formatted version numbers. + u32 InstanceApiVersion() const { + return instance_version; } /// Returns the current Vulkan API version provided in Vulkan-formatted version numbers. - u32 GetApiVersion() const { + u32 ApiVersion() const { return properties.apiVersion; } @@ -123,6 +127,11 @@ public: return properties.limits.maxPushConstantsSize; } + /// Returns the maximum size for shared memory. + u32 GetMaxComputeSharedMemorySize() const { + return properties.limits.maxComputeSharedMemorySize; + } + /// Returns true if ASTC is natively supported. bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -148,6 +157,11 @@ public: return is_formatless_image_load_supported; } + /// Returns true if the device supports VK_NV_viewport_swizzle. + bool IsNvViewportSwizzleSupported() const { + return nv_viewport_swizzle; + } + /// Returns true if the device supports VK_EXT_scalar_block_layout. bool IsKhrUniformBufferStandardLayoutSupported() const { return khr_uniform_buffer_standard_layout; @@ -173,9 +187,14 @@ public: return ext_transform_feedback; } - /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints. - bool IsNvDeviceDiagnosticCheckpoints() const { - return nv_device_diagnostic_checkpoints; + /// Returns true if the device supports VK_EXT_custom_border_color. + bool IsExtCustomBorderColorSupported() const { + return ext_custom_border_color; + } + + /// Returns true if the device supports VK_EXT_extended_dynamic_state. + bool IsExtExtendedDynamicStateSupported() const { + return ext_extended_dynamic_state; } /// Returns the vendor name reported from Vulkan. @@ -188,6 +207,11 @@ public: return reported_extensions; } + /// Returns true if the setting for async shader compilation is enabled. + bool UseAsynchronousShaders() const { + return use_asynchronous_shaders; + } + /// Checks if the physical device is suitable. static bool IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface); @@ -220,6 +244,7 @@ private: vk::Device logical; ///< Logical device. vk::Queue graphics_queue; ///< Main graphics queue. vk::Queue present_queue; ///< Main present queue. 
+ u32 instance_version{}; ///< Vulkan instance version. u32 graphics_family{}; ///< Main graphics queue family index. u32 present_family{}; ///< Main present queue family index. VkDriverIdKHR driver_id{}; ///< Driver ID. @@ -228,12 +253,18 @@ private: bool is_float16_supported{}; ///< Support for float16 arithmetics. bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. + bool nv_viewport_swizzle{}; ///< Support for VK_NV_viewport_swizzle. bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback. - bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. + bool ext_custom_border_color{}; ///< Support for VK_EXT_custom_border_color. + bool ext_extended_dynamic_state{}; ///< Support for VK_EXT_extended_dynamic_state. + bool nv_device_diagnostics_config{}; ///< Support for VK_NV_device_diagnostics_config. + + // Asynchronous Graphics Pipeline setting + bool use_asynchronous_shaders{}; ///< Setting to use asynchronous shaders/graphics pipeline // Telemetry parameters std::string vendor_name; ///< Device's driver name. @@ -241,6 +272,9 @@ private: /// Format properties dictionary. std::unordered_map<VkFormat, VkFormatProperties> format_properties; + + /// Nsight Aftermath GPU crash tracker + NsightAftermathTracker nsight_aftermath_tracker; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp new file mode 100644 index 000000000..5babbdd0b --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -0,0 +1,101 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
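
The new instance_version member added above stores the Vulkan-packed version number, which is why LoadExtensions can compare it directly against VK_API_VERSION_1_1. A hypothetical caller would decode it with the standard VK_VERSION_* macros:

    // Hypothetical helper; assumes a constructed VKDevice and the usual logging include.
    void LogInstanceVersion(const VKDevice& device) {
        const u32 version = device.InstanceApiVersion();
        LOG_INFO(Render_Vulkan, "Instance Vulkan version {}.{}.{}", VK_VERSION_MAJOR(version),
                 VK_VERSION_MINOR(version), VK_VERSION_PATCH(version));
    }
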
+ +#include <memory> +#include <thread> + +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_fence_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +InnerFence::InnerFence(const VKDevice& device, VKScheduler& scheduler, u32 payload, bool is_stubbed) + : VideoCommon::FenceBase(payload, is_stubbed), device{device}, scheduler{scheduler} {} + +InnerFence::InnerFence(const VKDevice& device, VKScheduler& scheduler, GPUVAddr address, + u32 payload, bool is_stubbed) + : VideoCommon::FenceBase(address, payload, is_stubbed), device{device}, scheduler{scheduler} {} + +InnerFence::~InnerFence() = default; + +void InnerFence::Queue() { + if (is_stubbed) { + return; + } + ASSERT(!event); + + event = device.GetLogical().CreateEvent(); + ticks = scheduler.CurrentTick(); + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([event = *event](vk::CommandBuffer cmdbuf) { + cmdbuf.SetEvent(event, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + }); +} + +bool InnerFence::IsSignaled() const { + if (is_stubbed) { + return true; + } + ASSERT(event); + return IsEventSignalled(); +} + +void InnerFence::Wait() { + if (is_stubbed) { + return; + } + ASSERT(event); + + if (ticks >= scheduler.CurrentTick()) { + scheduler.Flush(); + } + while (!IsEventSignalled()) { + std::this_thread::yield(); + } +} + +bool InnerFence::IsEventSignalled() const { + switch (const VkResult result = event.GetStatus()) { + case VK_EVENT_SET: + return true; + case VK_EVENT_RESET: + return false; + default: + throw vk::Exception(result); + } +} + +VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + Tegra::MemoryManager& memory_manager, VKTextureCache& texture_cache, + VKBufferCache& buffer_cache, VKQueryCache& query_cache, + const VKDevice& device_, VKScheduler& scheduler_) + : GenericFenceManager(rasterizer, gpu, texture_cache, buffer_cache, query_cache), + device{device_}, scheduler{scheduler_} {} + +Fence VKFenceManager::CreateFence(u32 value, bool is_stubbed) { + return std::make_shared<InnerFence>(device, scheduler, value, is_stubbed); +} + +Fence VKFenceManager::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) { + return std::make_shared<InnerFence>(device, scheduler, addr, value, is_stubbed); +} + +void VKFenceManager::QueueFence(Fence& fence) { + fence->Queue(); +} + +bool VKFenceManager::IsFenceSignaled(Fence& fence) const { + return fence->IsSignaled(); +} + +void VKFenceManager::WaitFence(Fence& fence) { + fence->Wait(); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h new file mode 100644 index 000000000..1547d6d30 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -0,0 +1,75 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
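
InnerFence builds the fence out of a plain VkEvent: Queue() records a set-event into the scheduler's command buffer and IsEventSignalled() polls it from the host, with Wait() flushing the scheduler first when the recorded tick has not been submitted yet (an event that was never submitted would spin forever). Assuming the vk::CommandBuffer and vk::Event wrappers forward straight to the core API, the two operations reduce to:

    #include <vulkan/vulkan.h>

    // Recorded into the current command buffer by InnerFence::Queue().
    void RecordFenceSignal(VkCommandBuffer cmdbuf, VkEvent event) {
        vkCmdSetEvent(cmdbuf, event, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
    }

    // Host-side poll performed by InnerFence::IsEventSignalled().
    bool IsEventSet(VkDevice device, VkEvent event) {
        return vkGetEventStatus(device, event) == VK_EVENT_SET;
    }
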
+ +#pragma once + +#include <memory> + +#include "video_core/fence_manager.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Core { +class System; +} + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class VKBufferCache; +class VKDevice; +class VKQueryCache; +class VKScheduler; +class VKTextureCache; + +class InnerFence : public VideoCommon::FenceBase { +public: + explicit InnerFence(const VKDevice& device, VKScheduler& scheduler, u32 payload, + bool is_stubbed); + explicit InnerFence(const VKDevice& device, VKScheduler& scheduler, GPUVAddr address, + u32 payload, bool is_stubbed); + ~InnerFence(); + + void Queue(); + + bool IsSignaled() const; + + void Wait(); + +private: + bool IsEventSignalled() const; + + const VKDevice& device; + VKScheduler& scheduler; + vk::Event event; + u64 ticks = 0; +}; +using Fence = std::shared_ptr<InnerFence>; + +using GenericFenceManager = + VideoCommon::FenceManager<Fence, VKTextureCache, VKBufferCache, VKQueryCache>; + +class VKFenceManager final : public GenericFenceManager { +public: + explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + Tegra::MemoryManager& memory_manager, VKTextureCache& texture_cache, + VKBufferCache& buffer_cache, VKQueryCache& query_cache, + const VKDevice& device, VKScheduler& scheduler); + +protected: + Fence CreateFence(u32 value, bool is_stubbed) override; + Fence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) override; + void QueueFence(Fence& fence) override; + bool IsFenceSignaled(Fence& fence) const override; + void WaitFence(Fence& fence) override; + +private: + const VKDevice& device; + VKScheduler& scheduler; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index b540b838d..0e8f9c352 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -2,11 +2,11 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
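
The header above only supplies the Vulkan-specific hooks; the policy of when fences are created, queued and waited on lives in the shared VideoCommon::FenceManager template from video_core/fence_manager.h, which is not part of this hunk. A hypothetical wiring sketch using nothing beyond the constructor signature shown:

    // All argument names are placeholders for objects the renderer already owns.
    Vulkan::VKFenceManager fence_manager(rasterizer, gpu, memory_manager, texture_cache,
                                         buffer_cache, query_cache, device, scheduler);

The texture, buffer and query caches appear as template parameters presumably so the shared code can flush them around fence signaling without depending on either backend.
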
+#include <algorithm> #include <array> #include <cstring> #include <vector> -#include "common/assert.h" #include "common/common_types.h" #include "common/microprofile.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" @@ -26,16 +26,17 @@ MICROPROFILE_DECLARE(Vulkan_PipelineCache); namespace { -VkStencilOpState GetStencilFaceState(const FixedPipelineState::StencilFace& face) { - VkStencilOpState state; - state.failOp = MaxwellToVK::StencilOp(face.action_stencil_fail); - state.passOp = MaxwellToVK::StencilOp(face.action_depth_pass); - state.depthFailOp = MaxwellToVK::StencilOp(face.action_depth_fail); - state.compareOp = MaxwellToVK::ComparisonOp(face.test_func); - state.compareMask = 0; - state.writeMask = 0; - state.reference = 0; - return state; +template <class StencilFace> +VkStencilOpState GetStencilFaceState(const StencilFace& face) { + return { + .failOp = MaxwellToVK::StencilOp(face.ActionStencilFail()), + .passOp = MaxwellToVK::StencilOp(face.ActionDepthPass()), + .depthFailOp = MaxwellToVK::StencilOp(face.ActionDepthFail()), + .compareOp = MaxwellToVK::ComparisonOp(face.TestFunc()), + .compareMask = 0, + .writeMask = 0, + .reference = 0, + }; } bool SupportsPrimitiveRestart(VkPrimitiveTopology topology) { @@ -50,6 +51,24 @@ bool SupportsPrimitiveRestart(VkPrimitiveTopology topology) { topology) == std::end(unsupported_topologies); } +VkViewportSwizzleNV UnpackViewportSwizzle(u16 swizzle) { + union Swizzle { + u32 raw; + BitField<0, 3, Maxwell::ViewportSwizzle> x; + BitField<4, 3, Maxwell::ViewportSwizzle> y; + BitField<8, 3, Maxwell::ViewportSwizzle> z; + BitField<12, 3, Maxwell::ViewportSwizzle> w; + }; + const Swizzle unpacked{swizzle}; + + return { + .x = MaxwellToVK::ViewportSwizzle(unpacked.x), + .y = MaxwellToVK::ViewportSwizzle(unpacked.y), + .z = MaxwellToVK::ViewportSwizzle(unpacked.z), + .w = MaxwellToVK::ViewportSwizzle(unpacked.w), + }; +} + } // Anonymous namespace VKGraphicsPipeline::VKGraphicsPipeline(const VKDevice& device, VKScheduler& scheduler, @@ -59,15 +78,14 @@ VKGraphicsPipeline::VKGraphicsPipeline(const VKDevice& device, VKScheduler& sche const GraphicsPipelineCacheKey& key, vk::Span<VkDescriptorSetLayoutBinding> bindings, const SPIRVProgram& program) - : device{device}, scheduler{scheduler}, fixed_state{key.fixed_state}, hash{key.Hash()}, + : device{device}, scheduler{scheduler}, cache_key{key}, hash{cache_key.Hash()}, descriptor_set_layout{CreateDescriptorSetLayout(bindings)}, descriptor_allocator{descriptor_pool, *descriptor_set_layout}, update_descriptor_queue{update_descriptor_queue}, layout{CreatePipelineLayout()}, descriptor_template{CreateDescriptorUpdateTemplate(program)}, modules{CreateShaderModules( program)}, - renderpass{renderpass_cache.GetRenderPass(key.renderpass_params)}, pipeline{CreatePipeline( - key.renderpass_params, - program)} {} + renderpass{renderpass_cache.GetRenderPass(cache_key.renderpass_params)}, + pipeline{CreatePipeline(cache_key.renderpass_params, program)} {} VKGraphicsPipeline::~VKGraphicsPipeline() = default; @@ -75,31 +93,33 @@ VkDescriptorSet VKGraphicsPipeline::CommitDescriptorSet() { if (!descriptor_template) { return {}; } - const auto set = descriptor_allocator.Commit(scheduler.GetFence()); + const VkDescriptorSet set = descriptor_allocator.Commit(); update_descriptor_queue.Send(*descriptor_template, set); return set; } vk::DescriptorSetLayout VKGraphicsPipeline::CreateDescriptorSetLayout( vk::Span<VkDescriptorSetLayoutBinding> bindings) const { - VkDescriptorSetLayoutCreateInfo ci; - ci.sType = 
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.bindingCount = bindings.size(); - ci.pBindings = bindings.data(); + const VkDescriptorSetLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = bindings.size(), + .pBindings = bindings.data(), + }; return device.GetLogical().CreateDescriptorSetLayout(ci); } vk::PipelineLayout VKGraphicsPipeline::CreatePipelineLayout() const { - VkPipelineLayoutCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.setLayoutCount = 1; - ci.pSetLayouts = descriptor_set_layout.address(); - ci.pushConstantRangeCount = 0; - ci.pPushConstantRanges = nullptr; + const VkPipelineLayoutCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = descriptor_set_layout.address(), + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }; return device.GetLogical().CreatePipelineLayout(ci); } @@ -118,26 +138,29 @@ vk::DescriptorUpdateTemplateKHR VKGraphicsPipeline::CreateDescriptorUpdateTempla return {}; } - VkDescriptorUpdateTemplateCreateInfoKHR ci; - ci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; - ci.pNext = nullptr; - ci.flags = 0; - ci.descriptorUpdateEntryCount = static_cast<u32>(template_entries.size()); - ci.pDescriptorUpdateEntries = template_entries.data(); - ci.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; - ci.descriptorSetLayout = *descriptor_set_layout; - ci.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - ci.pipelineLayout = *layout; - ci.set = DESCRIPTOR_SET; + const VkDescriptorUpdateTemplateCreateInfoKHR ci{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .descriptorUpdateEntryCount = static_cast<u32>(template_entries.size()), + .pDescriptorUpdateEntries = template_entries.data(), + .templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, + .descriptorSetLayout = *descriptor_set_layout, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .pipelineLayout = *layout, + .set = DESCRIPTOR_SET, + }; return device.GetLogical().CreateDescriptorUpdateTemplateKHR(ci); } std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( const SPIRVProgram& program) const { - VkShaderModuleCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; + VkShaderModuleCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = 0, + }; std::vector<vk::ShaderModule> modules; modules.reserve(Maxwell::MaxShaderStage); @@ -147,6 +170,8 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( continue; } + device.SaveShader(stage->code); + ci.codeSize = stage->code.size() * sizeof(u32); ci.pCode = stage->code.data(); modules.push_back(device.GetLogical().CreateShaderModule(ci)); @@ -156,186 +181,251 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpass_params, const SPIRVProgram& program) const { - const auto& vi = fixed_state.vertex_input; - const auto& ia = fixed_state.input_assembly; - const auto& ds = fixed_state.depth_stencil; - const auto& cd = fixed_state.color_blending; - const auto& ts = fixed_state.tessellation; - 
const auto& rs = fixed_state.rasterizer; + const auto& state = cache_key.fixed_state; + const auto& viewport_swizzles = state.viewport_swizzles; + + FixedPipelineState::DynamicState dynamic; + if (device.IsExtExtendedDynamicStateSupported()) { + // Insert dummy values, as long as they are valid they don't matter as extended dynamic + // state is ignored + dynamic.raw1 = 0; + dynamic.raw2 = 0; + for (FixedPipelineState::VertexBinding& binding : dynamic.vertex_bindings) { + // Enable all vertex bindings + binding.raw = 0; + binding.enabled.Assign(1); + } + } else { + dynamic = state.dynamic_state; + } std::vector<VkVertexInputBindingDescription> vertex_bindings; std::vector<VkVertexInputBindingDivisorDescriptionEXT> vertex_binding_divisors; - for (std::size_t i = 0; i < vi.num_bindings; ++i) { - const auto& binding = vi.bindings[i]; - const bool instanced = binding.divisor != 0; + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { + const auto& binding = dynamic.vertex_bindings[index]; + if (!binding.enabled) { + continue; + } + const bool instanced = state.binding_divisors[index] != 0; const auto rate = instanced ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX; - auto& vertex_binding = vertex_bindings.emplace_back(); - vertex_binding.binding = binding.index; - vertex_binding.stride = binding.stride; - vertex_binding.inputRate = rate; + vertex_bindings.push_back({ + .binding = static_cast<u32>(index), + .stride = binding.stride, + .inputRate = rate, + }); if (instanced) { - auto& binding_divisor = vertex_binding_divisors.emplace_back(); - binding_divisor.binding = binding.index; - binding_divisor.divisor = binding.divisor; + vertex_binding_divisors.push_back({ + .binding = static_cast<u32>(index), + .divisor = state.binding_divisors[index], + }); } } std::vector<VkVertexInputAttributeDescription> vertex_attributes; const auto& input_attributes = program[0]->entries.attributes; - for (std::size_t i = 0; i < vi.num_attributes; ++i) { - const auto& attribute = vi.attributes[i]; - if (input_attributes.find(attribute.index) == input_attributes.end()) { + for (std::size_t index = 0; index < state.attributes.size(); ++index) { + const auto& attribute = state.attributes[index]; + if (!attribute.enabled) { + continue; + } + if (input_attributes.find(static_cast<u32>(index)) == input_attributes.end()) { // Skip attributes not used by the vertex shaders. 
continue; } - auto& vertex_attribute = vertex_attributes.emplace_back(); - vertex_attribute.location = attribute.index; - vertex_attribute.binding = attribute.buffer; - vertex_attribute.format = MaxwellToVK::VertexFormat(attribute.type, attribute.size); - vertex_attribute.offset = attribute.offset; + vertex_attributes.push_back({ + .location = static_cast<u32>(index), + .binding = attribute.buffer, + .format = MaxwellToVK::VertexFormat(attribute.Type(), attribute.Size()), + .offset = attribute.offset, + }); } - VkPipelineVertexInputStateCreateInfo vertex_input_ci; - vertex_input_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertex_input_ci.pNext = nullptr; - vertex_input_ci.flags = 0; - vertex_input_ci.vertexBindingDescriptionCount = static_cast<u32>(vertex_bindings.size()); - vertex_input_ci.pVertexBindingDescriptions = vertex_bindings.data(); - vertex_input_ci.vertexAttributeDescriptionCount = static_cast<u32>(vertex_attributes.size()); - vertex_input_ci.pVertexAttributeDescriptions = vertex_attributes.data(); - - VkPipelineVertexInputDivisorStateCreateInfoEXT input_divisor_ci; - input_divisor_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT; - input_divisor_ci.pNext = nullptr; - input_divisor_ci.vertexBindingDivisorCount = static_cast<u32>(vertex_binding_divisors.size()); - input_divisor_ci.pVertexBindingDivisors = vertex_binding_divisors.data(); + VkPipelineVertexInputStateCreateInfo vertex_input_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .vertexBindingDescriptionCount = static_cast<u32>(vertex_bindings.size()), + .pVertexBindingDescriptions = vertex_bindings.data(), + .vertexAttributeDescriptionCount = static_cast<u32>(vertex_attributes.size()), + .pVertexAttributeDescriptions = vertex_attributes.data(), + }; + + const VkPipelineVertexInputDivisorStateCreateInfoEXT input_divisor_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT, + .pNext = nullptr, + .vertexBindingDivisorCount = static_cast<u32>(vertex_binding_divisors.size()), + .pVertexBindingDivisors = vertex_binding_divisors.data(), + }; if (!vertex_binding_divisors.empty()) { vertex_input_ci.pNext = &input_divisor_ci; } - VkPipelineInputAssemblyStateCreateInfo input_assembly_ci; - input_assembly_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - input_assembly_ci.pNext = nullptr; - input_assembly_ci.flags = 0; - input_assembly_ci.topology = MaxwellToVK::PrimitiveTopology(device, ia.topology); - input_assembly_ci.primitiveRestartEnable = - ia.primitive_restart_enable && SupportsPrimitiveRestart(input_assembly_ci.topology); - - VkPipelineTessellationStateCreateInfo tessellation_ci; - tessellation_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO; - tessellation_ci.pNext = nullptr; - tessellation_ci.flags = 0; - tessellation_ci.patchControlPoints = ts.patch_control_points; - - VkPipelineViewportStateCreateInfo viewport_ci; - viewport_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewport_ci.pNext = nullptr; - viewport_ci.flags = 0; - viewport_ci.viewportCount = Maxwell::NumViewports; - viewport_ci.pViewports = nullptr; - viewport_ci.scissorCount = Maxwell::NumViewports; - viewport_ci.pScissors = nullptr; - - VkPipelineRasterizationStateCreateInfo rasterization_ci; - rasterization_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rasterization_ci.pNext = nullptr; - rasterization_ci.flags = 
0; - rasterization_ci.depthClampEnable = rs.depth_clamp_enable; - rasterization_ci.rasterizerDiscardEnable = VK_FALSE; - rasterization_ci.polygonMode = VK_POLYGON_MODE_FILL; - rasterization_ci.cullMode = - rs.cull_enable ? MaxwellToVK::CullFace(rs.cull_face) : VK_CULL_MODE_NONE; - rasterization_ci.frontFace = MaxwellToVK::FrontFace(rs.front_face); - rasterization_ci.depthBiasEnable = rs.depth_bias_enable; - rasterization_ci.depthBiasConstantFactor = 0.0f; - rasterization_ci.depthBiasClamp = 0.0f; - rasterization_ci.depthBiasSlopeFactor = 0.0f; - rasterization_ci.lineWidth = 1.0f; - - VkPipelineMultisampleStateCreateInfo multisample_ci; - multisample_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisample_ci.pNext = nullptr; - multisample_ci.flags = 0; - multisample_ci.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; - multisample_ci.sampleShadingEnable = VK_FALSE; - multisample_ci.minSampleShading = 0.0f; - multisample_ci.pSampleMask = nullptr; - multisample_ci.alphaToCoverageEnable = VK_FALSE; - multisample_ci.alphaToOneEnable = VK_FALSE; - - VkPipelineDepthStencilStateCreateInfo depth_stencil_ci; - depth_stencil_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - depth_stencil_ci.pNext = nullptr; - depth_stencil_ci.flags = 0; - depth_stencil_ci.depthTestEnable = ds.depth_test_enable; - depth_stencil_ci.depthWriteEnable = ds.depth_write_enable; - depth_stencil_ci.depthCompareOp = ds.depth_test_enable - ? MaxwellToVK::ComparisonOp(ds.depth_test_function) - : VK_COMPARE_OP_ALWAYS; - depth_stencil_ci.depthBoundsTestEnable = ds.depth_bounds_enable; - depth_stencil_ci.stencilTestEnable = ds.stencil_enable; - depth_stencil_ci.front = GetStencilFaceState(ds.front_stencil); - depth_stencil_ci.back = GetStencilFaceState(ds.back_stencil); - depth_stencil_ci.minDepthBounds = 0.0f; - depth_stencil_ci.maxDepthBounds = 0.0f; + const auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, state.topology); + const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .topology = MaxwellToVK::PrimitiveTopology(device, state.topology), + .primitiveRestartEnable = state.primitive_restart_enable != 0 && + SupportsPrimitiveRestart(input_assembly_topology), + }; + + const VkPipelineTessellationStateCreateInfo tessellation_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .patchControlPoints = state.patch_control_points_minus_one.Value() + 1, + }; + + VkPipelineViewportStateCreateInfo viewport_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .viewportCount = Maxwell::NumViewports, + .pViewports = nullptr, + .scissorCount = Maxwell::NumViewports, + .pScissors = nullptr, + }; + + std::array<VkViewportSwizzleNV, Maxwell::NumViewports> swizzles; + std::transform(viewport_swizzles.begin(), viewport_swizzles.end(), swizzles.begin(), + UnpackViewportSwizzle); + VkPipelineViewportSwizzleStateCreateInfoNV swizzle_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SWIZZLE_STATE_CREATE_INFO_NV, + .pNext = nullptr, + .flags = 0, + .viewportCount = Maxwell::NumViewports, + .pViewportSwizzles = swizzles.data(), + }; + if (device.IsNvViewportSwizzleSupported()) { + viewport_ci.pNext = &swizzle_ci; + } + + const VkPipelineRasterizationStateCreateInfo rasterization_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + 
.pNext = nullptr, + .flags = 0, + .depthClampEnable = + static_cast<VkBool32>(state.depth_clamp_disabled == 0 ? VK_TRUE : VK_FALSE), + .rasterizerDiscardEnable = + static_cast<VkBool32>(state.rasterize_enable == 0 ? VK_TRUE : VK_FALSE), + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = + dynamic.cull_enable ? MaxwellToVK::CullFace(dynamic.CullFace()) : VK_CULL_MODE_NONE, + .frontFace = MaxwellToVK::FrontFace(dynamic.FrontFace()), + .depthBiasEnable = state.depth_bias_enable, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f, + }; + + const VkPipelineMultisampleStateCreateInfo multisample_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 0.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_FALSE, + .alphaToOneEnable = VK_FALSE, + }; + + const VkPipelineDepthStencilStateCreateInfo depth_stencil_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .depthTestEnable = dynamic.depth_test_enable, + .depthWriteEnable = dynamic.depth_write_enable, + .depthCompareOp = dynamic.depth_test_enable + ? MaxwellToVK::ComparisonOp(dynamic.DepthTestFunc()) + : VK_COMPARE_OP_ALWAYS, + .depthBoundsTestEnable = dynamic.depth_bounds_enable, + .stencilTestEnable = dynamic.stencil_enable, + .front = GetStencilFaceState(dynamic.front), + .back = GetStencilFaceState(dynamic.back), + .minDepthBounds = 0.0f, + .maxDepthBounds = 0.0f, + }; std::array<VkPipelineColorBlendAttachmentState, Maxwell::NumRenderTargets> cb_attachments; - const std::size_t num_attachments = - std::min(cd.attachments_count, renderpass_params.color_attachments.size()); - for (std::size_t i = 0; i < num_attachments; ++i) { - static constexpr std::array component_table = { - VK_COLOR_COMPONENT_R_BIT, VK_COLOR_COMPONENT_G_BIT, VK_COLOR_COMPONENT_B_BIT, - VK_COLOR_COMPONENT_A_BIT}; - const auto& blend = cd.attachments[i]; + const auto num_attachments = static_cast<std::size_t>(renderpass_params.num_color_attachments); + for (std::size_t index = 0; index < num_attachments; ++index) { + static constexpr std::array COMPONENT_TABLE{ + VK_COLOR_COMPONENT_R_BIT, + VK_COLOR_COMPONENT_G_BIT, + VK_COLOR_COMPONENT_B_BIT, + VK_COLOR_COMPONENT_A_BIT, + }; + const auto& blend = state.attachments[index]; VkColorComponentFlags color_components = 0; - for (std::size_t j = 0; j < component_table.size(); ++j) { - if (blend.components[j]) { - color_components |= component_table[j]; + for (std::size_t i = 0; i < COMPONENT_TABLE.size(); ++i) { + if (blend.Mask()[i]) { + color_components |= COMPONENT_TABLE[i]; } } - VkPipelineColorBlendAttachmentState& attachment = cb_attachments[i]; - attachment.blendEnable = blend.enable; - attachment.srcColorBlendFactor = MaxwellToVK::BlendFactor(blend.src_rgb_func); - attachment.dstColorBlendFactor = MaxwellToVK::BlendFactor(blend.dst_rgb_func); - attachment.colorBlendOp = MaxwellToVK::BlendEquation(blend.rgb_equation); - attachment.srcAlphaBlendFactor = MaxwellToVK::BlendFactor(blend.src_a_func); - attachment.dstAlphaBlendFactor = MaxwellToVK::BlendFactor(blend.dst_a_func); - attachment.alphaBlendOp = MaxwellToVK::BlendEquation(blend.a_equation); - attachment.colorWriteMask = color_components; + cb_attachments[index] = { + .blendEnable = blend.enable != 0, + .srcColorBlendFactor = MaxwellToVK::BlendFactor(blend.SourceRGBFactor()), + 
.dstColorBlendFactor = MaxwellToVK::BlendFactor(blend.DestRGBFactor()), + .colorBlendOp = MaxwellToVK::BlendEquation(blend.EquationRGB()), + .srcAlphaBlendFactor = MaxwellToVK::BlendFactor(blend.SourceAlphaFactor()), + .dstAlphaBlendFactor = MaxwellToVK::BlendFactor(blend.DestAlphaFactor()), + .alphaBlendOp = MaxwellToVK::BlendEquation(blend.EquationAlpha()), + .colorWriteMask = color_components, + }; } - VkPipelineColorBlendStateCreateInfo color_blend_ci; - color_blend_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - color_blend_ci.pNext = nullptr; - color_blend_ci.flags = 0; - color_blend_ci.logicOpEnable = VK_FALSE; - color_blend_ci.logicOp = VK_LOGIC_OP_COPY; - color_blend_ci.attachmentCount = static_cast<u32>(num_attachments); - color_blend_ci.pAttachments = cb_attachments.data(); - std::memset(color_blend_ci.blendConstants, 0, sizeof(color_blend_ci.blendConstants)); - - static constexpr std::array dynamic_states = { + const VkPipelineColorBlendStateCreateInfo color_blend_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .logicOpEnable = VK_FALSE, + .logicOp = VK_LOGIC_OP_COPY, + .attachmentCount = static_cast<u32>(num_attachments), + .pAttachments = cb_attachments.data(), + .blendConstants = {}, + }; + + std::vector dynamic_states{ VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR, VK_DYNAMIC_STATE_DEPTH_BIAS, VK_DYNAMIC_STATE_BLEND_CONSTANTS, VK_DYNAMIC_STATE_DEPTH_BOUNDS, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, - VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE}; + VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE, + }; + if (device.IsExtExtendedDynamicStateSupported()) { + static constexpr std::array extended{ + VK_DYNAMIC_STATE_CULL_MODE_EXT, + VK_DYNAMIC_STATE_FRONT_FACE_EXT, + VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT, + VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, + VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT, + VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT, + VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT, + VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT, + VK_DYNAMIC_STATE_STENCIL_OP_EXT, + }; + dynamic_states.insert(dynamic_states.end(), extended.begin(), extended.end()); + } - VkPipelineDynamicStateCreateInfo dynamic_state_ci; - dynamic_state_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; - dynamic_state_ci.pNext = nullptr; - dynamic_state_ci.flags = 0; - dynamic_state_ci.dynamicStateCount = static_cast<u32>(dynamic_states.size()); - dynamic_state_ci.pDynamicStates = dynamic_states.data(); + const VkPipelineDynamicStateCreateInfo dynamic_state_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .dynamicStateCount = static_cast<u32>(dynamic_states.size()), + .pDynamicStates = dynamic_states.data(), + }; - VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci; - subgroup_size_ci.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT; - subgroup_size_ci.pNext = nullptr; - subgroup_size_ci.requiredSubgroupSize = GuestWarpSize; + const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + .pNext = nullptr, + .requiredSubgroupSize = GuestWarpSize, + }; std::vector<VkPipelineShaderStageCreateInfo> shader_stages; std::size_t module_index = 0; @@ -343,6 +433,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa 
if (!program[stage]) { continue; } + VkPipelineShaderStageCreateInfo& stage_ci = shader_stages.emplace_back(); stage_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; stage_ci.pNext = nullptr; @@ -357,26 +448,27 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa } } - VkGraphicsPipelineCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.stageCount = static_cast<u32>(shader_stages.size()); - ci.pStages = shader_stages.data(); - ci.pVertexInputState = &vertex_input_ci; - ci.pInputAssemblyState = &input_assembly_ci; - ci.pTessellationState = &tessellation_ci; - ci.pViewportState = &viewport_ci; - ci.pRasterizationState = &rasterization_ci; - ci.pMultisampleState = &multisample_ci; - ci.pDepthStencilState = &depth_stencil_ci; - ci.pColorBlendState = &color_blend_ci; - ci.pDynamicState = &dynamic_state_ci; - ci.layout = *layout; - ci.renderPass = renderpass; - ci.subpass = 0; - ci.basePipelineHandle = nullptr; - ci.basePipelineIndex = 0; + const VkGraphicsPipelineCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .stageCount = static_cast<u32>(shader_stages.size()), + .pStages = shader_stages.data(), + .pVertexInputState = &vertex_input_ci, + .pInputAssemblyState = &input_assembly_ci, + .pTessellationState = &tessellation_ci, + .pViewportState = &viewport_ci, + .pRasterizationState = &rasterization_ci, + .pMultisampleState = &multisample_ci, + .pDepthStencilState = &depth_stencil_ci, + .pColorBlendState = &color_blend_ci, + .pDynamicState = &dynamic_state_ci, + .layout = *layout, + .renderPass = renderpass, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = 0, + }; return device.GetLogical().CreateGraphicsPipeline(ci); } diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 7aba70960..58aa35efd 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -5,16 +5,13 @@ #pragma once #include <array> -#include <memory> #include <optional> -#include <unordered_map> #include <vector> #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -22,7 +19,27 @@ namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -struct GraphicsPipelineCacheKey; +struct GraphicsPipelineCacheKey { + RenderPassParams renderpass_params; + u32 padding; + std::array<GPUVAddr, Maxwell::MaxShaderProgram> shaders; + FixedPipelineState fixed_state; + + std::size_t Hash() const noexcept; + + bool operator==(const GraphicsPipelineCacheKey& rhs) const noexcept; + + bool operator!=(const GraphicsPipelineCacheKey& rhs) const noexcept { + return !operator==(rhs); + } + + std::size_t Size() const noexcept { + return sizeof(renderpass_params) + sizeof(padding) + sizeof(shaders) + fixed_state.Size(); + } +}; +static_assert(std::has_unique_object_representations_v<GraphicsPipelineCacheKey>); +static_assert(std::is_trivially_copyable_v<GraphicsPipelineCacheKey>); +static_assert(std::is_trivially_constructible_v<GraphicsPipelineCacheKey>); class 
VKDescriptorPool; class VKDevice; @@ -57,6 +74,10 @@ public: return renderpass; } + GraphicsPipelineCacheKey GetCacheKey() const { + return cache_key; + } + private: vk::DescriptorSetLayout CreateDescriptorSetLayout( vk::Span<VkDescriptorSetLayoutBinding> bindings) const; @@ -73,7 +94,7 @@ private: const VKDevice& device; VKScheduler& scheduler; - const FixedPipelineState fixed_state; + const GraphicsPipelineCacheKey cache_key; const u64 hash; vk::DescriptorSetLayout descriptor_set_layout; diff --git a/src/video_core/renderer_vulkan/vk_image.cpp b/src/video_core/renderer_vulkan/vk_image.cpp index 9bceb3861..1c418ea17 100644 --- a/src/video_core/renderer_vulkan/vk_image.cpp +++ b/src/video_core/renderer_vulkan/vk_image.cpp @@ -102,21 +102,29 @@ bool VKImage::HasChanged(u32 base_layer, u32 num_layers, u32 base_level, u32 num void VKImage::CreatePresentView() { // Image type has to be 2D to be presented. - VkImageViewCreateInfo image_view_ci; - image_view_ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - image_view_ci.pNext = nullptr; - image_view_ci.flags = 0; - image_view_ci.image = *image; - image_view_ci.viewType = VK_IMAGE_VIEW_TYPE_2D; - image_view_ci.format = format; - image_view_ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, - VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY}; - image_view_ci.subresourceRange.aspectMask = aspect_mask; - image_view_ci.subresourceRange.baseMipLevel = 0; - image_view_ci.subresourceRange.levelCount = 1; - image_view_ci.subresourceRange.baseArrayLayer = 0; - image_view_ci.subresourceRange.layerCount = 1; - present_view = device.GetLogical().CreateImageView(image_view_ci); + present_view = device.GetLogical().CreateImageView({ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = *image, + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = format, + .components = + { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }); } VKImage::SubrangeState& VKImage::GetSubrangeState(u32 layer, u32 level) noexcept { diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp new file mode 100644 index 000000000..ae26e558d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -0,0 +1,56 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
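
GraphicsPipelineCacheKey, added to vk_graphics_pipeline.h above, declares Hash() and operator== and then static_asserts unique object representations and trivial copyability; those asserts are what make it legal to hash and compare the key as raw bytes. A sketch of a Hash() consistent with them, assuming a generic byte hasher (the Common::CityHash64 name is an assumption):

    // Sketch only: byte-wise hash over the meaningful prefix reported by Size().
    std::size_t GraphicsPipelineCacheKey::Hash() const noexcept {
        const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), Size());
        return static_cast<std::size_t>(hash);
    }

Equality can likewise be a std::memcmp over the same Size() bytes, which is presumably why the u32 padding member is spelled out explicitly instead of being left to the compiler.
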
+ +#include <atomic> +#include <chrono> + +#include "core/settings.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +using namespace std::chrono_literals; + +MasterSemaphore::MasterSemaphore(const VKDevice& device) { + static constexpr VkSemaphoreTypeCreateInfoKHR semaphore_type_ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR, + .pNext = nullptr, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR, + .initialValue = 0, + }; + static constexpr VkSemaphoreCreateInfo semaphore_ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &semaphore_type_ci, + .flags = 0, + }; + semaphore = device.GetLogical().CreateSemaphore(semaphore_ci); + + if (!Settings::values.renderer_debug) { + return; + } + // Validation layers have a bug where they fail to track resource usage when using timeline + // semaphores and synchronizing with GetSemaphoreCounterValueKHR. To workaround this issue, have + // a separate thread waiting for each timeline semaphore value. + debug_thread = std::thread([this] { + u64 counter = 0; + while (!shutdown) { + if (semaphore.Wait(counter, 10'000'000)) { + ++counter; + } + } + }); +} + +MasterSemaphore::~MasterSemaphore() { + shutdown = true; + + // This thread might not be started + if (debug_thread.joinable()) { + debug_thread.join(); + } +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h new file mode 100644 index 000000000..0e93706d7 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -0,0 +1,70 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <atomic> +#include <thread> + +#include "common/common_types.h" +#include "video_core/renderer_vulkan/wrapper.h" + +namespace Vulkan { + +class VKDevice; + +class MasterSemaphore { +public: + explicit MasterSemaphore(const VKDevice& device); + ~MasterSemaphore(); + + /// Returns the current logical tick. + [[nodiscard]] u64 CurrentTick() const noexcept { + return current_tick; + } + + /// Returns the timeline semaphore handle. + [[nodiscard]] VkSemaphore Handle() const noexcept { + return *semaphore; + } + + /// Returns true when a tick has been hit by the GPU. + [[nodiscard]] bool IsFree(u64 tick) { + return gpu_tick >= tick; + } + + /// Advance to the logical tick. + void NextTick() noexcept { + ++current_tick; + } + + /// Refresh the known GPU tick + void Refresh() { + gpu_tick = semaphore.GetCounter(); + } + + /// Waits for a tick to be hit on the GPU + void Wait(u64 tick) { + // No need to wait if the GPU is ahead of the tick + if (IsFree(tick)) { + return; + } + // Update the GPU tick and try again + Refresh(); + if (IsFree(tick)) { + return; + } + // If none of the above is hit, fallback to a regular wait + semaphore.Wait(tick); + } + +private: + vk::Semaphore semaphore; ///< Timeline semaphore. + std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. + std::atomic<u64> current_tick{1}; ///< Current logical tick. + std::atomic<bool> shutdown{false}; ///< True when the object is being destroyed. + std::thread debug_thread; ///< Debug thread to workaround validation layer bugs. 
+}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.cpp b/src/video_core/renderer_vulkan/vk_memory_manager.cpp index 6a9e658bf..24c8960ac 100644 --- a/src/video_core/renderer_vulkan/vk_memory_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_memory_manager.cpp @@ -118,8 +118,7 @@ private: }; VKMemoryManager::VKMemoryManager(const VKDevice& device) - : device{device}, properties{device.GetPhysical().GetMemoryProperties()}, - is_memory_unified{GetMemoryUnified(properties)} {} + : device{device}, properties{device.GetPhysical().GetMemoryProperties()} {} VKMemoryManager::~VKMemoryManager() = default; @@ -179,13 +178,12 @@ bool VKMemoryManager::AllocMemory(VkMemoryPropertyFlags wanted_properties, u32 t }(); // Try to allocate found type. - VkMemoryAllocateInfo memory_ai; - memory_ai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - memory_ai.pNext = nullptr; - memory_ai.allocationSize = size; - memory_ai.memoryTypeIndex = type; - - vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory(memory_ai); + vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({ + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = nullptr, + .allocationSize = size, + .memoryTypeIndex = type, + }); if (!memory) { LOG_CRITICAL(Render_Vulkan, "Device allocation failed!"); return false; @@ -209,16 +207,6 @@ VKMemoryCommit VKMemoryManager::TryAllocCommit(const VkMemoryRequirements& requi return {}; } -bool VKMemoryManager::GetMemoryUnified(const VkPhysicalDeviceMemoryProperties& properties) { - for (u32 heap_index = 0; heap_index < properties.memoryHeapCount; ++heap_index) { - if (!(properties.memoryHeaps[heap_index].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)) { - // Memory is considered unified when heaps are device local only. - return false; - } - } - return true; -} - VKMemoryCommitImpl::VKMemoryCommitImpl(const VKDevice& device, VKMemoryAllocation* allocation, const vk::DeviceMemory& memory, u64 begin, u64 end) : device{device}, memory{memory}, interval{begin, end}, allocation{allocation} {} diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.h b/src/video_core/renderer_vulkan/vk_memory_manager.h index 35ee54d30..1af88e3d4 100644 --- a/src/video_core/renderer_vulkan/vk_memory_manager.h +++ b/src/video_core/renderer_vulkan/vk_memory_manager.h @@ -32,7 +32,7 @@ public: * memory. When passing false, it will try to allocate device local memory. * @returns A memory commit. */ - VKMemoryCommit Commit(const VkMemoryRequirements& reqs, bool host_visible); + VKMemoryCommit Commit(const VkMemoryRequirements& requirements, bool host_visible); /// Commits memory required by the buffer and binds it. VKMemoryCommit Commit(const vk::Buffer& buffer, bool host_visible); @@ -40,11 +40,6 @@ public: /// Commits memory required by the image and binds it. VKMemoryCommit Commit(const vk::Image& image, bool host_visible); - /// Returns true if the memory allocations are done always in host visible and coherent memory. - bool IsMemoryUnified() const { - return is_memory_unified; - } - private: /// Allocates a chunk of memory. bool AllocMemory(VkMemoryPropertyFlags wanted_properties, u32 type_mask, u64 size); @@ -53,12 +48,8 @@ private: VKMemoryCommit TryAllocCommit(const VkMemoryRequirements& requirements, VkMemoryPropertyFlags wanted_properties); - /// Returns true if the device uses an unified memory model. - static bool GetMemoryUnified(const VkPhysicalDeviceMemoryProperties& properties); - - const VKDevice& device; ///< Device handler. 
- const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. - const bool is_memory_unified; ///< True if memory model is unified. + const VKDevice& device; ///< Device handler. + const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. std::vector<std::unique_ptr<VKMemoryAllocation>> allocations; ///< Current allocations. }; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 90e3a8edd..dedc9c466 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -22,17 +22,24 @@ #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/compiler_settings.h" +#include "video_core/shader/memory_util.h" +#include "video_core/shader_cache.h" +#include "video_core/shader_notify.h" namespace Vulkan { MICROPROFILE_DECLARE(Vulkan_PipelineCache); using Tegra::Engines::ShaderType; +using VideoCommon::Shader::GetShaderAddress; +using VideoCommon::Shader::GetShaderCode; +using VideoCommon::Shader::KERNEL_MAIN_OFFSET; +using VideoCommon::Shader::ProgramCode; +using VideoCommon::Shader::STAGE_MAIN_OFFSET; namespace { @@ -40,65 +47,12 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; +constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ VideoCommon::Shader::CompileDepth::FullDecompile}; -/// Gets the address for the specified shader stage program -GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) { - const auto& gpu{system.GPU().Maxwell3D()}; - const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]}; - return gpu.regs.code_address.CodeAddress() + shader_config.offset; -} - -/// Gets if the current instruction offset is a scheduler instruction -constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { - // Sched instructions appear once every 4 instructions. - constexpr std::size_t SchedPeriod = 4; - const std::size_t absolute_offset = offset - main_offset; - return (absolute_offset % SchedPeriod) == 0; -} - -/// Calculates the size of a program stream -std::size_t CalculateProgramSize(const ProgramCode& program, bool is_compute) { - const std::size_t start_offset = is_compute ? 0 : 10; - // This is the encoded version of BRA that jumps to itself. All Nvidia - // shaders end with one. 
- constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL; - constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL; - std::size_t offset = start_offset; - while (offset < program.size()) { - const u64 instruction = program[offset]; - if (!IsSchedInstruction(offset, start_offset)) { - if ((instruction & mask) == self_jumping_branch) { - // End on Maxwell's "nop" instruction - break; - } - if (instruction == 0) { - break; - } - } - ++offset; - } - // The last instruction is included in the program size - return std::min(offset + 1, program.size()); -} - -/// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr, - const u8* host_ptr, bool is_compute) { - ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); - ASSERT_OR_EXECUTE(host_ptr != nullptr, { - std::fill(program_code.begin(), program_code.end(), 0); - return program_code; - }); - memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(), - program_code.size() * sizeof(u64)); - program_code.resize(CalculateProgramSize(program_code, is_compute)); - return program_code; -} - constexpr std::size_t GetStageFromProgram(std::size_t program) { return program == 0 ? 0 : program - 1; } @@ -133,14 +87,15 @@ void AddBindings(std::vector<VkDescriptorSetLayoutBinding>& bindings, u32& bindi u32 count = 1; if constexpr (descriptor_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { // Combined image samplers can be arrayed. - count = container[i].Size(); + count = container[i].size; } - VkDescriptorSetLayoutBinding& entry = bindings.emplace_back(); - entry.binding = binding++; - entry.descriptorType = descriptor_type; - entry.descriptorCount = count; - entry.stageFlags = stage_flags; - entry.pImmutableSamplers = nullptr; + bindings.push_back({ + .binding = binding++, + .descriptorType = descriptor_type, + .descriptorCount = count, + .stageFlags = stage_flags, + .pImmutableSamplers = nullptr, + }); } } @@ -153,96 +108,133 @@ u32 FillDescriptorLayout(const ShaderEntries& entries, u32 binding = base_binding; AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers); AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers); - AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers); + AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels); AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers); + AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels); AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images); return binding; } } // Anonymous namespace -CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, - GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code, - u32 main_offset) - : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)}, - registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset, - compiler_settings, registry}, - entries{GenerateShaderEntries(shader_ir)} {} - -CachedShader::~CachedShader() = default; - -Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( - Core::System& system, Tegra::Engines::ShaderType stage) { - if (stage == Tegra::Engines::ShaderType::Compute) { - return system.GPU().KeplerCompute(); - } else { - return system.GPU().Maxwell3D(); - } +std::size_t GraphicsPipelineCacheKey::Hash() const noexcept { + const u64 hash = 
Common::CityHash64(reinterpret_cast<const char*>(this), Size()); + return static_cast<std::size_t>(hash); +} + +bool GraphicsPipelineCacheKey::operator==(const GraphicsPipelineCacheKey& rhs) const noexcept { + return std::memcmp(&rhs, this, Size()) == 0; +} + +std::size_t ComputePipelineCacheKey::Hash() const noexcept { + const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this); + return static_cast<std::size_t>(hash); +} + +bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) const noexcept { + return std::memcmp(&rhs, this, sizeof *this) == 0; } -VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache) - : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, - descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, - renderpass_cache{renderpass_cache} {} +Shader::Shader(Tegra::Engines::ConstBufferEngineInterface& engine, Tegra::Engines::ShaderType stage, + GPUVAddr gpu_addr_, VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code_, + u32 main_offset) + : gpu_addr(gpu_addr_), program_code(std::move(program_code_)), registry(stage, engine), + shader_ir(program_code, main_offset, compiler_settings, registry), + entries(GenerateShaderEntries(shader_ir)) {} + +Shader::~Shader() = default; + +VKPipelineCache::VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const VKDevice& device_, + VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + VKRenderPassCache& renderpass_cache_) + : VideoCommon::ShaderCache<Shader>{rasterizer}, gpu{gpu_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, device{device_}, + scheduler{scheduler_}, descriptor_pool{descriptor_pool_}, + update_descriptor_queue{update_descriptor_queue_}, renderpass_cache{renderpass_cache_} {} VKPipelineCache::~VKPipelineCache() = default; -std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { - const auto& gpu = system.GPU().Maxwell3D(); +std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { + std::array<Shader*, Maxwell::MaxShaderProgram> shaders{}; - std::array<Shader, Maxwell::MaxShaderProgram> shaders; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto program{static_cast<Maxwell::ShaderProgram>(index)}; // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { continue; } - auto& memory_manager{system.GPU().MemoryManager()}; - const GPUVAddr program_addr{GetShaderAddress(system, program)}; - const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr); + const GPUVAddr gpu_addr{GetShaderAddress(maxwell3d, program)}; + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : nullptr; - if (!shader) { - const auto host_ptr{memory_manager.GetPointer(program_addr)}; - // No shader found - create a new one - constexpr u32 stage_offset = 10; - const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 
0 : index - 1); - auto code = GetShaderCode(memory_manager, program_addr, host_ptr, false); + Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); + if (!result) { + const u8* const host_ptr{gpu_memory.GetPointer(gpu_addr)}; - shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr, - std::move(code), stage_offset); - Register(shader); + // No shader found - create a new one + static constexpr u32 stage_offset = STAGE_MAIN_OFFSET; + const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1); + ProgramCode code = GetShaderCode(gpu_memory, gpu_addr, host_ptr, false); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader = std::make_unique<Shader>(maxwell3d, stage, gpu_addr, *cpu_addr, + std::move(code), stage_offset); + result = shader.get(); + + if (cpu_addr) { + Register(std::move(shader), *cpu_addr, size_in_bytes); + } else { + null_shader = std::move(shader); + } } - shaders[index] = std::move(shader); + shaders[index] = result; } return last_shaders = shaders; } -VKGraphicsPipeline& VKPipelineCache::GetGraphicsPipeline(const GraphicsPipelineCacheKey& key) { +VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( + const GraphicsPipelineCacheKey& key, VideoCommon::Shader::AsyncShaders& async_shaders) { MICROPROFILE_SCOPE(Vulkan_PipelineCache); if (last_graphics_pipeline && last_graphics_key == key) { - return *last_graphics_pipeline; + return last_graphics_pipeline; } last_graphics_key = key; + if (device.UseAsynchronousShaders() && async_shaders.IsShaderAsync(gpu)) { + std::unique_lock lock{pipeline_cache}; + const auto [pair, is_cache_miss] = graphics_cache.try_emplace(key); + if (is_cache_miss) { + gpu.ShaderNotify().MarkSharderBuilding(); + LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); + const auto [program, bindings] = DecompileShaders(key.fixed_state); + async_shaders.QueueVulkanShader(this, device, scheduler, descriptor_pool, + update_descriptor_queue, renderpass_cache, bindings, + program, key); + } + last_graphics_pipeline = pair->second.get(); + return last_graphics_pipeline; + } + const auto [pair, is_cache_miss] = graphics_cache.try_emplace(key); auto& entry = pair->second; if (is_cache_miss) { + gpu.ShaderNotify().MarkSharderBuilding(); LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); - const auto [program, bindings] = DecompileShaders(key); + const auto [program, bindings] = DecompileShaders(key.fixed_state); entry = std::make_unique<VKGraphicsPipeline>(device, scheduler, descriptor_pool, update_descriptor_queue, renderpass_cache, key, bindings, program); + gpu.ShaderNotify().MarkShaderComplete(); } - return *(last_graphics_pipeline = entry.get()); + last_graphics_pipeline = entry.get(); + return last_graphics_pipeline; } VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCacheKey& key) { @@ -255,29 +247,39 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach } LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); - auto& memory_manager = system.GPU().MemoryManager(); - const auto program_addr = key.shader; + const GPUVAddr gpu_addr = key.shader; - const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : nullptr; + Shader* shader = cpu_addr ? 
TryGet(*cpu_addr) : null_kernel.get(); if (!shader) { // No shader found - create a new one - const auto host_ptr = memory_manager.GetPointer(program_addr); - - auto code = GetShaderCode(memory_manager, program_addr, host_ptr, true); - constexpr u32 kernel_main_offset = 0; - shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute, - program_addr, *cpu_addr, std::move(code), - kernel_main_offset); - Register(shader); - } + const auto host_ptr = gpu_memory.GetPointer(gpu_addr); - Specialization specialization; - specialization.workgroup_size = key.workgroup_size; - specialization.shared_memory_size = key.shared_memory_size; + ProgramCode code = GetShaderCode(gpu_memory, gpu_addr, host_ptr, true); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader_info = std::make_unique<Shader>(kepler_compute, ShaderType::Compute, gpu_addr, + *cpu_addr, std::move(code), KERNEL_MAIN_OFFSET); + shader = shader_info.get(); + if (cpu_addr) { + Register(std::move(shader_info), *cpu_addr, size_in_bytes); + } else { + null_kernel = std::move(shader_info); + } + } + + const Specialization specialization{ + .base_binding = 0, + .workgroup_size = key.workgroup_size, + .shared_memory_size = key.shared_memory_size, + .point_size = std::nullopt, + .enabled_attributes = {}, + .attribute_types = {}, + .ndc_minus_one_to_one = false, + }; const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute, shader->GetRegistry(), specialization), shader->GetEntries()}; @@ -286,7 +288,13 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach return *entry; } -void VKPipelineCache::Unregister(const Shader& shader) { +void VKPipelineCache::EmplacePipeline(std::unique_ptr<VKGraphicsPipeline> pipeline) { + gpu.ShaderNotify().MarkShaderComplete(); + std::unique_lock lock{pipeline_cache}; + graphics_cache.at(pipeline->GetCacheKey()) = std::move(pipeline); +} + +void VKPipelineCache::OnShaderRemoval(Shader* shader) { bool finished = false; const auto Finish = [&] { // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and @@ -318,25 +326,23 @@ void VKPipelineCache::Unregister(const Shader& shader) { Finish(); it = compute_cache.erase(it); } - - RasterizerCache::Unregister(shader); } std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> -VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { - const auto& fixed_state = key.fixed_state; - auto& memory_manager = system.GPU().MemoryManager(); - const auto& gpu = system.GPU().Maxwell3D(); - +VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { Specialization specialization; - if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) { - ASSERT(fixed_state.input_assembly.point_size != 0.0f); - specialization.point_size = fixed_state.input_assembly.point_size; + if (fixed_state.topology == Maxwell::PrimitiveTopology::Points) { + float point_size; + std::memcpy(&point_size, &fixed_state.point_size, sizeof(float)); + specialization.point_size = point_size; + ASSERT(point_size != 0.0f); } for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) { - specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type; + const auto& attribute = fixed_state.attributes[i]; + specialization.enabled_attributes[i] = attribute.enabled.Value() != 0; + specialization.attribute_types[i] = attribute.Type(); } - specialization.ndc_minus_one_to_one = 
fixed_state.rasterizer.ndc_minus_one_to_one; + specialization.ndc_minus_one_to_one = fixed_state.ndc_minus_one_to_one; SPIRVProgram program; std::vector<VkDescriptorSetLayoutBinding> bindings; @@ -345,18 +351,16 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { const auto program_enum = static_cast<Maxwell::ShaderProgram>(index); // Skip stages that are not enabled - if (!gpu.regs.IsShaderConfigEnabled(index)) { + if (!maxwell3d.regs.IsShaderConfigEnabled(index)) { continue; } - const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum); - const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr); - const auto shader = TryGet(*cpu_addr); - ASSERT(shader); + const GPUVAddr gpu_addr = GetShaderAddress(maxwell3d, program_enum); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 - const auto program_type = GetShaderType(program_enum); + const ShaderType program_type = GetShaderType(program_enum); const auto& entries = shader->GetEntries(); program[stage] = { Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), @@ -383,14 +387,15 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3 if constexpr (descriptor_type == COMBINED_IMAGE_SAMPLER) { for (u32 i = 0; i < count; ++i) { - const u32 num_samplers = container[i].Size(); - VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back(); - entry.dstBinding = binding; - entry.dstArrayElement = 0; - entry.descriptorCount = num_samplers; - entry.descriptorType = descriptor_type; - entry.offset = offset; - entry.stride = entry_size; + const u32 num_samplers = container[i].size; + template_entries.push_back({ + .dstBinding = binding, + .dstArrayElement = 0, + .descriptorCount = num_samplers, + .descriptorType = descriptor_type, + .offset = offset, + .stride = entry_size, + }); ++binding; offset += num_samplers * entry_size; @@ -398,26 +403,29 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3 return; } - if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) { - // Nvidia has a bug where updating multiple uniform texels at once causes the driver to - // crash. + if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER || + descriptor_type == STORAGE_TEXEL_BUFFER) { + // Nvidia has a bug where updating multiple texels at once causes the driver to crash. 
+ // Note: Fixed in driver Windows 443.24, Linux 440.66.15 for (u32 i = 0; i < count; ++i) { - VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back(); - entry.dstBinding = binding + i; - entry.dstArrayElement = 0; - entry.descriptorCount = 1; - entry.descriptorType = descriptor_type; - entry.offset = offset + i * entry_size; - entry.stride = entry_size; + template_entries.push_back({ + .dstBinding = binding + i, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = descriptor_type, + .offset = static_cast<std::size_t>(offset + i * entry_size), + .stride = entry_size, + }); } } else if (count > 0) { - VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back(); - entry.dstBinding = binding; - entry.dstArrayElement = 0; - entry.descriptorCount = count; - entry.descriptorType = descriptor_type; - entry.offset = offset; - entry.stride = entry_size; + template_entries.push_back({ + .dstBinding = binding, + .dstArrayElement = 0, + .descriptorCount = count, + .descriptorType = descriptor_type, + .offset = offset, + .stride = entry_size, + }); } offset += count * entry_size; binding += count; @@ -428,8 +436,9 @@ void FillDescriptorUpdateTemplateEntries( std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) { AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers); AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers); - AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers); + AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels); AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers); + AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels); AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images); } diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 7ccdb7083..e558e6658 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -7,7 +7,6 @@ #include <array> #include <cstddef> #include <memory> -#include <tuple> #include <type_traits> #include <unordered_map> #include <utility> @@ -18,16 +17,16 @@ #include "common/common_types.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader/async_shaders.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" -#include "video_core/surface.h" +#include "video_core/shader_cache.h" namespace Core { class System; @@ -39,54 +38,27 @@ class RasterizerVulkan; class VKComputePipeline; class VKDescriptorPool; class VKDevice; -class VKFence; class VKScheduler; class VKUpdateDescriptorQueue; -class CachedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using ProgramCode = std::vector<u64>; +struct ComputePipelineCacheKey { + GPUVAddr shader; + u32 shared_memory_size; + 
std::array<u32, 3> workgroup_size; -struct GraphicsPipelineCacheKey { - FixedPipelineState fixed_state; - std::array<GPUVAddr, Maxwell::MaxShaderProgram> shaders; - RenderPassParams renderpass_params; + std::size_t Hash() const noexcept; - std::size_t Hash() const noexcept { - std::size_t hash = fixed_state.Hash(); - for (const auto& shader : shaders) { - boost::hash_combine(hash, shader); - } - boost::hash_combine(hash, renderpass_params.Hash()); - return hash; - } + bool operator==(const ComputePipelineCacheKey& rhs) const noexcept; - bool operator==(const GraphicsPipelineCacheKey& rhs) const noexcept { - return std::tie(fixed_state, shaders, renderpass_params) == - std::tie(rhs.fixed_state, rhs.shaders, rhs.renderpass_params); - } -}; - -struct ComputePipelineCacheKey { - GPUVAddr shader{}; - u32 shared_memory_size{}; - std::array<u32, 3> workgroup_size{}; - - std::size_t Hash() const noexcept { - return static_cast<std::size_t>(shader) ^ - ((static_cast<std::size_t>(shared_memory_size) >> 7) << 40) ^ - static_cast<std::size_t>(workgroup_size[0]) ^ - (static_cast<std::size_t>(workgroup_size[1]) << 16) ^ - (static_cast<std::size_t>(workgroup_size[2]) << 24); - } - - bool operator==(const ComputePipelineCacheKey& rhs) const noexcept { - return std::tie(shader, shared_memory_size, workgroup_size) == - std::tie(rhs.shader, rhs.shared_memory_size, rhs.workgroup_size); + bool operator!=(const ComputePipelineCacheKey& rhs) const noexcept { + return !operator==(rhs); } }; +static_assert(std::has_unique_object_representations_v<ComputePipelineCacheKey>); +static_assert(std::is_trivially_copyable_v<ComputePipelineCacheKey>); +static_assert(std::is_trivially_constructible_v<ComputePipelineCacheKey>); } // namespace Vulkan @@ -110,21 +82,22 @@ struct hash<Vulkan::ComputePipelineCacheKey> { namespace Vulkan { -class CachedShader final : public RasterizerCacheObject { +class Shader { public: - explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, - VAddr cpu_addr, ProgramCode program_code, u32 main_offset); - ~CachedShader(); + explicit Shader(Tegra::Engines::ConstBufferEngineInterface& engine, + Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, VAddr cpu_addr, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset); + ~Shader(); GPUVAddr GetGpuAddr() const { return gpu_addr; } - std::size_t GetSizeInBytes() const override { - return program_code.size() * sizeof(u64); + VideoCommon::Shader::ShaderIR& GetIR() { + return shader_ir; } - VideoCommon::Shader::ShaderIR& GetIR() { + const VideoCommon::Shader::ShaderIR& GetIR() const { return shader_ir; } @@ -132,61 +105,65 @@ public: return registry; } - const VideoCommon::Shader::ShaderIR& GetIR() const { - return shader_ir; - } - const ShaderEntries& GetEntries() const { return entries; } private: - static Tegra::Engines::ConstBufferEngineInterface& GetEngine(Core::System& system, - Tegra::Engines::ShaderType stage); - GPUVAddr gpu_addr{}; - ProgramCode program_code; + VideoCommon::Shader::ProgramCode program_code; VideoCommon::Shader::Registry registry; VideoCommon::Shader::ShaderIR shader_ir; ShaderEntries entries; }; -class VKPipelineCache final : public RasterizerCache<Shader> { +class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> { public: - explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, - const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, + explicit VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu, + 
Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::KeplerCompute& kepler_compute, + Tegra::MemoryManager& gpu_memory, const VKDevice& device, + VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue, VKRenderPassCache& renderpass_cache); - ~VKPipelineCache(); + ~VKPipelineCache() override; - std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); + std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders(); - VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key); + VKGraphicsPipeline* GetGraphicsPipeline(const GraphicsPipelineCacheKey& key, + VideoCommon::Shader::AsyncShaders& async_shaders); VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key); -protected: - void Unregister(const Shader& shader) override; + void EmplacePipeline(std::unique_ptr<VKGraphicsPipeline> pipeline); - void FlushObjectInner(const Shader& object) override {} +protected: + void OnShaderRemoval(Shader* shader) final; private: std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders( - const GraphicsPipelineCacheKey& key); + const FixedPipelineState& fixed_state); + + Tegra::GPU& gpu; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; - Core::System& system; const VKDevice& device; VKScheduler& scheduler; VKDescriptorPool& descriptor_pool; VKUpdateDescriptorQueue& update_descriptor_queue; VKRenderPassCache& renderpass_cache; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; + + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; GraphicsPipelineCacheKey last_graphics_key; VKGraphicsPipeline* last_graphics_pipeline = nullptr; + std::mutex pipeline_cache; std::unordered_map<GraphicsPipelineCacheKey, std::unique_ptr<VKGraphicsPipeline>> graphics_cache; std::unordered_map<ComputePipelineCacheKey, std::unique_ptr<VKComputePipeline>> compute_cache; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 0966c7ff7..ee2d871e3 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -4,41 +4,38 @@ #include <algorithm> #include <cstddef> -#include <cstdint> #include <utility> #include <vector> #include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { +using VideoCore::QueryType; + namespace { constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; -constexpr VkQueryType GetTarget(VideoCore::QueryType type) { +constexpr VkQueryType GetTarget(QueryType type) { return QUERY_TARGETS[static_cast<std::size_t>(type)]; } } // Anonymous namespace -QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {} +QueryPool::QueryPool(const VKDevice& device_, VKScheduler& scheduler, QueryType type_) + : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} QueryPool::~QueryPool() = default; -void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) { - device = &device_; - type = type_; -} - -std::pair<VkQueryPool, u32> QueryPool::Commit(VKFence& fence) { 
+std::pair<VkQueryPool, u32> QueryPool::Commit() { std::size_t index; do { - index = CommitResource(fence); + index = CommitResource(); } while (usage[index]); usage[index] = true; @@ -48,14 +45,14 @@ std::pair<VkQueryPool, u32> QueryPool::Commit(VKFence& fence) { void QueryPool::Allocate(std::size_t begin, std::size_t end) { usage.resize(end); - VkQueryPoolCreateInfo query_pool_ci; - query_pool_ci.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; - query_pool_ci.pNext = nullptr; - query_pool_ci.flags = 0; - query_pool_ci.queryType = GetTarget(type); - query_pool_ci.queryCount = static_cast<u32>(end - begin); - query_pool_ci.pipelineStatistics = 0; - pools.push_back(device->GetLogical().CreateQueryPool(query_pool_ci)); + pools.push_back(device.GetLogical().CreateQueryPool({ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queryType = GetTarget(type), + .queryCount = static_cast<u32>(end - begin), + .pipelineStatistics = 0, + })); } void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { @@ -69,30 +66,39 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; } -VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, +VKQueryCache::VKQueryCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, const VKDevice& device, VKScheduler& scheduler) - : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, - QueryPool>{system, rasterizer}, - device{device}, scheduler{scheduler} { - for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) { - query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i)); + : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, + HostCounter>{rasterizer, maxwell3d, gpu_memory}, + device{device}, scheduler{scheduler}, query_pools{ + QueryPool{device, scheduler, + QueryType::SamplesPassed}, + } {} + +VKQueryCache::~VKQueryCache() { + // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class + // destructor is called. The query cache should be redesigned to have a proper ownership model + // instead of using shared pointers. 
+ for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { + auto& stream = Stream(static_cast<QueryType>(query_type)); + stream.Update(false); + stream.Reset(); } } -VKQueryCache::~VKQueryCache() = default; - -std::pair<VkQueryPool, u32> VKQueryCache::AllocateQuery(VideoCore::QueryType type) { - return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence()); +std::pair<VkQueryPool, u32> VKQueryCache::AllocateQuery(QueryType type) { + return query_pools[static_cast<std::size_t>(type)].Commit(); } -void VKQueryCache::Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query) { +void VKQueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { query_pools[static_cast<std::size_t>(type)].Reserve(query); } HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type) + QueryType type) : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, - type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} { + type{type}, query{cache.AllocateQuery(type)}, tick{cache.Scheduler().CurrentTick()} { const vk::Device* logical = &cache.Device().GetLogical(); cache.Scheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) { logical->ResetQueryPoolEXT(query.first, query.second, 1); @@ -110,11 +116,22 @@ void HostCounter::EndQuery() { } u64 HostCounter::BlockingQuery() const { - if (ticks >= cache.Scheduler().Ticks()) { + if (tick >= cache.Scheduler().CurrentTick()) { cache.Scheduler().Flush(); } - return cache.Device().GetLogical().GetQueryResult<u64>( - query.first, query.second, VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + u64 data; + const VkResult result = cache.Device().GetLogical().GetQueryResults( + query.first, query.second, 1, sizeof(data), &data, sizeof(data), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + switch (result) { + case VK_SUCCESS: + return data; + case VK_ERROR_DEVICE_LOST: + cache.Device().ReportLoss(); + [[fallthrough]]; + default: + throw vk::Exception(result); + } } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index b63784f4b..2e57fb75d 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -5,14 +5,13 @@ #pragma once #include <cstddef> -#include <cstdint> #include <memory> #include <utility> #include <vector> #include "common/common_types.h" #include "video_core/query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace VideoCore { @@ -29,14 +28,12 @@ class VKScheduler; using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>; -class QueryPool final : public VKFencedPool { +class QueryPool final : public ResourcePool { public: - explicit QueryPool(); + explicit QueryPool(const VKDevice& device, VKScheduler& scheduler, VideoCore::QueryType type); ~QueryPool() override; - void Initialize(const VKDevice& device, VideoCore::QueryType type); - - std::pair<VkQueryPool, u32> Commit(VKFence& fence); + std::pair<VkQueryPool, u32> Commit(); void Reserve(std::pair<VkQueryPool, u32> query); @@ -46,18 +43,18 @@ protected: private: static constexpr std::size_t GROW_STEP = 512; - const VKDevice* device = nullptr; - VideoCore::QueryType type = {}; + const VKDevice& device; + const 
VideoCore::QueryType type; std::vector<vk::QueryPool> pools; std::vector<bool> usage; }; class VKQueryCache final - : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, - QueryPool> { + : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + explicit VKQueryCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, const VKDevice& device, VKScheduler& scheduler); ~VKQueryCache(); @@ -76,6 +73,7 @@ public: private: const VKDevice& device; VKScheduler& scheduler; + std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; }; class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> { @@ -92,7 +90,7 @@ private: VKQueryCache& cache; const VideoCore::QueryType type; const std::pair<VkQueryPool, u32> query; - const u64 ticks; + const u64 tick; }; class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 774ba1f26..e0fb8693f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -9,14 +9,14 @@ #include <vector> #include <boost/container/static_vector.hpp> -#include <boost/functional/hash.hpp> #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" +#include "common/scope_exit.h" #include "core/core.h" -#include "core/memory.h" +#include "core/settings.h" #include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" @@ -31,7 +31,6 @@ #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -39,6 +38,7 @@ #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader_cache.h" namespace Vulkan { @@ -64,20 +64,22 @@ VkViewport GetViewportState(const VKDevice& device, const Maxwell& regs, std::si const auto& src = regs.viewport_transform[index]; const float width = src.scale_x * 2.0f; const float height = src.scale_y * 2.0f; + const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; - VkViewport viewport; - viewport.x = src.translate_x - src.scale_x; - viewport.y = src.translate_y - src.scale_y; - viewport.width = width != 0.0f ? width : 1.0f; - viewport.height = height != 0.0f ? height : 1.0f; + VkViewport viewport{ + .x = src.translate_x - src.scale_x, + .y = src.translate_y - src.scale_y, + .width = width != 0.0f ? width : 1.0f, + .height = height != 0.0f ? height : 1.0f, + .minDepth = src.translate_z - src.scale_z * reduce_z, + .maxDepth = src.translate_z + src.scale_z, + }; - const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 
1.0f : 0.0f; - viewport.minDepth = src.translate_z - src.scale_z * reduce_z; - viewport.maxDepth = src.translate_z + src.scale_z; if (!device.IsExtDepthRangeUnrestrictedSupported()) { viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f); viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f); } + return viewport; } @@ -99,7 +101,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) { } std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses( - const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses; for (std::size_t i = 0; i < std::size(addresses); ++i) { addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0; @@ -118,14 +120,24 @@ template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, std::size_t stage, std::size_t index = 0) { const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); - if (entry.IsBindless()) { - const Tegra::Texture::TextureHandle tex_handle = - engine.AccessConstBuffer32(stage_type, entry.GetBuffer(), entry.GetOffset()); + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } + if (entry.is_bindless) { + const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset); return engine.GetTextureInfo(tex_handle); } const auto& gpu_profile = engine.AccessGuestDriverProfile(); const u32 entry_offset = static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); - const u32 offset = entry.GetOffset() + entry_offset; + const u32 offset = entry.offset + entry_offset; if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { return engine.GetStageTexture(stage_type, offset); } else { @@ -133,92 +145,144 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry } } +/// @brief Determine if an attachment to be updated has to preserve contents +/// @param is_clear True when a clear is being executed +/// @param regs 3D registers +/// @return True when the contents have to be preserved +bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) { + if (!is_clear) { + return true; + } + // First we have to make sure all clear masks are enabled. 
+ if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B || + !regs.clear_buffers.A) { + return true; + } + // If scissors are disabled, the whole screen is cleared + if (!regs.clear_flags.scissor) { + return false; + } + // Then we have to confirm scissor testing clears the whole image + const std::size_t index = regs.clear_buffers.RT; + const auto& scissor = regs.scissor_test[0]; + return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width || + scissor.max_y < regs.rt[index].height; +} + +/// @brief Determine if an attachment to be updated has to preserve contents +/// @param is_clear True when a clear is being executed +/// @param regs 3D registers +/// @return True when the contents have to be preserved +bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) { + // If we are not clearing, the contents have to be preserved + if (!is_clear) { + return true; + } + // For depth stencil clears we only have to confirm scissor test covers the whole image + if (!regs.clear_flags.scissor) { + return false; + } + // Make sure the clear cover the whole image + const auto& scissor = regs.scissor_test[0]; + return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width || + scissor.max_y < regs.zeta_height; +} + +template <std::size_t N> +std::array<VkDeviceSize, N> ExpandStrides(const std::array<u16, N>& strides) { + std::array<VkDeviceSize, N> expanded; + std::copy(strides.begin(), strides.end(), expanded.begin()); + return expanded; +} + } // Anonymous namespace class BufferBindings final { public: - void AddVertexBinding(const VkBuffer* buffer, VkDeviceSize offset) { - vertex.buffer_ptrs[vertex.num_buffers] = buffer; + void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, u32 stride) { + vertex.buffers[vertex.num_buffers] = buffer; vertex.offsets[vertex.num_buffers] = offset; + vertex.sizes[vertex.num_buffers] = size; + vertex.strides[vertex.num_buffers] = static_cast<u16>(stride); ++vertex.num_buffers; } - void SetIndexBinding(const VkBuffer* buffer, VkDeviceSize offset, VkIndexType type) { + void SetIndexBinding(VkBuffer buffer, VkDeviceSize offset, VkIndexType type) { index.buffer = buffer; index.offset = offset; index.type = type; } - void Bind(VKScheduler& scheduler) const { + void Bind(const VKDevice& device, VKScheduler& scheduler) const { // Use this large switch case to avoid dispatching more memory in the record lambda than // what we need. It looks horrible, but it's the best we can do on standard C++. 
switch (vertex.num_buffers) { case 0: - return BindStatic<0>(scheduler); + return BindStatic<0>(device, scheduler); case 1: - return BindStatic<1>(scheduler); + return BindStatic<1>(device, scheduler); case 2: - return BindStatic<2>(scheduler); + return BindStatic<2>(device, scheduler); case 3: - return BindStatic<3>(scheduler); + return BindStatic<3>(device, scheduler); case 4: - return BindStatic<4>(scheduler); + return BindStatic<4>(device, scheduler); case 5: - return BindStatic<5>(scheduler); + return BindStatic<5>(device, scheduler); case 6: - return BindStatic<6>(scheduler); + return BindStatic<6>(device, scheduler); case 7: - return BindStatic<7>(scheduler); + return BindStatic<7>(device, scheduler); case 8: - return BindStatic<8>(scheduler); + return BindStatic<8>(device, scheduler); case 9: - return BindStatic<9>(scheduler); + return BindStatic<9>(device, scheduler); case 10: - return BindStatic<10>(scheduler); + return BindStatic<10>(device, scheduler); case 11: - return BindStatic<11>(scheduler); + return BindStatic<11>(device, scheduler); case 12: - return BindStatic<12>(scheduler); + return BindStatic<12>(device, scheduler); case 13: - return BindStatic<13>(scheduler); + return BindStatic<13>(device, scheduler); case 14: - return BindStatic<14>(scheduler); + return BindStatic<14>(device, scheduler); case 15: - return BindStatic<15>(scheduler); + return BindStatic<15>(device, scheduler); case 16: - return BindStatic<16>(scheduler); + return BindStatic<16>(device, scheduler); case 17: - return BindStatic<17>(scheduler); + return BindStatic<17>(device, scheduler); case 18: - return BindStatic<18>(scheduler); + return BindStatic<18>(device, scheduler); case 19: - return BindStatic<19>(scheduler); + return BindStatic<19>(device, scheduler); case 20: - return BindStatic<20>(scheduler); + return BindStatic<20>(device, scheduler); case 21: - return BindStatic<21>(scheduler); + return BindStatic<21>(device, scheduler); case 22: - return BindStatic<22>(scheduler); + return BindStatic<22>(device, scheduler); case 23: - return BindStatic<23>(scheduler); + return BindStatic<23>(device, scheduler); case 24: - return BindStatic<24>(scheduler); + return BindStatic<24>(device, scheduler); case 25: - return BindStatic<25>(scheduler); + return BindStatic<25>(device, scheduler); case 26: - return BindStatic<26>(scheduler); + return BindStatic<26>(device, scheduler); case 27: - return BindStatic<27>(scheduler); + return BindStatic<27>(device, scheduler); case 28: - return BindStatic<28>(scheduler); + return BindStatic<28>(device, scheduler); case 29: - return BindStatic<29>(scheduler); + return BindStatic<29>(device, scheduler); case 30: - return BindStatic<30>(scheduler); + return BindStatic<30>(device, scheduler); case 31: - return BindStatic<31>(scheduler); + return BindStatic<31>(device, scheduler); case 32: - return BindStatic<32>(scheduler); + return BindStatic<32>(device, scheduler); } UNREACHABLE(); } @@ -227,26 +291,36 @@ private: // Some of these fields are intentionally left uninitialized to avoid initializing them twice. 
struct { std::size_t num_buffers = 0; - std::array<const VkBuffer*, Maxwell::NumVertexArrays> buffer_ptrs; + std::array<VkBuffer, Maxwell::NumVertexArrays> buffers; std::array<VkDeviceSize, Maxwell::NumVertexArrays> offsets; + std::array<VkDeviceSize, Maxwell::NumVertexArrays> sizes; + std::array<u16, Maxwell::NumVertexArrays> strides; } vertex; struct { - const VkBuffer* buffer = nullptr; + VkBuffer buffer = nullptr; VkDeviceSize offset; VkIndexType type; } index; template <std::size_t N> - void BindStatic(VKScheduler& scheduler) const { - if (index.buffer != nullptr) { - BindStatic<N, true>(scheduler); + void BindStatic(const VKDevice& device, VKScheduler& scheduler) const { + if (device.IsExtExtendedDynamicStateSupported()) { + if (index.buffer) { + BindStatic<N, true, true>(scheduler); + } else { + BindStatic<N, false, true>(scheduler); + } } else { - BindStatic<N, false>(scheduler); + if (index.buffer) { + BindStatic<N, true, false>(scheduler); + } else { + BindStatic<N, false, false>(scheduler); + } } } - template <std::size_t N, bool is_indexed> + template <std::size_t N, bool is_indexed, bool has_extended_dynamic_state> void BindStatic(VKScheduler& scheduler) const { static_assert(N <= Maxwell::NumVertexArrays); if constexpr (N == 0) { @@ -254,18 +328,39 @@ private: } std::array<VkBuffer, N> buffers; - std::transform(vertex.buffer_ptrs.begin(), vertex.buffer_ptrs.begin() + N, buffers.begin(), - [](const auto ptr) { return *ptr; }); - std::array<VkDeviceSize, N> offsets; + std::copy(vertex.buffers.begin(), vertex.buffers.begin() + N, buffers.begin()); std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin()); + if constexpr (has_extended_dynamic_state) { + // With extended dynamic states we can specify the length and stride of a vertex buffer + std::array<VkDeviceSize, N> sizes; + std::array<u16, N> strides; + std::copy(vertex.sizes.begin(), vertex.sizes.begin() + N, sizes.begin()); + std::copy(vertex.strides.begin(), vertex.strides.begin() + N, strides.begin()); + + if constexpr (is_indexed) { + scheduler.Record( + [buffers, offsets, sizes, strides, index = index](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type); + cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(), + offsets.data(), sizes.data(), + ExpandStrides(strides).data()); + }); + } else { + scheduler.Record([buffers, offsets, sizes, strides](vk::CommandBuffer cmdbuf) { + cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(), + offsets.data(), sizes.data(), + ExpandStrides(strides).data()); + }); + } + return; + } + if constexpr (is_indexed) { // Indexed draw - scheduler.Record([buffers, offsets, index_buffer = *index.buffer, - index_offset = index.offset, - index_type = index.type](vk::CommandBuffer cmdbuf) { - cmdbuf.BindIndexBuffer(index_buffer, index_offset, index_type); + scheduler.Record([buffers, offsets, index = index](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type); cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data()); }); } else { @@ -285,25 +380,32 @@ void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const { } } -RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& renderer, - VKScreenInfo& screen_info, const VKDevice& device, - VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, StateTracker& state_tracker, - VKScheduler& scheduler) - : RasterizerAccelerated{system.Memory()}, 
system{system}, render_window{renderer}, - screen_info{screen_info}, device{device}, resource_manager{resource_manager}, - memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler}, - staging_pool(device, memory_manager, scheduler), descriptor_pool(device), - update_descriptor_queue(device, scheduler), renderpass_cache(device), +RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_, + Tegra::MemoryManager& gpu_memory_, + Core::Memory::Memory& cpu_memory, VKScreenInfo& screen_info_, + const VKDevice& device_, VKMemoryManager& memory_manager_, + StateTracker& state_tracker_, VKScheduler& scheduler_) + : RasterizerAccelerated(cpu_memory), gpu(gpu_), gpu_memory(gpu_memory_), + maxwell3d(gpu.Maxwell3D()), kepler_compute(gpu.KeplerCompute()), screen_info(screen_info_), + device(device_), memory_manager(memory_manager_), state_tracker(state_tracker_), + scheduler(scheduler_), staging_pool(device, memory_manager, scheduler), + descriptor_pool(device, scheduler_), update_descriptor_queue(device, scheduler), + renderpass_cache(device), quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), + quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), - texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, - staging_pool), - pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue, - renderpass_cache), - buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), - sampler_cache(device), query_cache(system, *this, device, scheduler) { + texture_cache(*this, maxwell3d, gpu_memory, device, memory_manager, scheduler, staging_pool), + pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler, + descriptor_pool, update_descriptor_queue, renderpass_cache), + buffer_cache(*this, gpu_memory, cpu_memory, device, memory_manager, scheduler, staging_pool), + sampler_cache(device), query_cache(*this, maxwell3d, gpu_memory, device, scheduler), + fence_manager(*this, gpu, gpu_memory, texture_cache, buffer_cache, query_cache, device, + scheduler), + wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window) { scheduler.SetQueryCache(query_cache); + if (device.UseAsynchronousShaders()) { + async_shaders.AllocateWorkers(); + } } RasterizerVulkan::~RasterizerVulkan() = default; @@ -311,12 +413,13 @@ RasterizerVulkan::~RasterizerVulkan() = default; void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Drawing); + SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); query_cache.UpdateCounters(); - const auto& gpu = system.GPU().Maxwell3D(); - GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)}; + GraphicsPipelineCacheKey key; + key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported()); buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed)); @@ -334,31 +437,32 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { buffer_cache.Unmap(); - const auto texceptions = UpdateAttachments(); + const Texceptions texceptions = UpdateAttachments(false); SetupImageTransitions(texceptions, color_attachments, zeta_attachment); key.renderpass_params = GetRenderPassParams(texceptions); + key.padding = 0; + + auto* pipeline = pipeline_cache.GetGraphicsPipeline(key, async_shaders); + if (pipeline == nullptr || 
pipeline->GetHandle() == VK_NULL_HANDLE) { + // Async graphics pipeline was not ready. + return; + } - auto& pipeline = pipeline_cache.GetGraphicsPipeline(key); - scheduler.BindGraphicsPipeline(pipeline.GetHandle()); + scheduler.BindGraphicsPipeline(pipeline->GetHandle()); - const auto renderpass = pipeline.GetRenderPass(); + const auto renderpass = pipeline->GetRenderPass(); const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass); scheduler.RequestRenderpass(renderpass, framebuffer, render_area); UpdateDynamicStates(); - buffer_bindings.Bind(scheduler); - - if (device.IsNvDeviceDiagnosticCheckpoints()) { - scheduler.Record( - [&pipeline](vk::CommandBuffer cmdbuf) { cmdbuf.SetCheckpointNV(&pipeline); }); - } + buffer_bindings.Bind(device, scheduler); BeginTransformFeedback(); - const auto pipeline_layout = pipeline.GetLayout(); - const auto descriptor_set = pipeline.CommitDescriptorSet(); + const auto pipeline_layout = pipeline->GetLayout(); + const auto descriptor_set = pipeline->CommitDescriptorSet(); scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) { if (descriptor_set) { cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, @@ -373,8 +477,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); - const auto& gpu = system.GPU().Maxwell3D(); - if (!system.GPU().Maxwell3D().ShouldExecute()) { + if (!maxwell3d.ShouldExecute()) { return; } @@ -383,7 +486,7 @@ void RasterizerVulkan::Clear() { query_cache.UpdateCounters(); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A; const bool use_depth = regs.clear_buffers.Z; @@ -392,7 +495,7 @@ void RasterizerVulkan::Clear() { return; } - [[maybe_unused]] const auto texceptions = UpdateAttachments(); + [[maybe_unused]] const auto texceptions = UpdateAttachments(true); DEBUG_ASSERT(texceptions.none()); SetupImageTransitions(0, color_attachments, zeta_attachment); @@ -413,10 +516,11 @@ void RasterizerVulkan::Clear() { const u32 color_attachment = regs.clear_buffers.RT; scheduler.Record([color_attachment, clear_value, clear_rect](vk::CommandBuffer cmdbuf) { - VkClearAttachment attachment; - attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - attachment.colorAttachment = color_attachment; - attachment.clearValue = clear_value; + const VkClearAttachment attachment{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .colorAttachment = color_attachment, + .clearValue = clear_value, + }; cmdbuf.ClearAttachments(attachment, clear_rect); }); } @@ -434,10 +538,6 @@ void RasterizerVulkan::Clear() { scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { - VkClearValue clear_value; - clear_value.depthStencil.depth = clear_depth; - clear_value.depthStencil.stencil = clear_stencil; - VkClearAttachment attachment; attachment.aspectMask = aspect_flags; attachment.colorAttachment = 0; @@ -455,12 +555,17 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { query_cache.UpdateCounters(); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; - const ComputePipelineCacheKey key{ - code_addr, - launch_desc.shared_alloc, - {launch_desc.block_dim_x, launch_desc.block_dim_y, launch_desc.block_dim_z}}; - auto& pipeline = pipeline_cache.GetComputePipeline(key); + const auto& 
launch_desc = kepler_compute.launch_description; + auto& pipeline = pipeline_cache.GetComputePipeline({ + .shader = code_addr, + .shared_memory_size = launch_desc.shared_alloc, + .workgroup_size = + { + launch_desc.block_dim_x, + launch_desc.block_dim_y, + launch_desc.block_dim_z, + }, + }); // Compute dispatches can't be executed inside a renderpass scheduler.RequestOutsideRenderPassOperationContext(); @@ -470,8 +575,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { const auto& entries = pipeline.GetEntries(); SetupComputeConstBuffers(entries); SetupComputeGlobalBuffers(entries); - SetupComputeTexelBuffers(entries); + SetupComputeUniformTexels(entries); SetupComputeTextures(entries); + SetupComputeStorageTexels(entries); SetupComputeImages(entries); buffer_cache.Unmap(); @@ -481,11 +587,6 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { TransitionImages(image_views, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT); - if (device.IsNvDeviceDiagnosticCheckpoints()) { - scheduler.Record( - [&pipeline](vk::CommandBuffer cmdbuf) { cmdbuf.SetCheckpointNV(nullptr); }); - } - scheduler.Record([grid_x = launch_desc.grid_dim_x, grid_y = launch_desc.grid_dim_y, grid_z = launch_desc.grid_dim_z, pipeline_handle = pipeline.GetHandle(), layout = pipeline.GetLayout(), @@ -517,6 +618,13 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) { query_cache.FlushRegion(addr, size); } +bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) { + if (!Settings::IsGPULevelHigh()) { + return buffer_cache.MustFlushRegion(addr, size); + } + return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size); +} + void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; @@ -527,11 +635,71 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { query_cache.InvalidateRegion(addr, size); } +void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { + if (addr == 0 || size == 0) { + return; + } + texture_cache.OnCPUWrite(addr, size); + pipeline_cache.OnCPUWrite(addr, size); + buffer_cache.OnCPUWrite(addr, size); +} + +void RasterizerVulkan::SyncGuestHost() { + texture_cache.SyncGuestHost(); + buffer_cache.SyncGuestHost(); + pipeline_cache.SyncGuestHost(); +} + +void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) { + if (!gpu.IsAsync()) { + gpu_memory.Write<u32>(addr, value); + return; + } + fence_manager.SignalSemaphore(addr, value); +} + +void RasterizerVulkan::SignalSyncPoint(u32 value) { + if (!gpu.IsAsync()) { + gpu.IncrementSyncPoint(value); + return; + } + fence_manager.SignalSyncPoint(value); +} + +void RasterizerVulkan::ReleaseFences() { + if (!gpu.IsAsync()) { + return; + } + fence_manager.WaitPendingFences(); +} + void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size) { - FlushRegion(addr, size); + if (Settings::IsGPULevelExtreme()) { + FlushRegion(addr, size); + } InvalidateRegion(addr, size); } +void RasterizerVulkan::WaitForIdle() { + // Everything but wait pixel operations. This intentionally includes FRAGMENT_SHADER_BIT because + // fragment shaders can still write storage buffers. 
+ VkPipelineStageFlags flags = + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT; + if (device.IsExtTransformFeedbackSupported()) { + flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; + } + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { + cmdbuf.SetEvent(event, flags); + cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {}); + }); +} + void RasterizerVulkan::FlushCommands() { if (draw_counter > 0) { draw_counter = 0; @@ -576,10 +744,6 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerVulkan::SetupDirtyFlags() { - state_tracker.Initialize(); -} - void RasterizerVulkan::FlushWork() { static constexpr u32 DRAWS_TO_DISPATCH = 4096; @@ -601,9 +765,11 @@ void RasterizerVulkan::FlushWork() { draw_counter = 0; } -RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { +RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) { MICROPROFILE_SCOPE(Vulkan_RenderTargets); - auto& dirty = system.GPU().Maxwell3D().dirty.flags; + + const auto& regs = maxwell3d.regs; + auto& dirty = maxwell3d.dirty.flags; const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets]; dirty[VideoCommon::Dirty::RenderTargets] = false; @@ -612,7 +778,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { Texceptions texceptions; for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { if (update_rendertargets) { - color_attachments[rt] = texture_cache.GetColorBufferSurface(rt); + const bool preserve_contents = HasToPreserveColorContents(is_clear, regs); + color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents); } if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) { texceptions[rt] = true; @@ -620,7 +787,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() { } if (update_rendertargets) { - zeta_attachment = texture_cache.GetDepthBufferSurface(); + const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs); + zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents); } if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) { texceptions[ZETA_TEXCEPTION_INDEX] = true; @@ -645,21 +813,28 @@ bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachmen std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers( VkRenderPass renderpass) { - FramebufferCacheKey key{renderpass, std::numeric_limits<u32>::max(), - std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()}; + FramebufferCacheKey key{ + .renderpass = renderpass, + .width = std::numeric_limits<u32>::max(), + .height = std::numeric_limits<u32>::max(), + .layers = std::numeric_limits<u32>::max(), + .views = {}, + }; - const auto try_push = [&](const View& view) { + const auto try_push = [&key](const View& view) { if (!view) { return false; } - key.views.push_back(view->GetHandle()); + key.views.push_back(view->GetAttachment()); key.width = std::min(key.width, view->GetWidth()); key.height = std::min(key.height, view->GetHeight()); key.layers 
= std::min(key.layers, view->GetNumLayers()); return true; }; - for (std::size_t index = 0; index < std::size(color_attachments); ++index) { + const auto& regs = maxwell3d.regs; + const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count); + for (std::size_t index = 0; index < num_attachments; ++index) { if (try_push(color_attachments[index])) { texture_cache.MarkColorBufferInUse(index); } @@ -671,17 +846,17 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers( const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key); auto& framebuffer = fbentry->second; if (is_cache_miss) { - VkFramebufferCreateInfo framebuffer_ci; - framebuffer_ci.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; - framebuffer_ci.pNext = nullptr; - framebuffer_ci.flags = 0; - framebuffer_ci.renderPass = key.renderpass; - framebuffer_ci.attachmentCount = static_cast<u32>(key.views.size()); - framebuffer_ci.pAttachments = key.views.data(); - framebuffer_ci.width = key.width; - framebuffer_ci.height = key.height; - framebuffer_ci.layers = key.layers; - framebuffer = device.GetLogical().CreateFramebuffer(framebuffer_ci); + framebuffer = device.GetLogical().CreateFramebuffer({ + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .renderPass = key.renderpass, + .attachmentCount = static_cast<u32>(key.views.size()), + .pAttachments = key.views.data(), + .width = key.width, + .height = key.height, + .layers = key.layers, + }); } return {*framebuffer, VkExtent2D{key.width, key.height}}; @@ -693,13 +868,12 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Geometry); - const auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; + const auto& regs = maxwell3d.regs; - SetupVertexArrays(fixed_state.vertex_input, buffer_bindings); + SetupVertexArrays(buffer_bindings); const u32 base_instance = regs.vb_base_instance; - const u32 num_instances = is_instanced ? gpu.mme_draw.instance_count : 1; + const u32 num_instances = is_instanced ? maxwell3d.mme_draw.instance_count : 1; const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first; const u32 num_vertices = is_indexed ? 
regs.index_array.count : regs.vertex_buffer.count; @@ -710,20 +884,21 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt } void RasterizerVulkan::SetupShaderDescriptors( - const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { texture_cache.GuardSamplers(true); for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { // Skip VertexA stage - const auto& shader = shaders[stage + 1]; + Shader* const shader = shaders[stage + 1]; if (!shader) { continue; } const auto& entries = shader->GetEntries(); SetupGraphicsConstBuffers(entries, stage); SetupGraphicsGlobalBuffers(entries, stage); - SetupGraphicsTexelBuffers(entries, stage); + SetupGraphicsUniformTexels(entries, stage); SetupGraphicsTextures(entries, stage); + SetupGraphicsStorageTexels(entries, stage); SetupGraphicsImages(entries, stage); } texture_cache.GuardSamplers(false); @@ -759,20 +934,34 @@ void RasterizerVulkan::SetupImageTransitions( } void RasterizerVulkan::UpdateDynamicStates() { - auto& regs = system.GPU().Maxwell3D().regs; + auto& regs = maxwell3d.regs; UpdateViewportsState(regs); UpdateScissorsState(regs); UpdateDepthBias(regs); UpdateBlendConstants(regs); UpdateDepthBounds(regs); UpdateStencilFaces(regs); + if (device.IsExtExtendedDynamicStateSupported()) { + UpdateCullMode(regs); + UpdateDepthBoundsTestEnable(regs); + UpdateDepthTestEnable(regs); + UpdateDepthWriteEnable(regs); + UpdateDepthCompareOp(regs); + UpdateFrontFace(regs); + UpdateStencilOp(regs); + UpdateStencilTestEnable(regs); + } } void RasterizerVulkan::BeginTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } + if (!device.IsExtTransformFeedbackSupported()) { + LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); + return; + } UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || @@ -787,90 +976,92 @@ void RasterizerVulkan::BeginTransformFeedback() { UNIMPLEMENTED_IF(binding.buffer_offset != 0); const GPUVAddr gpu_addr = binding.Address(); - const std::size_t size = binding.buffer_size; - const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size); + const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - scheduler.Record([buffer = *buffer, offset = offset, size](vk::CommandBuffer cmdbuf) { + scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) { cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size); cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } void RasterizerVulkan::EndTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; if (regs.tfb_enabled == 0) { return; } + if (!device.IsExtTransformFeedbackSupported()) { + return; + } scheduler.Record( [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } -void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, - BufferBindings& buffer_bindings) { - const auto& regs = system.GPU().Maxwell3D().regs; - - for (u32 index = 0; index < static_cast<u32>(Maxwell::NumVertexAttributes); ++index) { - const auto& attrib = regs.vertex_attrib_format[index]; - if 
(!attrib.IsValid()) { - continue; - } +void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) { + const auto& regs = maxwell3d.regs; - const auto& buffer = regs.vertex_array[attrib.buffer]; - ASSERT(buffer.IsEnabled()); - - vertex_input.attributes[vertex_input.num_attributes++] = - FixedPipelineState::VertexAttribute(index, attrib.buffer, attrib.type, attrib.size, - attrib.offset); - } - - for (u32 index = 0; index < static_cast<u32>(Maxwell::NumVertexArrays); ++index) { + for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { const auto& vertex_array = regs.vertex_array[index]; if (!vertex_array.IsEnabled()) { continue; } - const GPUVAddr start{vertex_array.StartAddress()}; const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - ASSERT(end > start); - const std::size_t size{end - start + 1}; - const auto [buffer, offset] = buffer_cache.UploadMemory(start, size); - - vertex_input.bindings[vertex_input.num_bindings++] = FixedPipelineState::VertexBinding( - index, vertex_array.stride, - regs.instanced_arrays.IsInstancingEnabled(index) ? vertex_array.divisor : 0); - buffer_bindings.AddVertexBinding(buffer, offset); + ASSERT(end >= start); + const std::size_t size = end - start; + if (size == 0) { + buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0); + continue; + } + const auto info = buffer_cache.UploadMemory(start, size); + buffer_bindings.AddVertexBinding(info.handle, info.offset, size, vertex_array.stride); } } void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed) { - const auto& regs = system.GPU().Maxwell3D().regs; + if (params.num_vertices == 0) { + return; + } + const auto& regs = maxwell3d.regs; switch (regs.draw.topology) { - case Maxwell::PrimitiveTopology::Quads: - if (params.is_indexed) { - UNIMPLEMENTED(); - } else { + case Maxwell::PrimitiveTopology::Quads: { + if (!params.is_indexed) { const auto [buffer, offset] = quad_array_pass.Assemble(params.num_vertices, params.base_vertex); buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32); params.base_vertex = 0; params.num_vertices = params.num_vertices * 6 / 4; params.is_indexed = true; + break; } + const GPUVAddr gpu_addr = regs.index_array.IndexStart(); + const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + VkBuffer buffer = info.handle; + u64 offset = info.offset; + std::tie(buffer, offset) = quad_indexed_pass.Assemble( + regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset); + + buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32); + params.num_vertices = (params.num_vertices / 4) * 6; + params.base_vertex = 0; break; + } default: { if (!is_indexed) { break; } const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); + VkBuffer buffer = info.handle; + u64 offset = info.offset; auto format = regs.index_array.format; const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte; if (is_uint8 && !device.IsExtIndexTypeUint8Supported()) { - std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, *buffer, offset); + std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, buffer, offset); format = Maxwell::IndexFormat::UnsignedShort; } @@ -882,8 +1073,7 @@ void 
RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& gpu = system.GPU().Maxwell3D(); - const auto& shader_stage = gpu.state.shader_stages[stage]; + const auto& shader_stage = maxwell3d.state.shader_stages[stage]; for (const auto& entry : entries.const_buffers) { SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]); } @@ -891,8 +1081,7 @@ void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, s void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - auto& gpu{system.GPU()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage]}; + const auto& cbufs{maxwell3d.state.shader_stages[stage]}; for (const auto& entry : entries.global_buffers) { const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset(); @@ -900,38 +1089,43 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, } } -void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) { +void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); - for (const auto& entry : entries.texel_buffers) { - const auto image = GetTextureInfo(gpu, entry, stage).tic; - SetupTexelBuffer(image, entry); + for (const auto& entry : entries.uniform_texels) { + const auto image = GetTextureInfo(maxwell3d, entry, stage).tic; + SetupUniformTexels(image, entry); } } void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.samplers) { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(gpu, entry, stage, i); + for (std::size_t i = 0; i < entry.size; ++i) { + const auto texture = GetTextureInfo(maxwell3d, entry, stage, i); SetupTexture(texture, entry); } } } +void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) { + MICROPROFILE_SCOPE(Vulkan_Textures); + for (const auto& entry : entries.storage_texels) { + const auto image = GetTextureInfo(maxwell3d, entry, stage).tic; + SetupStorageTexel(image, entry); + } +} + void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) { MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().Maxwell3D(); for (const auto& entry : entries.images) { - const auto tic = GetTextureInfo(gpu, entry, stage).tic; + const auto tic = GetTextureInfo(maxwell3d, entry, stage).tic; SetupImage(tic, entry); } } void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + const auto& launch_desc = kepler_compute.launch_description; for (const auto& entry : entries.const_buffers) { const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); @@ -945,38 +1139,43 @@ void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { 
MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - const auto cbufs{system.GPU().KeplerCompute().launch_description.const_buffer_config}; + const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; for (const auto& entry : entries.global_buffers) { const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; SetupGlobalBuffer(entry, addr); } } -void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) { +void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); - for (const auto& entry : entries.texel_buffers) { - const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; - SetupTexelBuffer(image, entry); + for (const auto& entry : entries.uniform_texels) { + const auto image = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; + SetupUniformTexels(image, entry); } } void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Textures); - const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.samplers) { - for (std::size_t i = 0; i < entry.Size(); ++i) { - const auto texture = GetTextureInfo(gpu, entry, ComputeShaderIndex, i); + for (std::size_t i = 0; i < entry.size; ++i) { + const auto texture = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex, i); SetupTexture(texture, entry); } } } +void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { + MICROPROFILE_SCOPE(Vulkan_Textures); + for (const auto& entry : entries.storage_texels) { + const auto image = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; + SetupStorageTexel(image, entry); + } +} + void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { MICROPROFILE_SCOPE(Vulkan_Images); - const auto& gpu = system.GPU().KeplerCompute(); for (const auto& entry : entries.images) { - const auto tic = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic; + const auto tic = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic; SetupImage(tic, entry); } } @@ -985,8 +1184,7 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) { if (!buffer.enabled) { // Set values to zero to unbind buffers - update_descriptor_queue.AddBuffer(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, - sizeof(float)); + update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE); return; } @@ -995,33 +1193,33 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); ASSERT(size <= MaxConstbufferSize); - const auto [buffer_handle, offset] = + const auto info = buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); - - update_descriptor_queue.AddBuffer(buffer_handle, offset, size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, size); } void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { - auto& memory_manager{system.GPU().MemoryManager()}; - const auto actual_addr = memory_manager.Read<u64>(address); - const auto size = memory_manager.Read<u32>(address + 8); + const u64 actual_addr = gpu_memory.Read<u64>(address); + const u32 size = gpu_memory.Read<u32>(address + 8); if (size == 0) { - // Sometimes global memory pointers don't have a proper size. 
Upload a dummy entry because - // Vulkan doesn't like empty buffers. - constexpr std::size_t dummy_size = 4; - const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size); - update_descriptor_queue.AddBuffer(buffer, 0, dummy_size); + // Sometimes global memory pointers don't have a proper size. Upload a dummy entry + // because Vulkan doesn't like empty buffers. + // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the + // default buffer. + static constexpr std::size_t dummy_size = 4; + const auto info = buffer_cache.GetEmptyBuffer(dummy_size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size); return; } - const auto [buffer, offset] = buffer_cache.UploadMemory( + const auto info = buffer_cache.UploadMemory( actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten()); - update_descriptor_queue.AddBuffer(buffer, offset, size); + update_descriptor_queue.AddBuffer(info.handle, info.offset, size); } -void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic, - const TexelBufferEntry& entry) { +void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic, + const UniformTexelEntry& entry) { const auto view = texture_cache.GetTextureSurface(tic, entry); ASSERT(view->IsBufferView()); @@ -1033,29 +1231,38 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu auto view = texture_cache.GetTextureSurface(texture.tic, entry); ASSERT(!view->IsBufferView()); - const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source, - texture.tic.z_source, texture.tic.w_source); + const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source, + texture.tic.z_source, texture.tic.w_source); const auto sampler = sampler_cache.GetSampler(texture.tsc); update_descriptor_queue.AddSampledImage(sampler, image_view); - const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; sampled_views.push_back(ImageView{std::move(view), image_layout}); } +void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic, + const StorageTexelEntry& entry) { + const auto view = texture_cache.GetImageSurface(tic, entry); + ASSERT(view->IsBufferView()); + + update_descriptor_queue.AddTexelBuffer(view->GetBufferView()); +} + void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) { auto view = texture_cache.GetImageSurface(tic, entry); - if (entry.IsWritten()) { + if (entry.is_written) { view->MarkAsModified(texture_cache.Tick()); } UNIMPLEMENTED_IF(tic.IsBuffer()); - const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source); + const VkImageView image_view = + view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source); update_descriptor_queue.AddImage(image_view); - const auto image_layout = update_descriptor_queue.GetLastImageLayout(); + VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout(); *image_layout = VK_IMAGE_LAYOUT_GENERAL; image_views.push_back(ImageView{std::move(view), image_layout}); } @@ -1150,6 +1357,107 @@ void RasterizerVulkan::UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs) } } +void RasterizerVulkan::UpdateCullMode(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchCullMode()) { + return; + } + scheduler.Record( + [enabled = 
regs.cull_test_enabled, cull_face = regs.cull_face](vk::CommandBuffer cmdbuf) { + cmdbuf.SetCullModeEXT(enabled ? MaxwellToVK::CullFace(cull_face) : VK_CULL_MODE_NONE); + }); +} + +void RasterizerVulkan::UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthBoundsTestEnable()) { + return; + } + scheduler.Record([enable = regs.depth_bounds_enable](vk::CommandBuffer cmdbuf) { + cmdbuf.SetDepthBoundsTestEnableEXT(enable); + }); +} + +void RasterizerVulkan::UpdateDepthTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthTestEnable()) { + return; + } + scheduler.Record([enable = regs.depth_test_enable](vk::CommandBuffer cmdbuf) { + cmdbuf.SetDepthTestEnableEXT(enable); + }); +} + +void RasterizerVulkan::UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthWriteEnable()) { + return; + } + scheduler.Record([enable = regs.depth_write_enabled](vk::CommandBuffer cmdbuf) { + cmdbuf.SetDepthWriteEnableEXT(enable); + }); +} + +void RasterizerVulkan::UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchDepthCompareOp()) { + return; + } + scheduler.Record([func = regs.depth_test_func](vk::CommandBuffer cmdbuf) { + cmdbuf.SetDepthCompareOpEXT(MaxwellToVK::ComparisonOp(func)); + }); +} + +void RasterizerVulkan::UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchFrontFace()) { + return; + } + + VkFrontFace front_face = MaxwellToVK::FrontFace(regs.front_face); + if (regs.screen_y_control.triangle_rast_flip != 0) { + front_face = front_face == VK_FRONT_FACE_CLOCKWISE ? VK_FRONT_FACE_COUNTER_CLOCKWISE + : VK_FRONT_FACE_CLOCKWISE; + } + scheduler.Record( + [front_face](vk::CommandBuffer cmdbuf) { cmdbuf.SetFrontFaceEXT(front_face); }); +} + +void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchStencilOp()) { + return; + } + const Maxwell::StencilOp fail = regs.stencil_front_op_fail; + const Maxwell::StencilOp zfail = regs.stencil_front_op_zfail; + const Maxwell::StencilOp zpass = regs.stencil_front_op_zpass; + const Maxwell::ComparisonOp compare = regs.stencil_front_func_func; + if (regs.stencil_two_side_enable) { + scheduler.Record([fail, zfail, zpass, compare](vk::CommandBuffer cmdbuf) { + cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_AND_BACK, MaxwellToVK::StencilOp(fail), + MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail), + MaxwellToVK::ComparisonOp(compare)); + }); + } else { + const Maxwell::StencilOp back_fail = regs.stencil_back_op_fail; + const Maxwell::StencilOp back_zfail = regs.stencil_back_op_zfail; + const Maxwell::StencilOp back_zpass = regs.stencil_back_op_zpass; + const Maxwell::ComparisonOp back_compare = regs.stencil_back_func_func; + scheduler.Record([fail, zfail, zpass, compare, back_fail, back_zfail, back_zpass, + back_compare](vk::CommandBuffer cmdbuf) { + cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_BIT, MaxwellToVK::StencilOp(fail), + MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail), + MaxwellToVK::ComparisonOp(compare)); + cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_BACK_BIT, MaxwellToVK::StencilOp(back_fail), + MaxwellToVK::StencilOp(back_zpass), + MaxwellToVK::StencilOp(back_zfail), + MaxwellToVK::ComparisonOp(back_compare)); + }); + } +} + +void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs) { + if (!state_tracker.TouchStencilTestEnable()) { + return; + } + scheduler.Record([enable = 
regs.stencil_enable](vk::CommandBuffer cmdbuf) { + cmdbuf.SetStencilTestEnableEXT(enable); + }); +} + std::size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const { std::size_t size = CalculateVertexArraysSize(); if (is_indexed) { @@ -1165,7 +1473,7 @@ std::size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const { } std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& regs = maxwell3d.regs; std::size_t size = 0; for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { @@ -1174,15 +1482,14 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; DEBUG_ASSERT(end >= start); - size += (end - start + 1) * regs.vertex_array[index].enable; + size += (end - start) * regs.vertex_array[index].enable; } return size; } std::size_t RasterizerVulkan::CalculateIndexBufferSize() const { - const auto& regs = system.GPU().Maxwell3D().regs; - return static_cast<std::size_t>(regs.index_array.count) * - static_cast<std::size_t>(regs.index_array.FormatSizeInBytes()); + return static_cast<std::size_t>(maxwell3d.regs.index_array.count) * + static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes()); } std::size_t RasterizerVulkan::CalculateConstBufferSize( @@ -1197,28 +1504,54 @@ std::size_t RasterizerVulkan::CalculateConstBufferSize( } RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions) const { - using namespace VideoCore::Surface; + const auto& regs = maxwell3d.regs; + const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count); - const auto& regs = system.GPU().Maxwell3D().regs; - RenderPassParams renderpass_params; + RenderPassParams params; + params.color_formats = {}; + std::size_t color_texceptions = 0; - for (std::size_t rt = 0; rt < static_cast<std::size_t>(regs.rt_control.count); ++rt) { + std::size_t index = 0; + for (std::size_t rt = 0; rt < num_attachments; ++rt) { const auto& rendertarget = regs.rt[rt]; if (rendertarget.Address() == 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) { continue; } - renderpass_params.color_attachments.push_back(RenderPassParams::ColorAttachment{ - static_cast<u32>(rt), PixelFormatFromRenderTargetFormat(rendertarget.format), - texceptions[rt]}); + params.color_formats[index] = static_cast<u8>(rendertarget.format); + color_texceptions |= (texceptions[rt] ? 1ULL : 0ULL) << index; + ++index; } + params.num_color_attachments = static_cast<u8>(index); + params.texceptions = static_cast<u8>(color_texceptions); - renderpass_params.has_zeta = regs.zeta_enable; - if (renderpass_params.has_zeta) { - renderpass_params.zeta_pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); - renderpass_params.zeta_texception = texceptions[ZETA_TEXCEPTION_INDEX]; - } + params.zeta_format = regs.zeta_enable ? 
static_cast<u8>(regs.zeta.format) : 0; + params.zeta_texception = texceptions[ZETA_TEXCEPTION_INDEX]; + return params; +} + +VkBuffer RasterizerVulkan::DefaultBuffer() { + if (default_buffer) { + return *default_buffer; + } + + default_buffer = device.GetLogical().CreateBuffer({ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = DEFAULT_BUFFER_SIZE, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + default_buffer_commit = memory_manager.Commit(default_buffer, false); - return renderpass_params; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *default_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, DEFAULT_BUFFER_SIZE, 0); + }); + return *default_buffer; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 46037860a..237e51fa4 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -14,24 +14,24 @@ #include <boost/functional/hash.hpp> #include "common/common_types.h" -#include "video_core/memory_manager.h" #include "video_core/rasterizer_accelerated.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" +#include "video_core/renderer_vulkan/vk_fence_manager.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader/async_shaders.h" namespace Core { class System; @@ -105,10 +105,11 @@ struct ImageView { class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, + explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, VKScreenInfo& screen_info, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, - StateTracker& state_tracker, VKScheduler& scheduler); + VKMemoryManager& memory_manager, StateTracker& state_tracker, + VKScheduler& scheduler); ~RasterizerVulkan() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -118,8 +119,15 @@ public: void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; + bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; + void OnCPUWrite(VAddr addr, u64 size) override; + void SyncGuestHost() 
override; + void SignalSemaphore(GPUVAddr addr, u32 value) override; + void SignalSyncPoint(u32 value) override; + void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; + void WaitForIdle() override; void FlushCommands() override; void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, @@ -127,7 +135,14 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void SetupDirtyFlags() override; + + VideoCommon::Shader::AsyncShaders& GetAsyncShaders() { + return async_shaders; + } + + const VideoCommon::Shader::AsyncShaders& GetAsyncShaders() const { + return async_shaders; + } /// Maximum supported size that a constbuffer can have in bytes. static constexpr std::size_t MaxConstbufferSize = 0x10000; @@ -148,10 +163,14 @@ private: using Texceptions = std::bitset<Maxwell::NumRenderTargets + 1>; static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8; + static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float); void FlushWork(); - Texceptions UpdateAttachments(); + /// @brief Updates the currently bound attachments + /// @param is_clear True when the framebuffer is updated as a clear + /// @return Bitfield of attachments being used as sampled textures + Texceptions UpdateAttachments(bool is_clear); std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass); @@ -160,7 +179,7 @@ private: bool is_indexed, bool is_instanced); /// Setup descriptors in the graphics pipeline. - void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders); + void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders); void SetupImageTransitions(Texceptions texceptions, const std::array<View, Maxwell::NumRenderTargets>& color_attachments, @@ -174,8 +193,7 @@ private: bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment); - void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, - BufferBindings& buffer_bindings); + void SetupVertexArrays(BufferBindings& buffer_bindings); void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed); @@ -185,12 +203,15 @@ private: /// Setup global buffers in the graphics pipeline. void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage); - /// Setup texel buffers in the graphics pipeline. - void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage); + /// Setup uniform texels in the graphics pipeline. + void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage); /// Setup textures in the graphics pipeline. void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage); + /// Setup storage texels in the graphics pipeline. + void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage); + /// Setup images in the graphics pipeline. void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); @@ -201,11 +222,14 @@ private: void SetupComputeGlobalBuffers(const ShaderEntries& entries); /// Setup texel buffers in the compute pipeline. - void SetupComputeTexelBuffers(const ShaderEntries& entries); + void SetupComputeUniformTexels(const ShaderEntries& entries); /// Setup textures in the compute pipeline. void SetupComputeTextures(const ShaderEntries& entries); + /// Setup storage texels in the compute pipeline. 
+ void SetupComputeStorageTexels(const ShaderEntries& entries); + /// Setup images in the compute pipeline. void SetupComputeImages(const ShaderEntries& entries); @@ -214,10 +238,12 @@ private: void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); - void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry); + void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry); void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); + void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry); + void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); @@ -227,6 +253,15 @@ private: void UpdateDepthBounds(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilFaces(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateCullMode(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthBoundsTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs); + void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); + std::size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; std::size_t CalculateComputeStreamBufferSize() const; @@ -240,11 +275,15 @@ private: RenderPassParams GetRenderPassParams(Texceptions texceptions) const; - Core::System& system; - Core::Frontend::EmuWindow& render_window; + VkBuffer DefaultBuffer(); + + Tegra::GPU& gpu; + Tegra::MemoryManager& gpu_memory; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + VKScreenInfo& screen_info; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; StateTracker& state_tracker; VKScheduler& scheduler; @@ -254,6 +293,7 @@ private: VKUpdateDescriptorQueue update_descriptor_queue; VKRenderPassCache renderpass_cache; QuadArrayPass quad_array_pass; + QuadIndexedPass quad_indexed_pass; Uint8Pass uint8_pass; VKTextureCache texture_cache; @@ -261,6 +301,12 @@ private: VKBufferCache buffer_cache; VKSamplerCache sampler_cache; VKQueryCache query_cache; + VKFenceManager fence_manager; + + vk::Buffer default_buffer; + VKMemoryCommit default_buffer_commit; + vk::Event wfi_event; + VideoCommon::Shader::AsyncShaders async_shaders; std::array<View, Maxwell::NumRenderTargets> color_attachments; View zeta_attachment; diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp index 4e5286a69..80284cf92 100644 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp @@ -2,9 +2,11 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <cstring> #include <memory> #include <vector> +#include "common/cityhash.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_device.h" @@ -13,6 +15,15 @@ namespace Vulkan { +std::size_t RenderPassParams::Hash() const noexcept { + const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this); + return static_cast<std::size_t>(hash); +} + +bool RenderPassParams::operator==(const RenderPassParams& rhs) const noexcept { + return std::memcmp(&rhs, this, sizeof *this) == 0; +} + VKRenderPassCache::VKRenderPassCache(const VKDevice& device) : device{device} {} VKRenderPassCache::~VKRenderPassCache() = default; @@ -27,72 +38,86 @@ VkRenderPass VKRenderPassCache::GetRenderPass(const RenderPassParams& params) { } vk::RenderPass VKRenderPassCache::CreateRenderPass(const RenderPassParams& params) const { + using namespace VideoCore::Surface; + const std::size_t num_attachments = static_cast<std::size_t>(params.num_color_attachments); + std::vector<VkAttachmentDescription> descriptors; + descriptors.reserve(num_attachments); + std::vector<VkAttachmentReference> color_references; + color_references.reserve(num_attachments); - for (std::size_t rt = 0; rt < params.color_attachments.size(); ++rt) { - const auto attachment = params.color_attachments[rt]; - const auto format = - MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, attachment.pixel_format); + for (std::size_t rt = 0; rt < num_attachments; ++rt) { + const auto guest_format = static_cast<Tegra::RenderTargetFormat>(params.color_formats[rt]); + const PixelFormat pixel_format = PixelFormatFromRenderTargetFormat(guest_format); + const auto format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format); ASSERT_MSG(format.attachable, "Trying to attach a non-attachable format with format={}", - static_cast<u32>(attachment.pixel_format)); - - // TODO(Rodrigo): Add eMayAlias when it's needed. - const auto color_layout = attachment.is_texception - ? VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - VkAttachmentDescription& descriptor = descriptors.emplace_back(); - descriptor.flags = VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT; - descriptor.format = format.format; - descriptor.samples = VK_SAMPLE_COUNT_1_BIT; - descriptor.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - descriptor.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - descriptor.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; - descriptor.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; - descriptor.initialLayout = color_layout; - descriptor.finalLayout = color_layout; - - VkAttachmentReference& reference = color_references.emplace_back(); - reference.attachment = static_cast<u32>(rt); - reference.layout = color_layout; + static_cast<int>(pixel_format)); + + // TODO(Rodrigo): Add MAY_ALIAS_BIT when it's needed. + const VkImageLayout color_layout = ((params.texceptions >> rt) & 1) != 0 + ? 
VK_IMAGE_LAYOUT_GENERAL + : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + descriptors.push_back({ + .flags = VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT, + .format = format.format, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, + .initialLayout = color_layout, + .finalLayout = color_layout, + }); + + color_references.push_back({ + .attachment = static_cast<u32>(rt), + .layout = color_layout, + }); } VkAttachmentReference zeta_attachment_ref; - if (params.has_zeta) { - const auto format = - MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.zeta_pixel_format); + const bool has_zeta = params.zeta_format != 0; + if (has_zeta) { + const auto guest_format = static_cast<Tegra::DepthFormat>(params.zeta_format); + const PixelFormat pixel_format = PixelFormatFromDepthFormat(guest_format); + const auto format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format); ASSERT_MSG(format.attachable, "Trying to attach a non-attachable format with format={}", - static_cast<u32>(params.zeta_pixel_format)); - - const auto zeta_layout = params.zeta_texception - ? VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - VkAttachmentDescription& descriptor = descriptors.emplace_back(); - descriptor.flags = 0; - descriptor.format = format.format; - descriptor.samples = VK_SAMPLE_COUNT_1_BIT; - descriptor.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - descriptor.storeOp = VK_ATTACHMENT_STORE_OP_STORE; - descriptor.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; - descriptor.stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; - descriptor.initialLayout = zeta_layout; - descriptor.finalLayout = zeta_layout; - - zeta_attachment_ref.attachment = static_cast<u32>(params.color_attachments.size()); - zeta_attachment_ref.layout = zeta_layout; + static_cast<int>(pixel_format)); + + const VkImageLayout zeta_layout = params.zeta_texception != 0 + ? VK_IMAGE_LAYOUT_GENERAL + : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + descriptors.push_back({ + .flags = 0, + .format = format.format, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = zeta_layout, + .finalLayout = zeta_layout, + }); + + zeta_attachment_ref = { + .attachment = static_cast<u32>(num_attachments), + .layout = zeta_layout, + }; } - VkSubpassDescription subpass_description; - subpass_description.flags = 0; - subpass_description.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; - subpass_description.inputAttachmentCount = 0; - subpass_description.pInputAttachments = nullptr; - subpass_description.colorAttachmentCount = static_cast<u32>(color_references.size()); - subpass_description.pColorAttachments = color_references.data(); - subpass_description.pResolveAttachments = nullptr; - subpass_description.pDepthStencilAttachment = params.has_zeta ? 
&zeta_attachment_ref : nullptr; - subpass_description.preserveAttachmentCount = 0; - subpass_description.pPreserveAttachments = nullptr; + const VkSubpassDescription subpass_description{ + .flags = 0, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .pInputAttachments = nullptr, + .colorAttachmentCount = static_cast<u32>(color_references.size()), + .pColorAttachments = color_references.data(), + .pResolveAttachments = nullptr, + .pDepthStencilAttachment = has_zeta ? &zeta_attachment_ref : nullptr, + .preserveAttachmentCount = 0, + .pPreserveAttachments = nullptr, + }; VkAccessFlags access = 0; VkPipelineStageFlags stage = 0; @@ -101,32 +126,33 @@ vk::RenderPass VKRenderPassCache::CreateRenderPass(const RenderPassParams& param stage |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; } - if (params.has_zeta) { + if (has_zeta) { access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; stage |= VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; } - VkSubpassDependency subpass_dependency; - subpass_dependency.srcSubpass = VK_SUBPASS_EXTERNAL; - subpass_dependency.dstSubpass = 0; - subpass_dependency.srcStageMask = stage; - subpass_dependency.dstStageMask = stage; - subpass_dependency.srcAccessMask = 0; - subpass_dependency.dstAccessMask = access; - subpass_dependency.dependencyFlags = 0; - - VkRenderPassCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.attachmentCount = static_cast<u32>(descriptors.size()); - ci.pAttachments = descriptors.data(); - ci.subpassCount = 1; - ci.pSubpasses = &subpass_description; - ci.dependencyCount = 1; - ci.pDependencies = &subpass_dependency; - return device.GetLogical().CreateRenderPass(ci); + const VkSubpassDependency subpass_dependency{ + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcStageMask = stage, + .dstStageMask = stage, + .srcAccessMask = 0, + .dstAccessMask = access, + .dependencyFlags = 0, + }; + + return device.GetLogical().CreateRenderPass({ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .attachmentCount = static_cast<u32>(descriptors.size()), + .pAttachments = descriptors.data(), + .subpassCount = 1, + .pSubpasses = &subpass_description, + .dependencyCount = 1, + .pDependencies = &subpass_dependency, + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.h b/src/video_core/renderer_vulkan/vk_renderpass_cache.h index 921b6efb5..8b0fec720 100644 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.h +++ b/src/video_core/renderer_vulkan/vk_renderpass_cache.h @@ -4,8 +4,7 @@ #pragma once -#include <memory> -#include <tuple> +#include <type_traits> #include <unordered_map> #include <boost/container/static_vector.hpp> @@ -19,51 +18,25 @@ namespace Vulkan { class VKDevice; -// TODO(Rodrigo): Optimize this structure for faster hashing - struct RenderPassParams { - struct ColorAttachment { - u32 index = 0; - VideoCore::Surface::PixelFormat pixel_format = VideoCore::Surface::PixelFormat::Invalid; - bool is_texception = false; - - std::size_t Hash() const noexcept { - return static_cast<std::size_t>(pixel_format) | - static_cast<std::size_t>(is_texception) << 6 | - static_cast<std::size_t>(index) << 7; - } - - bool operator==(const ColorAttachment& rhs) const noexcept { - return std::tie(index, pixel_format, is_texception) == - std::tie(rhs.index, rhs.pixel_format, rhs.is_texception); - } - }; - - 
boost::container::static_vector<ColorAttachment, - Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> - color_attachments{}; - // TODO(Rodrigo): Unify has_zeta into zeta_pixel_format and zeta_component_type. - VideoCore::Surface::PixelFormat zeta_pixel_format = VideoCore::Surface::PixelFormat::Invalid; - bool has_zeta = false; - bool zeta_texception = false; - - std::size_t Hash() const noexcept { - std::size_t hash = 0; - for (const auto& rt : color_attachments) { - boost::hash_combine(hash, rt.Hash()); - } - boost::hash_combine(hash, zeta_pixel_format); - boost::hash_combine(hash, has_zeta); - boost::hash_combine(hash, zeta_texception); - return hash; - } + std::array<u8, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> color_formats; + u8 num_color_attachments; + u8 texceptions; + + u8 zeta_format; + u8 zeta_texception; + + std::size_t Hash() const noexcept; + + bool operator==(const RenderPassParams& rhs) const noexcept; - bool operator==(const RenderPassParams& rhs) const { - return std::tie(color_attachments, zeta_pixel_format, has_zeta, zeta_texception) == - std::tie(rhs.color_attachments, rhs.zeta_pixel_format, rhs.has_zeta, - rhs.zeta_texception); + bool operator!=(const RenderPassParams& rhs) const noexcept { + return !operator==(rhs); } }; +static_assert(std::has_unique_object_representations_v<RenderPassParams>); +static_assert(std::is_trivially_copyable_v<RenderPassParams>); +static_assert(std::is_trivially_constructible_v<RenderPassParams>); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.cpp b/src/video_core/renderer_vulkan/vk_resource_manager.cpp deleted file mode 100644 index dc06f545a..000000000 --- a/src/video_core/renderer_vulkan/vk_resource_manager.cpp +++ /dev/null @@ -1,312 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <optional> -#include "common/assert.h" -#include "common/logging/log.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -namespace { - -// TODO(Rodrigo): Fine tune these numbers. -constexpr std::size_t COMMAND_BUFFER_POOL_SIZE = 0x1000; -constexpr std::size_t FENCES_GROW_STEP = 0x40; - -VkFenceCreateInfo BuildFenceCreateInfo() { - VkFenceCreateInfo fence_ci; - fence_ci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fence_ci.pNext = nullptr; - fence_ci.flags = 0; - return fence_ci; -} - -} // Anonymous namespace - -class CommandBufferPool final : public VKFencedPool { -public: - CommandBufferPool(const VKDevice& device) - : VKFencedPool(COMMAND_BUFFER_POOL_SIZE), device{device} {} - - void Allocate(std::size_t begin, std::size_t end) override { - // Command buffers are going to be commited, recorded, executed every single usage cycle. - // They are also going to be reseted when commited. 
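Note on the reworked RenderPassParams above: the per-field Hash()/operator== bodies are dropped and replaced by declarations plus static_asserts that the struct is trivially copyable and has unique object representations, i.e. no padding bytes. That guarantee is what allows the key to be hashed and compared as raw memory. The actual definitions live in vk_renderpass_cache.cpp and are not part of this hunk; the following is only a sketch of what a byte-wise implementation could look like under that assumption.

// Hypothetical byte-wise hash/equality for a padding-free, trivially copyable
// key such as RenderPassParams. Not the actual definitions from this change.
#include <cstddef>
#include <cstring>
#include <functional>
#include <string_view>
#include <type_traits>

template <typename T>
std::size_t HashBytes(const T& value) noexcept {
    static_assert(std::has_unique_object_representations_v<T>,
                  "padding bytes would make the hash unstable");
    const auto* const data = reinterpret_cast<const char*>(&value);
    // Reuse the standard string_view hasher over the object's bytes.
    return std::hash<std::string_view>{}(std::string_view(data, sizeof(T)));
}

template <typename T>
bool EqualBytes(const T& lhs, const T& rhs) noexcept {
    static_assert(std::is_trivially_copyable_v<T>);
    return std::memcmp(&lhs, &rhs, sizeof(T)) == 0;
}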
- VkCommandPoolCreateInfo command_pool_ci; - command_pool_ci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - command_pool_ci.pNext = nullptr; - command_pool_ci.flags = - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; - command_pool_ci.queueFamilyIndex = device.GetGraphicsFamily(); - - Pool& pool = pools.emplace_back(); - pool.handle = device.GetLogical().CreateCommandPool(command_pool_ci); - pool.cmdbufs = pool.handle.Allocate(COMMAND_BUFFER_POOL_SIZE); - } - - VkCommandBuffer Commit(VKFence& fence) { - const std::size_t index = CommitResource(fence); - const auto pool_index = index / COMMAND_BUFFER_POOL_SIZE; - const auto sub_index = index % COMMAND_BUFFER_POOL_SIZE; - return pools[pool_index].cmdbufs[sub_index]; - } - -private: - struct Pool { - vk::CommandPool handle; - vk::CommandBuffers cmdbufs; - }; - - const VKDevice& device; - std::vector<Pool> pools; -}; - -VKResource::VKResource() = default; - -VKResource::~VKResource() = default; - -VKFence::VKFence(const VKDevice& device) - : device{device}, handle{device.GetLogical().CreateFence(BuildFenceCreateInfo())} {} - -VKFence::~VKFence() = default; - -void VKFence::Wait() { - switch (const VkResult result = handle.Wait()) { - case VK_SUCCESS: - return; - case VK_ERROR_DEVICE_LOST: - device.ReportLoss(); - [[fallthrough]]; - default: - throw vk::Exception(result); - } -} - -void VKFence::Release() { - ASSERT(is_owned); - is_owned = false; -} - -void VKFence::Commit() { - is_owned = true; - is_used = true; -} - -bool VKFence::Tick(bool gpu_wait, bool owner_wait) { - if (!is_used) { - // If a fence is not used it's always free. - return true; - } - if (is_owned && !owner_wait) { - // The fence is still being owned (Release has not been called) and ownership wait has - // not been asked. - return false; - } - - if (gpu_wait) { - // Wait for the fence if it has been requested. - (void)handle.Wait(); - } else { - if (handle.GetStatus() != VK_SUCCESS) { - // Vulkan fence is not ready, not much it can do here - return false; - } - } - - // Broadcast resources their free state. - for (auto* resource : protected_resources) { - resource->OnFenceRemoval(this); - } - protected_resources.clear(); - - // Prepare fence for reusage. 
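For reference, the two checks Tick() performs through the wrapper correspond to plain Vulkan calls: a non-blocking status query and a blocking wait followed by a reset so the fence can be committed again. A minimal sketch in raw Vulkan terms (illustrative only; the deleted code goes through vk::Fence):

#include <cstdint>
#include <vulkan/vulkan.h>

// Non-blocking poll: VK_SUCCESS means signaled, VK_NOT_READY means still pending.
bool IsFenceSignaled(VkDevice device, VkFence fence) {
    return vkGetFenceStatus(device, fence) == VK_SUCCESS;
}

// Blocking wait with no timeout, then a reset so the fence can be reused,
// mirroring the handle.Wait()/handle.Reset() pair in Tick().
void WaitAndResetFence(VkDevice device, VkFence fence) {
    vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
    vkResetFences(device, 1, &fence);
}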
- handle.Reset(); - is_used = false; - return true; -} - -void VKFence::Protect(VKResource* resource) { - protected_resources.push_back(resource); -} - -void VKFence::Unprotect(VKResource* resource) { - const auto it = std::find(protected_resources.begin(), protected_resources.end(), resource); - ASSERT(it != protected_resources.end()); - - resource->OnFenceRemoval(this); - protected_resources.erase(it); -} - -void VKFence::RedirectProtection(VKResource* old_resource, VKResource* new_resource) noexcept { - std::replace(std::begin(protected_resources), std::end(protected_resources), old_resource, - new_resource); -} - -VKFenceWatch::VKFenceWatch() = default; - -VKFenceWatch::VKFenceWatch(VKFence& initial_fence) { - Watch(initial_fence); -} - -VKFenceWatch::VKFenceWatch(VKFenceWatch&& rhs) noexcept { - fence = std::exchange(rhs.fence, nullptr); - if (fence) { - fence->RedirectProtection(&rhs, this); - } -} - -VKFenceWatch& VKFenceWatch::operator=(VKFenceWatch&& rhs) noexcept { - fence = std::exchange(rhs.fence, nullptr); - if (fence) { - fence->RedirectProtection(&rhs, this); - } - return *this; -} - -VKFenceWatch::~VKFenceWatch() { - if (fence) { - fence->Unprotect(this); - } -} - -void VKFenceWatch::Wait() { - if (fence == nullptr) { - return; - } - fence->Wait(); - fence->Unprotect(this); -} - -void VKFenceWatch::Watch(VKFence& new_fence) { - Wait(); - fence = &new_fence; - fence->Protect(this); -} - -bool VKFenceWatch::TryWatch(VKFence& new_fence) { - if (fence) { - return false; - } - fence = &new_fence; - fence->Protect(this); - return true; -} - -void VKFenceWatch::OnFenceRemoval(VKFence* signaling_fence) { - ASSERT_MSG(signaling_fence == fence, "Removing the wrong fence"); - fence = nullptr; -} - -VKFencedPool::VKFencedPool(std::size_t grow_step) : grow_step{grow_step} {} - -VKFencedPool::~VKFencedPool() = default; - -std::size_t VKFencedPool::CommitResource(VKFence& fence) { - const auto Search = [&](std::size_t begin, std::size_t end) -> std::optional<std::size_t> { - for (std::size_t iterator = begin; iterator < end; ++iterator) { - if (watches[iterator]->TryWatch(fence)) { - // The resource is now being watched, a free resource was successfully found. - return iterator; - } - } - return {}; - }; - // Try to find a free resource from the hinted position to the end. - auto found = Search(free_iterator, watches.size()); - if (!found) { - // Search from beginning to the hinted position. - found = Search(0, free_iterator); - if (!found) { - // Both searches failed, the pool is full; handle it. - const std::size_t free_resource = ManageOverflow(); - - // Watch will wait for the resource to be free. - watches[free_resource]->Watch(fence); - found = free_resource; - } - } - // Free iterator is hinted to the resource after the one that's been commited. - free_iterator = (*found + 1) % watches.size(); - return *found; -} - -std::size_t VKFencedPool::ManageOverflow() { - const std::size_t old_capacity = watches.size(); - Grow(); - - // The last entry is guaranted to be free, since it's the first element of the freshly - // allocated resources. 
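CommitResource() above searches for a free watch starting at a remembered hint, wraps around to the beginning, and only grows the pool when both passes fail; the hint is then advanced past the committed slot. The same pattern reappears in the new ResourcePool further down. Stated generically (an illustrative helper, not code from this change):

#include <cstddef>
#include <optional>

// Scan [hint, size) and then [0, hint); return the first index the predicate
// accepts and advance the hint past it. std::nullopt means the pool is full
// and the caller has to grow it.
template <typename Pred>
std::optional<std::size_t> FindFreeSlot(std::size_t& hint, std::size_t size, Pred&& is_free) {
    for (std::size_t offset = 0; offset < size; ++offset) {
        const std::size_t index = (hint + offset) % size;
        if (is_free(index)) {
            hint = (index + 1) % size;
            return index;
        }
    }
    return std::nullopt;
}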
- return old_capacity; -} - -void VKFencedPool::Grow() { - const std::size_t old_capacity = watches.size(); - watches.resize(old_capacity + grow_step); - std::generate(watches.begin() + old_capacity, watches.end(), - []() { return std::make_unique<VKFenceWatch>(); }); - Allocate(old_capacity, old_capacity + grow_step); -} - -VKResourceManager::VKResourceManager(const VKDevice& device) : device{device} { - GrowFences(FENCES_GROW_STEP); - command_buffer_pool = std::make_unique<CommandBufferPool>(device); -} - -VKResourceManager::~VKResourceManager() = default; - -VKFence& VKResourceManager::CommitFence() { - const auto StepFences = [&](bool gpu_wait, bool owner_wait) -> VKFence* { - const auto Tick = [=](auto& fence) { return fence->Tick(gpu_wait, owner_wait); }; - const auto hinted = fences.begin() + fences_iterator; - - auto it = std::find_if(hinted, fences.end(), Tick); - if (it == fences.end()) { - it = std::find_if(fences.begin(), hinted, Tick); - if (it == hinted) { - return nullptr; - } - } - fences_iterator = std::distance(fences.begin(), it) + 1; - if (fences_iterator >= fences.size()) - fences_iterator = 0; - - auto& fence = *it; - fence->Commit(); - return fence.get(); - }; - - VKFence* found_fence = StepFences(false, false); - if (!found_fence) { - // Try again, this time waiting. - found_fence = StepFences(true, false); - - if (!found_fence) { - // Allocate new fences and try again. - LOG_INFO(Render_Vulkan, "Allocating new fences {} -> {}", fences.size(), - fences.size() + FENCES_GROW_STEP); - - GrowFences(FENCES_GROW_STEP); - found_fence = StepFences(true, false); - ASSERT(found_fence != nullptr); - } - } - return *found_fence; -} - -VkCommandBuffer VKResourceManager::CommitCommandBuffer(VKFence& fence) { - return command_buffer_pool->Commit(fence); -} - -void VKResourceManager::GrowFences(std::size_t new_fences_count) { - const std::size_t previous_size = fences.size(); - fences.resize(previous_size + new_fences_count); - - std::generate(fences.begin() + previous_size, fences.end(), - [this] { return std::make_unique<VKFence>(device); }); -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_manager.h b/src/video_core/renderer_vulkan/vk_resource_manager.h deleted file mode 100644 index f683d2276..000000000 --- a/src/video_core/renderer_vulkan/vk_resource_manager.h +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <cstddef> -#include <memory> -#include <vector> -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -class VKDevice; -class VKFence; -class VKResourceManager; - -class CommandBufferPool; - -/// Interface for a Vulkan resource -class VKResource { -public: - explicit VKResource(); - virtual ~VKResource(); - - /** - * Signals the object that an owning fence has been signaled. - * @param signaling_fence Fence that signals its usage end. - */ - virtual void OnFenceRemoval(VKFence* signaling_fence) = 0; -}; - -/** - * Fences take ownership of objects, protecting them from GPU-side or driver-side concurrent access. - * They must be commited from the resource manager. Their usage flow is: commit the fence from the - * resource manager, protect resources with it and use them, send the fence to an execution queue - * and Wait for it if needed and then call Release. Used resources will automatically be signaled - * when they are free to be reused. 
- * @brief Protects resources for concurrent usage and signals its release. - */ -class VKFence { - friend class VKResourceManager; - -public: - explicit VKFence(const VKDevice& device); - ~VKFence(); - - /** - * Waits for the fence to be signaled. - * @warning You must have ownership of the fence and it has to be previously sent to a queue to - * call this function. - */ - void Wait(); - - /** - * Releases ownership of the fence. Pass after it has been sent to an execution queue. - * Unmanaged usage of the fence after the call will result in undefined behavior because it may - * be being used for something else. - */ - void Release(); - - /// Protects a resource with this fence. - void Protect(VKResource* resource); - - /// Removes protection for a resource. - void Unprotect(VKResource* resource); - - /// Redirects one protected resource to a new address. - void RedirectProtection(VKResource* old_resource, VKResource* new_resource) noexcept; - - /// Retreives the fence. - operator VkFence() const { - return *handle; - } - -private: - /// Take ownership of the fence. - void Commit(); - - /** - * Updates the fence status. - * @warning Waiting for the owner might soft lock the execution. - * @param gpu_wait Wait for the fence to be signaled by the driver. - * @param owner_wait Wait for the owner to signal its freedom. - * @returns True if the fence is free. Waiting for gpu and owner will always return true. - */ - bool Tick(bool gpu_wait, bool owner_wait); - - const VKDevice& device; ///< Device handler - vk::Fence handle; ///< Vulkan fence - std::vector<VKResource*> protected_resources; ///< List of resources protected by this fence - bool is_owned = false; ///< The fence has been commited but not released yet. - bool is_used = false; ///< The fence has been commited but it has not been checked to be free. -}; - -/** - * A fence watch is used to keep track of the usage of a fence and protect a resource or set of - * resources without having to inherit VKResource from their handlers. - */ -class VKFenceWatch final : public VKResource { -public: - explicit VKFenceWatch(); - VKFenceWatch(VKFence& initial_fence); - VKFenceWatch(VKFenceWatch&&) noexcept; - VKFenceWatch(const VKFenceWatch&) = delete; - ~VKFenceWatch() override; - - VKFenceWatch& operator=(VKFenceWatch&&) noexcept; - - /// Waits for the fence to be released. - void Wait(); - - /** - * Waits for a previous fence and watches a new one. - * @param new_fence New fence to wait to. - */ - void Watch(VKFence& new_fence); - - /** - * Checks if it's currently being watched and starts watching it if it's available. - * @returns True if a watch has started, false if it's being watched. - */ - bool TryWatch(VKFence& new_fence); - - void OnFenceRemoval(VKFence* signaling_fence) override; - - /** - * Do not use it paired with Watch. Use TryWatch instead. - * Returns true when the watch is free. - */ - bool IsUsed() const { - return fence != nullptr; - } - -private: - VKFence* fence{}; ///< Fence watching this resource. nullptr when the watch is free. -}; - -/** - * Handles a pool of resources protected by fences. Manages resource overflow allocating more - * resources. - */ -class VKFencedPool { -public: - explicit VKFencedPool(std::size_t grow_step); - virtual ~VKFencedPool(); - -protected: - /** - * Commits a free resource and protects it with a fence. It may allocate new resources. - * @param fence Fence that protects the commited resource. - * @returns Index of the resource commited. 
- */ - std::size_t CommitResource(VKFence& fence); - - /// Called when a chunk of resources have to be allocated. - virtual void Allocate(std::size_t begin, std::size_t end) = 0; - -private: - /// Manages pool overflow allocating new resources. - std::size_t ManageOverflow(); - - /// Allocates a new page of resources. - void Grow(); - - std::size_t grow_step = 0; ///< Number of new resources created after an overflow - std::size_t free_iterator = 0; ///< Hint to where the next free resources is likely to be found - std::vector<std::unique_ptr<VKFenceWatch>> watches; ///< Set of watched resources -}; - -/** - * The resource manager handles all resources that can be protected with a fence avoiding - * driver-side or GPU-side concurrent usage. Usage is documented in VKFence. - */ -class VKResourceManager final { -public: - explicit VKResourceManager(const VKDevice& device); - ~VKResourceManager(); - - /// Commits a fence. It has to be sent to a queue and released. - VKFence& CommitFence(); - - /// Commits an unused command buffer and protects it with a fence. - VkCommandBuffer CommitCommandBuffer(VKFence& fence); - -private: - /// Allocates new fences. - void GrowFences(std::size_t new_fences_count); - - const VKDevice& device; ///< Device handler. - std::size_t fences_iterator = 0; ///< Index where a free fence is likely to be found. - std::vector<std::unique_ptr<VKFence>> fences; ///< Pool of fences. - std::unique_ptr<CommandBufferPool> command_buffer_pool; ///< Pool of command buffers. -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp new file mode 100644 index 000000000..ee274ac59 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp @@ -0,0 +1,63 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <optional> + +#include "video_core/renderer_vulkan/vk_master_semaphore.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" + +namespace Vulkan { + +ResourcePool::ResourcePool(MasterSemaphore& master_semaphore_, size_t grow_step_) + : master_semaphore{master_semaphore_}, grow_step{grow_step_} {} + +ResourcePool::~ResourcePool() = default; + +size_t ResourcePool::CommitResource() { + // Refresh semaphore to query updated results + master_semaphore.Refresh(); + + const auto search = [this](size_t begin, size_t end) -> std::optional<size_t> { + for (size_t iterator = begin; iterator < end; ++iterator) { + if (master_semaphore.IsFree(ticks[iterator])) { + ticks[iterator] = master_semaphore.CurrentTick(); + return iterator; + } + } + return {}; + }; + // Try to find a free resource from the hinted position to the end. + auto found = search(free_iterator, ticks.size()); + if (!found) { + // Search from beginning to the hinted position. + found = search(0, free_iterator); + if (!found) { + // Both searches failed, the pool is full; handle it. + const size_t free_resource = ManageOverflow(); + + ticks[free_resource] = master_semaphore.CurrentTick(); + found = free_resource; + } + } + // Free iterator is hinted to the resource after the one that's been commited. + free_iterator = (*found + 1) % ticks.size(); + return *found; +} + +size_t ResourcePool::ManageOverflow() { + const size_t old_capacity = ticks.size(); + Grow(); + + // The last entry is guaranted to be free, since it's the first element of the freshly + // allocated resources. 
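The new ResourcePool above replaces fences and fence watches with a single u64 tick per slot, checked against the MasterSemaphore. A derived pool only has to implement Allocate() and call CommitResource() to obtain an index whose previous user the GPU has already finished with. A hypothetical subclass, purely to illustrate the intended usage (the concrete pools added by this change, such as the command buffer pool, differ in detail):

// Illustrative ResourcePool subclass: the pool owns the payload objects and
// CommitResource() hands back a slot that is safe to reuse. Names are made up.
#include <cstddef>
#include <memory>
#include <vector>

class StagingPool final : public ResourcePool {
public:
    explicit StagingPool(MasterSemaphore& master_semaphore, std::size_t buffer_size_)
        : ResourcePool(master_semaphore, 32), buffer_size{buffer_size_} {}

    std::byte* Commit() {
        // CommitResource() refreshes the semaphore, finds (or grows to) a free
        // slot, stamps it with the current tick, and returns its index.
        return buffers[CommitResource()].get();
    }

protected:
    void Allocate(std::size_t begin, std::size_t end) override {
        buffers.resize(end);
        for (std::size_t i = begin; i < end; ++i) {
            buffers[i] = std::make_unique<std::byte[]>(buffer_size);
        }
    }

private:
    std::size_t buffer_size;
    std::vector<std::unique_ptr<std::byte[]>> buffers;
};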
+ return old_capacity; +} + +void ResourcePool::Grow() { + const size_t old_capacity = ticks.size(); + ticks.resize(old_capacity + grow_step); + Allocate(old_capacity, old_capacity + grow_step); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.h b/src/video_core/renderer_vulkan/vk_resource_pool.h new file mode 100644 index 000000000..a018c7ec2 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_resource_pool.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include "common/common_types.h" + +namespace Vulkan { + +class MasterSemaphore; + +/** + * Handles a pool of resources protected by fences. Manages resource overflow allocating more + * resources. + */ +class ResourcePool { +public: + explicit ResourcePool(MasterSemaphore& master_semaphore, size_t grow_step); + virtual ~ResourcePool(); + +protected: + size_t CommitResource(); + + /// Called when a chunk of resources have to be allocated. + virtual void Allocate(size_t begin, size_t end) = 0; + +private: + /// Manages pool overflow allocating new resources. + size_t ManageOverflow(); + + /// Allocates a new page of resources. + void Grow(); + + MasterSemaphore& master_semaphore; + size_t grow_step = 0; ///< Number of new resources created after an overflow + size_t free_iterator = 0; ///< Hint to where the next free resources is likely to be found + std::vector<u64> ticks; ///< Ticks for each resource +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index 07bbcf520..b068888f9 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -2,16 +2,15 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <cstring> -#include <optional> #include <unordered_map> -#include "common/assert.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/textures/texture.h" +using Tegra::Texture::TextureMipmapFilter; + namespace Vulkan { namespace { @@ -42,26 +41,39 @@ VKSamplerCache::VKSamplerCache(const VKDevice& device) : device{device} {} VKSamplerCache::~VKSamplerCache() = default; vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const { - VkSamplerCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.magFilter = MaxwellToVK::Sampler::Filter(tsc.mag_filter); - ci.minFilter = MaxwellToVK::Sampler::Filter(tsc.min_filter); - ci.mipmapMode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter); - ci.addressModeU = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter); - ci.addressModeV = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter); - ci.addressModeW = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter); - ci.mipLodBias = tsc.GetLodBias(); - ci.anisotropyEnable = tsc.GetMaxAnisotropy() > 1.0f ? 
VK_TRUE : VK_FALSE; - ci.maxAnisotropy = tsc.GetMaxAnisotropy(); - ci.compareEnable = tsc.depth_compare_enabled; - ci.compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func); - ci.minLod = tsc.GetMinLod(); - ci.maxLod = tsc.GetMaxLod(); - ci.borderColor = ConvertBorderColor(tsc.GetBorderColor()); - ci.unnormalizedCoordinates = VK_FALSE; - return device.GetLogical().CreateSampler(ci); + const bool arbitrary_borders = device.IsExtCustomBorderColorSupported(); + const std::array color = tsc.GetBorderColor(); + + VkSamplerCustomBorderColorCreateInfoEXT border{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT, + .pNext = nullptr, + .customBorderColor = {}, + .format = VK_FORMAT_UNDEFINED, + }; + std::memcpy(&border.customBorderColor, color.data(), sizeof(color)); + + return device.GetLogical().CreateSampler({ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = arbitrary_borders ? &border : nullptr, + .flags = 0, + .magFilter = MaxwellToVK::Sampler::Filter(tsc.mag_filter), + .minFilter = MaxwellToVK::Sampler::Filter(tsc.min_filter), + .mipmapMode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), + .addressModeU = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), + .addressModeV = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), + .addressModeW = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), + .mipLodBias = tsc.GetLodBias(), + .anisotropyEnable = + static_cast<VkBool32>(tsc.GetMaxAnisotropy() > 1.0f ? VK_TRUE : VK_FALSE), + .maxAnisotropy = tsc.GetMaxAnisotropy(), + .compareEnable = tsc.depth_compare_enabled, + .compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), + .minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod(), + .maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod(), + .borderColor = + arbitrary_borders ? 
VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color), + .unnormalizedCoordinates = VK_FALSE, + }); } VkSampler VKSamplerCache::ToSamplerType(const vk::Sampler& sampler) const { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 900f551b3..1a483dc71 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -8,11 +8,12 @@ #include <thread> #include <utility> -#include "common/assert.h" #include "common/microprofile.h" +#include "common/thread.h" +#include "video_core/renderer_vulkan/vk_command_pool.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_query_cache.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -35,10 +36,10 @@ void VKScheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf) { last = nullptr; } -VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, - StateTracker& state_tracker) - : device{device}, resource_manager{resource_manager}, state_tracker{state_tracker}, - next_fence{&resource_manager.CommitFence()} { +VKScheduler::VKScheduler(const VKDevice& device_, StateTracker& state_tracker_) + : device{device_}, state_tracker{state_tracker_}, + master_semaphore{std::make_unique<MasterSemaphore>(device)}, + command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} { AcquireNewChunk(); AllocateNewContext(); worker_thread = std::thread(&VKScheduler::WorkerThread, this); @@ -50,20 +51,27 @@ VKScheduler::~VKScheduler() { worker_thread.join(); } -void VKScheduler::Flush(bool release_fence, VkSemaphore semaphore) { +u64 VKScheduler::CurrentTick() const noexcept { + return master_semaphore->CurrentTick(); +} + +bool VKScheduler::IsFree(u64 tick) const noexcept { + return master_semaphore->IsFree(tick); +} + +void VKScheduler::Wait(u64 tick) { + master_semaphore->Wait(tick); +} + +void VKScheduler::Flush(VkSemaphore semaphore) { SubmitExecution(semaphore); - if (release_fence) { - current_fence->Release(); - } AllocateNewContext(); } -void VKScheduler::Finish(bool release_fence, VkSemaphore semaphore) { +void VKScheduler::Finish(VkSemaphore semaphore) { + const u64 presubmit_tick = CurrentTick(); SubmitExecution(semaphore); - current_fence->Wait(); - if (release_fence) { - current_fence->Release(); - } + Wait(presubmit_tick); AllocateNewContext(); } @@ -100,16 +108,19 @@ void VKScheduler::RequestRenderpass(VkRenderPass renderpass, VkFramebuffer frame state.framebuffer = framebuffer; state.render_area = render_area; - VkRenderPassBeginInfo renderpass_bi; - renderpass_bi.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - renderpass_bi.pNext = nullptr; - renderpass_bi.renderPass = renderpass; - renderpass_bi.framebuffer = framebuffer; - renderpass_bi.renderArea.offset.x = 0; - renderpass_bi.renderArea.offset.y = 0; - renderpass_bi.renderArea.extent = render_area; - renderpass_bi.clearValueCount = 0; - renderpass_bi.pClearValues = nullptr; + const VkRenderPassBeginInfo renderpass_bi{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .pNext = nullptr, + .renderPass = renderpass, + .framebuffer = framebuffer, + .renderArea = + { + .offset = {.x = 0, .y = 0}, + .extent = render_area, + }, + .clearValueCount = 0, + .pClearValues = nullptr, 
+ }; Record([renderpass_bi, end_renderpass](vk::CommandBuffer cmdbuf) { if (end_renderpass) { @@ -134,6 +145,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) { } void VKScheduler::WorkerThread() { + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); std::unique_lock lock{mutex}; do { cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; }); @@ -156,35 +168,58 @@ void VKScheduler::SubmitExecution(VkSemaphore semaphore) { current_cmdbuf.End(); - VkSubmitInfo submit_info; - submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_info.pNext = nullptr; - submit_info.waitSemaphoreCount = 0; - submit_info.pWaitSemaphores = nullptr; - submit_info.pWaitDstStageMask = nullptr; - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = current_cmdbuf.address(); - submit_info.signalSemaphoreCount = semaphore ? 1 : 0; - submit_info.pSignalSemaphores = &semaphore; - device.GetGraphicsQueue().Submit(submit_info, *current_fence); + const VkSemaphore timeline_semaphore = master_semaphore->Handle(); + const u32 num_signal_semaphores = semaphore ? 2U : 1U; + + const u64 signal_value = master_semaphore->CurrentTick(); + const u64 wait_value = signal_value - 1; + const VkPipelineStageFlags wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + + master_semaphore->NextTick(); + + const std::array signal_values{signal_value, u64(0)}; + const std::array signal_semaphores{timeline_semaphore, semaphore}; + + const VkTimelineSemaphoreSubmitInfoKHR timeline_si{ + .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR, + .pNext = nullptr, + .waitSemaphoreValueCount = 1, + .pWaitSemaphoreValues = &wait_value, + .signalSemaphoreValueCount = num_signal_semaphores, + .pSignalSemaphoreValues = signal_values.data(), + }; + const VkSubmitInfo submit_info{ + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = &timeline_si, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &timeline_semaphore, + .pWaitDstStageMask = &wait_stage_mask, + .commandBufferCount = 1, + .pCommandBuffers = current_cmdbuf.address(), + .signalSemaphoreCount = num_signal_semaphores, + .pSignalSemaphores = signal_semaphores.data(), + }; + switch (const VkResult result = device.GetGraphicsQueue().Submit(submit_info)) { + case VK_SUCCESS: + break; + case VK_ERROR_DEVICE_LOST: + device.ReportLoss(); + [[fallthrough]]; + default: + vk::Check(result); + } } void VKScheduler::AllocateNewContext() { - ++ticks; - - VkCommandBufferBeginInfo cmdbuf_bi; - cmdbuf_bi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - cmdbuf_bi.pNext = nullptr; - cmdbuf_bi.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - cmdbuf_bi.pInheritanceInfo = nullptr; - std::unique_lock lock{mutex}; - current_fence = next_fence; - next_fence = &resource_manager.CommitFence(); - current_cmdbuf = vk::CommandBuffer(resource_manager.CommitCommandBuffer(*current_fence), - device.GetDispatchLoader()); - current_cmdbuf.Begin(cmdbuf_bi); + current_cmdbuf = vk::CommandBuffer(command_pool->Commit(), device.GetDispatchLoader()); + current_cmdbuf.Begin({ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .pNext = nullptr, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + .pInheritanceInfo = nullptr, + }); // Enable counters once again. These are disabled when a command buffer is finished. 
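SubmitExecution() above now drives everything off a single VK_KHR_timeline_semaphore counter: each submission waits on the previous tick, signals the next one (plus the optional binary semaphore used for presentation), and the host can wait on or poll any tick. The MasterSemaphore wrapper itself lives in vk_master_semaphore.h and is not shown in this diff; the raw primitives it is assumed to sit on look roughly like this (extension entry points are normally fetched through the dispatch loader):

#include <cstdint>
#include <vulkan/vulkan.h>

// Create a timeline semaphore whose 64-bit counter starts at zero.
VkSemaphore CreateTimelineSemaphore(VkDevice device) {
    const VkSemaphoreTypeCreateInfoKHR type_ci{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR,
        .pNext = nullptr,
        .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR,
        .initialValue = 0,
    };
    const VkSemaphoreCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
        .pNext = &type_ci,
        .flags = 0,
    };
    VkSemaphore semaphore = VK_NULL_HANDLE;
    vkCreateSemaphore(device, &ci, nullptr, &semaphore);
    return semaphore;
}

// Host-side wait until the counter reaches 'tick'; this is the primitive behind
// Wait(tick). IsFree(tick) can be built on vkGetSemaphoreCounterValueKHR instead.
void WaitForTick(VkDevice device, VkSemaphore timeline, uint64_t tick) {
    const VkSemaphoreWaitInfoKHR wait_info{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR,
        .pNext = nullptr,
        .flags = 0,
        .semaphoreCount = 1,
        .pSemaphores = &timeline,
        .pValues = &tick,
    };
    vkWaitSemaphoresKHR(device, &wait_info, UINT64_MAX);
}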
if (query_cache) { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 82a8adc69..7be8a19f0 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -7,7 +7,6 @@ #include <atomic> #include <condition_variable> #include <memory> -#include <optional> #include <stack> #include <thread> #include <utility> @@ -17,42 +16,33 @@ namespace Vulkan { +class CommandPool; +class MasterSemaphore; class StateTracker; class VKDevice; -class VKFence; class VKQueryCache; -class VKResourceManager; - -class VKFenceView { -public: - VKFenceView() = default; - VKFenceView(VKFence* const& fence) : fence{fence} {} - - VKFence* operator->() const noexcept { - return fence; - } - - operator VKFence&() const noexcept { - return *fence; - } - -private: - VKFence* const& fence; -}; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. class VKScheduler { public: - explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager, - StateTracker& state_tracker); + explicit VKScheduler(const VKDevice& device, StateTracker& state_tracker); ~VKScheduler(); + /// Returns the current command buffer tick. + [[nodiscard]] u64 CurrentTick() const noexcept; + + /// Returns true when a tick has been triggered by the GPU. + [[nodiscard]] bool IsFree(u64 tick) const noexcept; + + /// Waits for the given tick to trigger on the GPU. + void Wait(u64 tick); + /// Sends the current execution context to the GPU. - void Flush(bool release_fence = true, VkSemaphore semaphore = nullptr); + void Flush(VkSemaphore semaphore = nullptr); /// Sends the current execution context to the GPU and waits for it to complete. - void Finish(bool release_fence = true, VkSemaphore semaphore = nullptr); + void Finish(VkSemaphore semaphore = nullptr); /// Waits for the worker thread to finish executing everything. After this function returns it's /// safe to touch worker resources. @@ -87,14 +77,9 @@ public: (void)chunk->Record(command); } - /// Gets a reference to the current fence. - VKFenceView GetFence() const { - return current_fence; - } - - /// Returns the current command buffer tick. - u64 Ticks() const { - return ticks; + /// Returns the master timeline semaphore. 
+ [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept { + return *master_semaphore; } private: @@ -172,6 +157,13 @@ private: std::array<u8, 0x8000> data{}; }; + struct State { + VkRenderPass renderpass = nullptr; + VkFramebuffer framebuffer = nullptr; + VkExtent2D render_area = {0, 0}; + VkPipeline graphics_pipeline = nullptr; + }; + void WorkerThread(); void SubmitExecution(VkSemaphore semaphore); @@ -187,30 +179,23 @@ private: void AcquireNewChunk(); const VKDevice& device; - VKResourceManager& resource_manager; StateTracker& state_tracker; + std::unique_ptr<MasterSemaphore> master_semaphore; + std::unique_ptr<CommandPool> command_pool; + VKQueryCache* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; - VKFence* current_fence = nullptr; - VKFence* next_fence = nullptr; - - struct State { - VkRenderPass renderpass = nullptr; - VkFramebuffer framebuffer = nullptr; - VkExtent2D render_area = {0, 0}; - VkPipeline graphics_pipeline = nullptr; - } state; std::unique_ptr<CommandChunk> chunk; std::thread worker_thread; + State state; Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_queue; Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve; std::mutex mutex; std::condition_variable cv; - std::atomic<u64> ticks = 0; bool quit = false; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index aaa138f52..a20452b87 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -103,8 +103,8 @@ struct GenericVaryingDescription { }; spv::Dim GetSamplerDim(const Sampler& sampler) { - ASSERT(!sampler.IsBuffer()); - switch (sampler.GetType()) { + ASSERT(!sampler.is_buffer); + switch (sampler.type) { case Tegra::Shader::TextureType::Texture1D: return spv::Dim::Dim1D; case Tegra::Shader::TextureType::Texture2D: @@ -114,13 +114,13 @@ spv::Dim GetSamplerDim(const Sampler& sampler) { case Tegra::Shader::TextureType::TextureCube: return spv::Dim::Cube; default: - UNIMPLEMENTED_MSG("Unimplemented sampler type={}", static_cast<u32>(sampler.GetType())); + UNIMPLEMENTED_MSG("Unimplemented sampler type={}", static_cast<int>(sampler.type)); return spv::Dim::Dim2D; } } std::pair<spv::Dim, bool> GetImageDim(const Image& image) { - switch (image.GetType()) { + switch (image.type) { case Tegra::Shader::ImageType::Texture1D: return {spv::Dim::Dim1D, false}; case Tegra::Shader::ImageType::TextureBuffer: @@ -134,7 +134,7 @@ std::pair<spv::Dim, bool> GetImageDim(const Image& image) { case Tegra::Shader::ImageType::Texture3D: return {spv::Dim::Dim3D, false}; default: - UNIMPLEMENTED_MSG("Unimplemented image type={}", static_cast<u32>(image.GetType())); + UNIMPLEMENTED_MSG("Unimplemented image type={}", static_cast<int>(image.type)); return {spv::Dim::Dim2D, false}; } } @@ -272,12 +272,19 @@ bool IsPrecise(Operation operand) { return false; } +u32 ShaderVersion(const VKDevice& device) { + if (device.InstanceApiVersion() < VK_API_VERSION_1_1) { + return 0x00010000; + } + return 0x00010300; +} + class SPIRVDecompiler final : public Sirit::Module { public: explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage, const Registry& registry, const Specialization& specialization) - : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()}, - registry{registry}, specialization{specialization} { + : Module(ShaderVersion(device)), device{device}, ir{ir}, stage{stage}, + header{ir.GetHeader()}, 
registry{registry}, specialization{specialization} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -293,6 +300,7 @@ public: AddCapability(spv::Capability::DrawParameters); AddCapability(spv::Capability::SubgroupBallotKHR); AddCapability(spv::Capability::SubgroupVoteKHR); + AddExtension("SPV_KHR_16bit_storage"); AddExtension("SPV_KHR_shader_ballot"); AddExtension("SPV_KHR_subgroup_vote"); AddExtension("SPV_KHR_storage_buffer_storage_class"); @@ -400,8 +408,9 @@ private: u32 binding = specialization.base_binding; binding = DeclareConstantBuffers(binding); binding = DeclareGlobalBuffers(binding); - binding = DeclareTexelBuffers(binding); + binding = DeclareUniformTexels(binding); binding = DeclareSamplers(binding); + binding = DeclareStorageTexels(binding); binding = DeclareImages(binding); const Id main = OpFunction(t_void, {}, TypeFunction(t_void)); @@ -515,6 +524,16 @@ private: void DeclareCommon() { thread_id = DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id"); + thread_masks[0] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask"); + thread_masks[1] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask"); + thread_masks[2] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask"); + thread_masks[3] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask"); + thread_masks[4] = + DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask"); } void DeclareVertex() { @@ -674,13 +693,19 @@ private: } t_smem_uint = TypePointer(spv::StorageClass::Workgroup, t_uint); - const u32 smem_size = specialization.shared_memory_size; + u32 smem_size = specialization.shared_memory_size * 4; if (smem_size == 0) { // Avoid declaring an empty array. 
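The magic numbers returned by the ShaderVersion() helper above are the SPIR-V module version word, which packs the version as 0x00MMmm00: 0x00010000 is SPIR-V 1.0 (the safe choice for Vulkan 1.0 instances) and 0x00010300 is SPIR-V 1.3 (guaranteed only on Vulkan 1.1+). A small self-contained illustration of that encoding:

// SPIR-V header version word: bytes are 0 | major | minor | 0.
#include <cstdint>

constexpr std::uint32_t MakeSpirvVersion(std::uint32_t major, std::uint32_t minor) {
    return (major << 16) | (minor << 8);
}
static_assert(MakeSpirvVersion(1, 0) == 0x00010000);  // Vulkan 1.0 baseline
static_assert(MakeSpirvVersion(1, 3) == 0x00010300);  // requires Vulkan 1.1+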
return; } - const auto element_count = static_cast<u32>(Common::AlignUp(smem_size, 4) / 4); - const Id type_array = TypeArray(t_uint, Constant(t_uint, element_count)); + const u32 limit = device.GetMaxComputeSharedMemorySize(); + if (smem_size > limit) { + LOG_ERROR(Render_Vulkan, "Shared memory size {} is clamped to host's limit {}", + smem_size, limit); + smem_size = limit; + } + + const Id type_array = TypeArray(t_uint, Constant(t_uint, smem_size / 4)); const Id type_pointer = TypePointer(spv::StorageClass::Workgroup, type_array); Name(type_pointer, "SharedMemory"); @@ -689,9 +714,9 @@ private: } void DeclareInternalFlags() { - constexpr std::array names = {"zero", "sign", "carry", "overflow"}; + static constexpr std::array names{"zero", "sign", "carry", "overflow"}; + for (std::size_t flag = 0; flag < INTERNAL_FLAGS_COUNT; ++flag) { - const auto flag_code = static_cast<InternalFlag>(flag); const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false); internal_flags[flag] = AddGlobalVariable(Name(id, names[flag])); } @@ -731,8 +756,10 @@ private: if (!IsGenericAttribute(index)) { continue; } - const u32 location = GetGenericAttributeLocation(index); + if (!IsAttributeEnabled(location)) { + continue; + } const auto type_descriptor = GetAttributeType(location); Id type; if (IsInputAttributeArray()) { @@ -877,13 +904,13 @@ private: return binding; } - u32 DeclareTexelBuffers(u32 binding) { + u32 DeclareUniformTexels(u32 binding) { for (const auto& sampler : ir.GetSamplers()) { - if (!sampler.IsBuffer()) { + if (!sampler.is_buffer) { continue; } - ASSERT(!sampler.IsArray()); - ASSERT(!sampler.IsShadow()); + ASSERT(!sampler.is_array); + ASSERT(!sampler.is_shadow); constexpr auto dim = spv::Dim::Buffer; constexpr int depth = 0; @@ -894,23 +921,23 @@ private: const Id image_type = TypeImage(t_float, dim, depth, arrayed, ms, sampled, format); const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); - AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex()))); + AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.index))); Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - texel_buffers.emplace(sampler.GetIndex(), TexelBuffer{image_type, id}); + uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id}); } return binding; } u32 DeclareSamplers(u32 binding) { for (const auto& sampler : ir.GetSamplers()) { - if (sampler.IsBuffer()) { + if (sampler.is_buffer) { continue; } const auto dim = GetSamplerDim(sampler); - const int depth = sampler.IsShadow() ? 1 : 0; - const int arrayed = sampler.IsArray() ? 1 : 0; + const int depth = sampler.is_shadow ? 1 : 0; + const int arrayed = sampler.is_array ? 1 : 0; constexpr bool ms = false; constexpr int sampled = 1; constexpr auto format = spv::ImageFormat::Unknown; @@ -918,46 +945,63 @@ private: const Id sampler_type = TypeSampledImage(image_type); const Id sampler_pointer_type = TypePointer(spv::StorageClass::UniformConstant, sampler_type); - const Id type = sampler.IsIndexed() - ? TypeArray(sampler_type, Constant(t_uint, sampler.Size())) + const Id type = sampler.is_indexed + ? 
TypeArray(sampler_type, Constant(t_uint, sampler.size)) : sampler_type; const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, type); const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); - AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.GetIndex()))); + AddGlobalVariable(Name(id, fmt::format("sampler_{}", sampler.index))); Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - sampled_images.emplace(sampler.GetIndex(), SampledImage{image_type, sampler_type, - sampler_pointer_type, id}); + sampled_images.emplace( + sampler.index, SampledImage{image_type, sampler_type, sampler_pointer_type, id}); } return binding; } - u32 DeclareImages(u32 binding) { + u32 DeclareStorageTexels(u32 binding) { for (const auto& image : ir.GetImages()) { - const auto [dim, arrayed] = GetImageDim(image); - constexpr int depth = 0; - constexpr bool ms = false; - constexpr int sampled = 2; // This won't be accessed with a sampler - constexpr auto format = spv::ImageFormat::Unknown; - const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); - const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); - const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); - AddGlobalVariable(Name(id, fmt::format("image_{}", image.GetIndex()))); - - Decorate(id, spv::Decoration::Binding, binding++); - Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - if (image.IsRead() && !image.IsWritten()) { - Decorate(id, spv::Decoration::NonWritable); - } else if (image.IsWritten() && !image.IsRead()) { - Decorate(id, spv::Decoration::NonReadable); + if (image.type != Tegra::Shader::ImageType::TextureBuffer) { + continue; } + DeclareImage(image, binding); + } + return binding; + } - images.emplace(static_cast<u32>(image.GetIndex()), StorageImage{image_type, id}); + u32 DeclareImages(u32 binding) { + for (const auto& image : ir.GetImages()) { + if (image.type == Tegra::Shader::ImageType::TextureBuffer) { + continue; + } + DeclareImage(image, binding); } return binding; } + void DeclareImage(const Image& image, u32& binding) { + const auto [dim, arrayed] = GetImageDim(image); + constexpr int depth = 0; + constexpr bool ms = false; + constexpr int sampled = 2; // This won't be accessed with a sampler + const auto format = image.is_atomic ? 
spv::ImageFormat::R32ui : spv::ImageFormat::Unknown; + const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {}); + const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type); + const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant); + AddGlobalVariable(Name(id, fmt::format("image_{}", image.index))); + + Decorate(id, spv::Decoration::Binding, binding++); + Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); + if (image.is_read && !image.is_written) { + Decorate(id, spv::Decoration::NonWritable); + } else if (image.is_written && !image.is_read) { + Decorate(id, spv::Decoration::NonReadable); + } + + images.emplace(image.index, StorageImage{image_type, id}); + } + bool IsRenderTargetEnabled(u32 rt) const { for (u32 component = 0; component < 4; ++component) { if (header.ps.IsColorComponentOutputEnabled(rt, component)) { @@ -976,6 +1020,10 @@ private: return stage == ShaderType::TesselationControl; } + bool IsAttributeEnabled(u32 location) const { + return stage != ShaderType::Vertex || specialization.enabled_attributes[location]; + } + u32 GetNumInputVertices() const { switch (stage) { case ShaderType::Geometry: @@ -1071,8 +1119,7 @@ private: void VisitBasicBlock(const NodeBlock& bb) { for (const auto& node : bb) { - [[maybe_unused]] const Type type = Visit(node).type; - ASSERT(type == Type::Void); + Visit(node); } } @@ -1192,16 +1239,20 @@ private: UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element); return {v_float_zero, Type::Float}; default: - if (IsGenericAttribute(attribute)) { - const u32 location = GetGenericAttributeLocation(attribute); - const auto type_descriptor = GetAttributeType(location); - const Type type = type_descriptor.type; - const Id attribute_id = input_attributes.at(attribute); - const std::vector elements = {element}; - const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements); - return {OpLoad(GetTypeDefinition(type), pointer), type}; + if (!IsGenericAttribute(attribute)) { + break; } - break; + const u32 location = GetGenericAttributeLocation(attribute); + if (!IsAttributeEnabled(location)) { + // Disabled attributes (also known as constant attributes) always return zero. + return {v_float_zero, Type::Float}; + } + const auto type_descriptor = GetAttributeType(location); + const Type type = type_descriptor.type; + const Id attribute_id = input_attributes.at(attribute); + const std::vector elements = {element}; + const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements); + return {OpLoad(GetTypeDefinition(type), pointer), type}; } UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); return {v_float_zero, Type::Float}; @@ -1237,7 +1288,7 @@ private: } else { UNREACHABLE_MSG("Unmanaged offset node type"); } - pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index, + pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index, buffer_element); } return {OpLoad(t_float, pointer), Type::Float}; @@ -1362,7 +1413,9 @@ private: Expression target{}; if (const auto gpr = std::get_if<GprNode>(&*dest)) { if (gpr->GetIndex() == Register::ZeroIndex) { - // Writing to Register::ZeroIndex is a no op + // Writing to Register::ZeroIndex is a no op but we still have to visit its source + // because it might have side effects. 
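DeclareImage() above picks spv::ImageFormat::R32ui for images flagged as atomic because SPIR-V image atomics go through OpImageTexelPointer, which needs a concrete 32-bit integer texel format rather than Unknown. The AtomicImage helper further down builds exactly that sequence; in isolation, and using the same builder calls as the surrounding code, it amounts to something like the following (hypothetical standalone helper, decompiler member context assumed):

// Hypothetical member-style helper showing the sequence an is_atomic image
// enables. image_variable is the OpVariable declared with the R32ui image type;
// coords are signed integer texel coordinates.
Id AtomicAddTexel(Id image_variable, Id coords, Id value) {
    // OpImageTexelPointer takes the image *variable* (a pointer), not a loaded image.
    const Id sample = Constant(t_uint, 0);
    const Id pointer = OpImageTexelPointer(t_image_uint, image_variable, coords, sample);
    const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
    const Id semantics = Constant(t_uint, 0);
    return OpAtomicIAdd(t_uint, pointer, scope, semantics, value);
}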
+ Visit(src); return {}; } target = {registers.at(gpr->GetIndex()), Type::Float}; @@ -1584,6 +1637,15 @@ private: return {OpCompositeConstruct(t_half, low, high), Type::HalfFloat}; } + Expression LogicalAddCarry(Operation operation) { + const Id op_a = AsUint(Visit(operation[0])); + const Id op_b = AsUint(Visit(operation[1])); + + const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b); + const Id carry = OpCompositeExtract(t_uint, result, 1); + return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool}; + } + Expression LogicalAssign(Operation operation) { const Node& dest = operation[0]; const Node& src = operation[1]; @@ -1609,13 +1671,31 @@ private: return {}; } + Expression LogicalFOrdered(Operation operation) { + // Emulate SPIR-V's OpOrdered + const Id op_a = AsFloat(Visit(operation[0])); + const Id op_b = AsFloat(Visit(operation[1])); + const Id is_num_a = OpFOrdEqual(t_bool, op_a, op_a); + const Id is_num_b = OpFOrdEqual(t_bool, op_b, op_b); + return {OpLogicalAnd(t_bool, is_num_a, is_num_b), Type::Bool}; + } + + Expression LogicalFUnordered(Operation operation) { + // Emulate SPIR-V's OpUnordered + const Id op_a = AsFloat(Visit(operation[0])); + const Id op_b = AsFloat(Visit(operation[1])); + const Id is_nan_a = OpIsNan(t_bool, op_a); + const Id is_nan_b = OpIsNan(t_bool, op_b); + return {OpLogicalOr(t_bool, is_nan_a, is_nan_b), Type::Bool}; + } + Id GetTextureSampler(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); - ASSERT(!meta.sampler.IsBuffer()); + ASSERT(!meta.sampler.is_buffer); - const auto& entry = sampled_images.at(meta.sampler.GetIndex()); + const auto& entry = sampled_images.at(meta.sampler.index); Id sampler = entry.variable; - if (meta.sampler.IsIndexed()) { + if (meta.sampler.is_indexed) { const Id index = AsInt(Visit(meta.index)); sampler = OpAccessChain(entry.sampler_pointer_type, sampler, index); } @@ -1624,9 +1704,9 @@ private: Id GetTextureImage(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); - const u32 index = meta.sampler.GetIndex(); - if (meta.sampler.IsBuffer()) { - const auto& entry = texel_buffers.at(index); + const u32 index = meta.sampler.index; + if (meta.sampler.is_buffer) { + const auto& entry = uniform_texels.at(index); return OpLoad(entry.image_type, entry.image); } else { const auto& entry = sampled_images.at(index); @@ -1636,7 +1716,7 @@ private: Id GetImage(Operation operation) { const auto& meta = std::get<MetaImage>(operation.GetMeta()); - const auto entry = images.at(meta.image.GetIndex()); + const auto entry = images.at(meta.image.index); return OpLoad(entry.image_type, entry.image); } @@ -1652,7 +1732,7 @@ private: } if (const auto meta = std::get_if<MetaTexture>(&operation.GetMeta())) { // Add array coordinate for textures - if (meta->sampler.IsArray()) { + if (meta->sampler.is_array) { Id array = AsInt(Visit(meta->array)); if (type == Type::Float) { array = OpConvertSToF(t_float, array); @@ -1758,7 +1838,7 @@ private: operands.push_back(GetOffsetCoordinates(operation)); } - if (meta.sampler.IsShadow()) { + if (meta.sampler.is_shadow) { const Id dref = AsFloat(Visit(meta.depth_compare)); return {OpImageSampleDrefExplicitLod(t_float, sampler, coords, dref, mask, operands), Type::Float}; @@ -1773,7 +1853,7 @@ private: const Id coords = GetCoordinates(operation, Type::Float); Id texture{}; - if (meta.sampler.IsShadow()) { + if (meta.sampler.is_shadow) { texture = OpImageDrefGather(t_float4, GetTextureSampler(operation), coords, 
AsFloat(Visit(meta.depth_compare))); } else { @@ -1800,8 +1880,8 @@ private: } const Id lod = AsUint(Visit(operation[0])); - const std::size_t coords_count = [&]() { - switch (const auto type = meta.sampler.GetType(); type) { + const std::size_t coords_count = [&meta] { + switch (const auto type = meta.sampler.type) { case Tegra::Shader::TextureType::Texture1D: return 1; case Tegra::Shader::TextureType::Texture2D: @@ -1810,7 +1890,7 @@ private: case Tegra::Shader::TextureType::Texture3D: return 3; default: - UNREACHABLE_MSG("Invalid texture type={}", static_cast<u32>(type)); + UNREACHABLE_MSG("Invalid texture type={}", static_cast<int>(type)); return 2; } }(); @@ -1853,7 +1933,7 @@ private: const Id image = GetTextureImage(operation); const Id coords = GetCoordinates(operation, Type::Int); Id fetch; - if (meta.lod && !meta.sampler.IsBuffer()) { + if (meta.lod && !meta.sampler.is_buffer) { fetch = OpImageFetch(t_float4, image, coords, spv::ImageOperandsMask::Lod, AsInt(Visit(meta.lod))); } else { @@ -1903,39 +1983,20 @@ private: return {}; } - Expression AtomicImageAdd(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageMin(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageMax(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageAnd(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Expression AtomicImageOr(Operation operation) { - UNIMPLEMENTED(); - return {}; - } + template <Id (Module::*func)(Id, Id, Id, Id, Id)> + Expression AtomicImage(Operation operation) { + const auto& meta{std::get<MetaImage>(operation.GetMeta())}; + ASSERT(meta.values.size() == 1); - Expression AtomicImageXor(Operation operation) { - UNIMPLEMENTED(); - return {}; - } + const Id coordinate = GetCoordinates(operation, Type::Int); + const Id image = images.at(meta.image.index).image; + const Id sample = v_uint_zero; + const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample); - Expression AtomicImageExchange(Operation operation) { - UNIMPLEMENTED(); - return {}; + const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); + const Id semantics = v_uint_zero; + const Id value = AsUint(Visit(meta.values[0])); + return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; } template <Id (Module::*func)(Id, Id, Id, Id, Id)> @@ -1950,7 +2011,7 @@ private: return {v_float_zero, Type::Float}; } const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device)); - const Id semantics = Constant(t_uint, 0); + const Id semantics = v_uint_zero; const Id value = AsUint(Visit(operation[1])); return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint}; @@ -2148,14 +2209,37 @@ private: return {OpLoad(t_uint, thread_id), Type::Uint}; } + template <std::size_t index> + Expression ThreadMask(Operation) { + // TODO(Rodrigo): Handle devices with different warp sizes + const Id mask = thread_masks[index]; + return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint}; + } + Expression ShuffleIndexed(Operation operation) { const Id value = AsFloat(Visit(operation[0])); const Id index = AsUint(Visit(operation[1])); return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float}; } - Expression MemoryBarrierGL(Operation) { - const auto scope = spv::Scope::Device; + Expression Barrier(Operation) { + if (!ir.IsDecompiled()) { + LOG_ERROR(Render_Vulkan, "OpBarrier used by shader is not decompiled"); + return {}; + } + + const auto scope = 
spv::Scope::Workgroup; + const auto memory = spv::Scope::Workgroup; + const auto semantics = + spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease; + OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)), + Constant(t_uint, static_cast<u32>(memory)), + Constant(t_uint, static_cast<u32>(semantics))); + return {}; + } + + template <spv::Scope scope> + Expression MemoryBarrier(Operation) { const auto semantics = spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory | spv::MemorySemanticsMask::WorkgroupMemory | @@ -2502,7 +2586,14 @@ private: &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::Float>, &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::Float>, &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::Float>, - &SPIRVDecompiler::Unary<&Module::OpIsNan, Type::Bool, Type::Float>, + &SPIRVDecompiler::LogicalFOrdered, + &SPIRVDecompiler::LogicalFUnordered, + &SPIRVDecompiler::Binary<&Module::OpFUnordLessThan, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordEqual, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordLessThanEqual, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordGreaterThan, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordNotEqual, Type::Bool, Type::Float>, + &SPIRVDecompiler::Binary<&Module::OpFUnordGreaterThanEqual, Type::Bool, Type::Float>, &SPIRVDecompiler::Binary<&Module::OpSLessThan, Type::Bool, Type::Int>, &SPIRVDecompiler::Binary<&Module::OpIEqual, Type::Bool, Type::Int>, @@ -2518,6 +2609,8 @@ private: &SPIRVDecompiler::Binary<&Module::OpINotEqual, Type::Bool, Type::Uint>, &SPIRVDecompiler::Binary<&Module::OpUGreaterThanEqual, Type::Bool, Type::Uint>, + &SPIRVDecompiler::LogicalAddCarry, + &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool2, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool2, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool2, Type::HalfFloat>, @@ -2542,11 +2635,11 @@ private: &SPIRVDecompiler::ImageLoad, &SPIRVDecompiler::ImageStore, - &SPIRVDecompiler::AtomicImageAdd, - &SPIRVDecompiler::AtomicImageAnd, - &SPIRVDecompiler::AtomicImageOr, - &SPIRVDecompiler::AtomicImageXor, - &SPIRVDecompiler::AtomicImageExchange, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>, + &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>, &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>, &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>, @@ -2603,9 +2696,16 @@ private: &SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>, &SPIRVDecompiler::ThreadId, + &SPIRVDecompiler::ThreadMask<0>, // Eq + &SPIRVDecompiler::ThreadMask<1>, // Ge + &SPIRVDecompiler::ThreadMask<2>, // Gt + &SPIRVDecompiler::ThreadMask<3>, // Le + &SPIRVDecompiler::ThreadMask<4>, // Lt &SPIRVDecompiler::ShuffleIndexed, - &SPIRVDecompiler::MemoryBarrierGL, + &SPIRVDecompiler::Barrier, + &SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>, + &SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); @@ -2681,8 +2781,11 @@ private: Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0); const Id t_gmem_ssbo = 
TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct); + const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint); + const Id v_float_zero = Constant(t_float, 0.0f); const Id v_float_one = Constant(t_float, 1.0f); + const Id v_uint_zero = Constant(t_uint, 0); // Nvidia uses these defaults for varyings (e.g. position and generic attributes) const Id v_varying_default = @@ -2707,15 +2810,15 @@ private: std::unordered_map<u8, GenericVaryingDescription> output_attributes; std::map<u32, Id> constant_buffers; std::map<GlobalMemoryBase, Id> global_buffers; - std::map<u32, TexelBuffer> texel_buffers; + std::map<u32, TexelBuffer> uniform_texels; std::map<u32, SampledImage> sampled_images; std::map<u32, StorageImage> images; + std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id instance_index{}; Id vertex_index{}; Id base_instance{}; Id base_vertex{}; - std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id frag_depth{}; Id frag_coord{}; Id front_facing{}; @@ -2727,6 +2830,7 @@ private: Id workgroup_id{}; Id local_invocation_id{}; Id thread_id{}; + std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt VertexIndices in_indices; VertexIndices out_indices; @@ -2969,14 +3073,18 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset, usage.is_written); } for (const auto& sampler : ir.GetSamplers()) { - if (sampler.IsBuffer()) { - entries.texel_buffers.emplace_back(sampler); + if (sampler.is_buffer) { + entries.uniform_texels.emplace_back(sampler); } else { entries.samplers.emplace_back(sampler); } } for (const auto& image : ir.GetImages()) { - entries.images.emplace_back(image); + if (image.type == Tegra::Shader::ImageType::TextureBuffer) { + entries.storage_texels.emplace_back(image); + } else { + entries.images.emplace_back(image); + } } for (const auto& attribute : ir.GetInputAttributes()) { if (IsGenericAttribute(attribute)) { diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index ffea4709e..2b0e90396 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -5,11 +5,7 @@ #pragma once #include <array> -#include <bitset> -#include <memory> #include <set> -#include <type_traits> -#include <utility> #include <vector> #include "common/common_types.h" @@ -25,8 +21,9 @@ class VKDevice; namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using TexelBufferEntry = VideoCommon::Shader::Sampler; +using UniformTexelEntry = VideoCommon::Shader::Sampler; using SamplerEntry = VideoCommon::Shader::Sampler; +using StorageTexelEntry = VideoCommon::Shader::Image; using ImageEntry = VideoCommon::Shader::Image; constexpr u32 DESCRIPTOR_SET = 0; @@ -70,13 +67,15 @@ private: struct ShaderEntries { u32 NumBindings() const { return static_cast<u32>(const_buffers.size() + global_buffers.size() + - texel_buffers.size() + samplers.size() + images.size()); + uniform_texels.size() + samplers.size() + storage_texels.size() + + images.size()); } std::vector<ConstBufferEntry> const_buffers; std::vector<GlobalBufferEntry> global_buffers; - std::vector<TexelBufferEntry> texel_buffers; + std::vector<UniformTexelEntry> uniform_texels; std::vector<SamplerEntry> samplers; + std::vector<StorageTexelEntry> storage_texels; std::vector<ImageEntry> images; std::set<u32> attributes; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; @@ -92,7 
+91,8 @@ struct Specialization final { u32 shared_memory_size{}; // Graphics specific - std::optional<float> point_size{}; + std::optional<float> point_size; + std::bitset<Maxwell::NumVertexAttributes> enabled_attributes; std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; bool ndc_minus_one_to_one{}; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_util.cpp b/src/video_core/renderer_vulkan/vk_shader_util.cpp index 784839327..c1a218d76 100644 --- a/src/video_core/renderer_vulkan/vk_shader_util.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_util.cpp @@ -4,8 +4,7 @@ #include <cstring> #include <memory> -#include <vector> -#include "common/alignment.h" + #include "common/assert.h" #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_device.h" @@ -20,13 +19,13 @@ vk::ShaderModule BuildShader(const VKDevice& device, std::size_t code_size, cons const auto data = std::make_unique<u32[]>(code_size / sizeof(u32)); std::memcpy(data.get(), code_data, code_size); - VkShaderModuleCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.codeSize = code_size; - ci.pCode = data.get(); - return device.GetLogical().CreateShaderModule(ci); + return device.GetLogical().CreateShaderModule({ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = code_size, + .pCode = data.get(), + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_util.h b/src/video_core/renderer_vulkan/vk_shader_util.h index be38d6697..d1d3f3cae 100644 --- a/src/video_core/renderer_vulkan/vk_shader_util.h +++ b/src/video_core/renderer_vulkan/vk_shader_util.h @@ -4,7 +4,6 @@ #pragma once -#include <vector> #include "common/common_types.h" #include "video_core/renderer_vulkan/wrapper.h" diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 94d954d7a..2fd3b7f39 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -10,37 +10,18 @@ #include "common/bit_util.h" #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { -VKStagingBufferPool::StagingBuffer::StagingBuffer(std::unique_ptr<VKBuffer> buffer, VKFence& fence, - u64 last_epoch) - : buffer{std::move(buffer)}, watch{fence}, last_epoch{last_epoch} {} +VKStagingBufferPool::StagingBuffer::StagingBuffer(std::unique_ptr<VKBuffer> buffer_) + : buffer{std::move(buffer_)} {} -VKStagingBufferPool::StagingBuffer::StagingBuffer(StagingBuffer&& rhs) noexcept { - buffer = std::move(rhs.buffer); - watch = std::move(rhs.watch); - last_epoch = rhs.last_epoch; -} - -VKStagingBufferPool::StagingBuffer::~StagingBuffer() = default; - -VKStagingBufferPool::StagingBuffer& VKStagingBufferPool::StagingBuffer::operator=( - StagingBuffer&& rhs) noexcept { - buffer = std::move(rhs.buffer); - watch = std::move(rhs.watch); - last_epoch = rhs.last_epoch; - return *this; -} - -VKStagingBufferPool::VKStagingBufferPool(const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler) - : device{device}, memory_manager{memory_manager}, scheduler{scheduler}, - 
is_device_integrated{device.IsIntegrated()} {} +VKStagingBufferPool::VKStagingBufferPool(const VKDevice& device_, VKMemoryManager& memory_manager_, + VKScheduler& scheduler_) + : device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_} {} VKStagingBufferPool::~VKStagingBufferPool() = default; @@ -52,21 +33,19 @@ VKBuffer& VKStagingBufferPool::GetUnusedBuffer(std::size_t size, bool host_visib } void VKStagingBufferPool::TickFrame() { - ++epoch; current_delete_level = (current_delete_level + 1) % NumLevels; ReleaseCache(true); - if (!is_device_integrated) { - ReleaseCache(false); - } + ReleaseCache(false); } VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_visible) { - for (auto& entry : GetCache(host_visible)[Common::Log2Ceil64(size)].entries) { - if (entry.watch.TryWatch(scheduler.GetFence())) { - entry.last_epoch = epoch; - return &*entry.buffer; + for (StagingBuffer& entry : GetCache(host_visible)[Common::Log2Ceil64(size)].entries) { + if (!scheduler.IsFree(entry.tick)) { + continue; } + entry.tick = scheduler.CurrentTick(); + return &*entry.buffer; } return nullptr; } @@ -74,28 +53,29 @@ VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_ VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_visible) { const u32 log2 = Common::Log2Ceil64(size); - VkBufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.size = 1ULL << log2; - ci.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; - auto buffer = std::make_unique<VKBuffer>(); - buffer->handle = device.GetLogical().CreateBuffer(ci); + buffer->handle = device.GetLogical().CreateBuffer({ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = 1ULL << log2, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); buffer->commit = memory_manager.Commit(buffer->handle, host_visible); - auto& entries = GetCache(host_visible)[log2].entries; - return *entries.emplace_back(std::move(buffer), scheduler.GetFence(), epoch).buffer; + std::vector<StagingBuffer>& entries = GetCache(host_visible)[log2].entries; + StagingBuffer& entry = entries.emplace_back(std::move(buffer)); + entry.tick = scheduler.CurrentTick(); + return *entry.buffer; } VKStagingBufferPool::StagingBuffersCache& VKStagingBufferPool::GetCache(bool host_visible) { - return is_device_integrated || host_visible ? host_staging_buffers : device_staging_buffers; + return host_visible ? 
host_staging_buffers : device_staging_buffers; } void VKStagingBufferPool::ReleaseCache(bool host_visible) { @@ -113,9 +93,8 @@ u64 VKStagingBufferPool::ReleaseLevel(StagingBuffersCache& cache, std::size_t lo auto& entries = staging.entries; const std::size_t old_size = entries.size(); - const auto is_deleteable = [this](const auto& entry) { - static constexpr u64 epochs_to_destroy = 180; - return entry.last_epoch + epochs_to_destroy < epoch && !entry.watch.IsUsed(); + const auto is_deleteable = [this](const StagingBuffer& entry) { + return scheduler.IsFree(entry.tick); }; const std::size_t begin_offset = staging.delete_index; const std::size_t end_offset = std::min(begin_offset + deletions_per_tick, old_size); diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index a0840ff8c..2dd5049ac 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -5,20 +5,16 @@ #pragma once #include <climits> -#include <unordered_map> -#include <utility> #include <vector> #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/wrapper.h" namespace Vulkan { class VKDevice; -class VKFenceWatch; class VKScheduler; struct VKBuffer final { @@ -38,16 +34,10 @@ public: private: struct StagingBuffer final { - explicit StagingBuffer(std::unique_ptr<VKBuffer> buffer, VKFence& fence, u64 last_epoch); - StagingBuffer(StagingBuffer&& rhs) noexcept; - StagingBuffer(const StagingBuffer&) = delete; - ~StagingBuffer(); - - StagingBuffer& operator=(StagingBuffer&& rhs) noexcept; + explicit StagingBuffer(std::unique_ptr<VKBuffer> buffer); std::unique_ptr<VKBuffer> buffer; - VKFenceWatch watch; - u64 last_epoch = 0; + u64 tick = 0; }; struct StagingBuffers final { @@ -71,13 +61,10 @@ private: const VKDevice& device; VKMemoryManager& memory_manager; VKScheduler& scheduler; - const bool is_device_integrated; StagingBuffersCache host_staging_buffers; StagingBuffersCache device_staging_buffers; - u64 epoch = 0; - std::size_t current_delete_level = 0; }; diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp index 94a89e388..5d2c4a796 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp +++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp @@ -36,6 +36,14 @@ Flags MakeInvalidationFlags() { flags[BlendConstants] = true; flags[DepthBounds] = true; flags[StencilProperties] = true; + flags[CullMode] = true; + flags[DepthBoundsEnable] = true; + flags[DepthTestEnable] = true; + flags[DepthWriteEnable] = true; + flags[DepthCompareOp] = true; + flags[FrontFace] = true; + flags[StencilOp] = true; + flags[StencilTestEnable] = true; return flags; } @@ -75,14 +83,58 @@ void SetupDirtyStencilProperties(Tables& tables) { table[OFF(stencil_back_func_mask)] = StencilProperties; } -} // Anonymous namespace +void SetupDirtyCullMode(Tables& tables) { + auto& table = tables[0]; + table[OFF(cull_face)] = CullMode; + table[OFF(cull_test_enabled)] = CullMode; +} + +void SetupDirtyDepthBoundsEnable(Tables& tables) { + tables[0][OFF(depth_bounds_enable)] = DepthBoundsEnable; +} + +void SetupDirtyDepthTestEnable(Tables& tables) { + tables[0][OFF(depth_test_enable)] = DepthTestEnable; +} + +void SetupDirtyDepthWriteEnable(Tables& tables) { + tables[0][OFF(depth_write_enabled)] = DepthWriteEnable; +} + 
+void SetupDirtyDepthCompareOp(Tables& tables) { + tables[0][OFF(depth_test_func)] = DepthCompareOp; +} -StateTracker::StateTracker(Core::System& system) - : system{system}, invalidation_flags{MakeInvalidationFlags()} {} +void SetupDirtyFrontFace(Tables& tables) { + auto& table = tables[0]; + table[OFF(front_face)] = FrontFace; + table[OFF(screen_y_control)] = FrontFace; +} + +void SetupDirtyStencilOp(Tables& tables) { + auto& table = tables[0]; + table[OFF(stencil_front_op_fail)] = StencilOp; + table[OFF(stencil_front_op_zfail)] = StencilOp; + table[OFF(stencil_front_op_zpass)] = StencilOp; + table[OFF(stencil_front_func_func)] = StencilOp; + table[OFF(stencil_back_op_fail)] = StencilOp; + table[OFF(stencil_back_op_zfail)] = StencilOp; + table[OFF(stencil_back_op_zpass)] = StencilOp; + table[OFF(stencil_back_func_func)] = StencilOp; + + // Table 0 is used by StencilProperties + tables[1][OFF(stencil_two_side_enable)] = StencilOp; +} -void StateTracker::Initialize() { - auto& dirty = system.GPU().Maxwell3D().dirty; - auto& tables = dirty.tables; +void SetupDirtyStencilTestEnable(Tables& tables) { + tables[0][OFF(stencil_enable)] = StencilTestEnable; +} + +} // Anonymous namespace + +StateTracker::StateTracker(Tegra::GPU& gpu) + : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} { + auto& tables = gpu.Maxwell3D().dirty.tables; SetupDirtyRenderTargets(tables); SetupDirtyViewports(tables); SetupDirtyScissors(tables); @@ -90,10 +142,14 @@ void StateTracker::Initialize() { SetupDirtyBlendConstants(tables); SetupDirtyDepthBounds(tables); SetupDirtyStencilProperties(tables); -} - -void StateTracker::InvalidateCommandBufferState() { - system.GPU().Maxwell3D().dirty.flags |= invalidation_flags; + SetupDirtyCullMode(tables); + SetupDirtyDepthBoundsEnable(tables); + SetupDirtyDepthTestEnable(tables); + SetupDirtyDepthWriteEnable(tables); + SetupDirtyDepthCompareOp(tables); + SetupDirtyFrontFace(tables); + SetupDirtyStencilOp(tables); + SetupDirtyStencilTestEnable(tables); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h index 03bc415b2..1de789e57 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.h +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -26,6 +26,15 @@ enum : u8 { DepthBounds, StencilProperties, + CullMode, + DepthBoundsEnable, + DepthTestEnable, + DepthWriteEnable, + DepthCompareOp, + FrontFace, + StencilOp, + StencilTestEnable, + Last }; static_assert(Last <= std::numeric_limits<u8>::max()); @@ -33,12 +42,15 @@ static_assert(Last <= std::numeric_limits<u8>::max()); } // namespace Dirty class StateTracker { -public: - explicit StateTracker(Core::System& system); + using Maxwell = Tegra::Engines::Maxwell3D::Regs; - void Initialize(); +public: + explicit StateTracker(Tegra::GPU& gpu); - void InvalidateCommandBufferState(); + void InvalidateCommandBufferState() { + flags |= invalidation_flags; + current_topology = INVALID_TOPOLOGY; + } bool TouchViewports() { return Exchange(Dirty::Viewports, false); @@ -64,16 +76,60 @@ public: return Exchange(Dirty::StencilProperties, false); } + bool TouchCullMode() { + return Exchange(Dirty::CullMode, false); + } + + bool TouchDepthBoundsTestEnable() { + return Exchange(Dirty::DepthBoundsEnable, false); + } + + bool TouchDepthTestEnable() { + return Exchange(Dirty::DepthTestEnable, false); + } + + bool TouchDepthBoundsEnable() { + return Exchange(Dirty::DepthBoundsEnable, false); + } + + bool TouchDepthWriteEnable() { 
+ return Exchange(Dirty::DepthWriteEnable, false); + } + + bool TouchDepthCompareOp() { + return Exchange(Dirty::DepthCompareOp, false); + } + + bool TouchFrontFace() { + return Exchange(Dirty::FrontFace, false); + } + + bool TouchStencilOp() { + return Exchange(Dirty::StencilOp, false); + } + + bool TouchStencilTestEnable() { + return Exchange(Dirty::StencilTestEnable, false); + } + + bool ChangePrimitiveTopology(Maxwell::PrimitiveTopology new_topology) { + const bool has_changed = current_topology != new_topology; + current_topology = new_topology; + return has_changed; + } + private: + static constexpr auto INVALID_TOPOLOGY = static_cast<Maxwell::PrimitiveTopology>(~0u); + bool Exchange(std::size_t id, bool new_value) const noexcept { - auto& flags = system.GPU().Maxwell3D().dirty.flags; const bool is_dirty = flags[id]; flags[id] = new_value; return is_dirty; } - Core::System& system; + Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; Tegra::Engines::Maxwell3D::DirtyState::Flags invalidation_flags; + Maxwell::PrimitiveTopology current_topology = INVALID_TOPOLOGY; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index 38a93a01a..1b59612b9 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <algorithm> +#include <limits> #include <optional> #include <tuple> #include <vector> @@ -10,7 +11,6 @@ #include "common/alignment.h" #include "common/assert.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -22,27 +22,43 @@ namespace { constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; -constexpr u64 STREAM_BUFFER_SIZE = 256 * 1024 * 1024; +constexpr u64 PREFERRED_STREAM_BUFFER_SIZE = 256 * 1024 * 1024; -std::optional<u32> FindMemoryType(const VKDevice& device, u32 filter, - VkMemoryPropertyFlags wanted) { - const auto properties = device.GetPhysical().GetMemoryProperties(); - for (u32 i = 0; i < properties.memoryTypeCount; i++) { - if (!(filter & (1 << i))) { - continue; - } - if ((properties.memoryTypes[i].propertyFlags & wanted) == wanted) { +/// Find a memory type with the passed requirements +std::optional<u32> FindMemoryType(const VkPhysicalDeviceMemoryProperties& properties, + VkMemoryPropertyFlags wanted, + u32 filter = std::numeric_limits<u32>::max()) { + for (u32 i = 0; i < properties.memoryTypeCount; ++i) { + const auto flags = properties.memoryTypes[i].propertyFlags; + if ((flags & wanted) == wanted && (filter & (1U << i)) != 0) { return i; } } return std::nullopt; } +/// Get the preferred host visible memory type. +u32 GetMemoryType(const VkPhysicalDeviceMemoryProperties& properties, + u32 filter = std::numeric_limits<u32>::max()) { + // Prefer device local host visible allocations. Both AMD and Nvidia now provide one. + // Otherwise search for a host visible allocation. 
+ static constexpr auto HOST_MEMORY = + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + static constexpr auto DYNAMIC_MEMORY = HOST_MEMORY | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + + std::optional preferred_type = FindMemoryType(properties, DYNAMIC_MEMORY); + if (!preferred_type) { + preferred_type = FindMemoryType(properties, HOST_MEMORY); + ASSERT_MSG(preferred_type, "No host visible and coherent memory type found"); + } + return preferred_type.value_or(0); +} + } // Anonymous namespace -VKStreamBuffer::VKStreamBuffer(const VKDevice& device, VKScheduler& scheduler, +VKStreamBuffer::VKStreamBuffer(const VKDevice& device_, VKScheduler& scheduler_, VkBufferUsageFlags usage) - : device{device}, scheduler{scheduler} { + : device{device_}, scheduler{scheduler_} { CreateBuffers(usage); ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); @@ -51,7 +67,7 @@ VKStreamBuffer::VKStreamBuffer(const VKDevice& device, VKScheduler& scheduler, VKStreamBuffer::~VKStreamBuffer() = default; std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { - ASSERT(size <= STREAM_BUFFER_SIZE); + ASSERT(size <= stream_buffer_size); mapped_size = size; if (alignment > 0) { @@ -61,7 +77,7 @@ std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { WaitPendingOperations(offset); bool invalidated = false; - if (offset + size > STREAM_BUFFER_SIZE) { + if (offset + size > stream_buffer_size) { // The buffer would overflow, save the amount of used watches and reset the state. invalidation_mark = current_watch_cursor; current_watch_cursor = 0; @@ -94,44 +110,39 @@ void VKStreamBuffer::Unmap(u64 size) { } auto& watch = current_watches[current_watch_cursor++]; watch.upper_bound = offset; - watch.fence.Watch(scheduler.GetFence()); + watch.tick = scheduler.CurrentTick(); } void VKStreamBuffer::CreateBuffers(VkBufferUsageFlags usage) { - VkBufferCreateInfo buffer_ci; - buffer_ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - buffer_ci.pNext = nullptr; - buffer_ci.flags = 0; - buffer_ci.size = STREAM_BUFFER_SIZE; - buffer_ci.usage = usage; - buffer_ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - buffer_ci.queueFamilyIndexCount = 0; - buffer_ci.pQueueFamilyIndices = nullptr; - - const auto& dev = device.GetLogical(); - buffer = dev.CreateBuffer(buffer_ci); - - const auto& dld = device.GetDispatchLoader(); - const auto requirements = dev.GetBufferMemoryRequirements(*buffer); - // Prefer device local host visible allocations (this should hit AMD's pinned memory). - auto type = - FindMemoryType(device, requirements.memoryTypeBits, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); - if (!type) { - // Otherwise search for a host visible allocation. 
- type = FindMemoryType(device, requirements.memoryTypeBits, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); - ASSERT_MSG(type, "No host visible and coherent memory type found"); - } - VkMemoryAllocateInfo memory_ai; - memory_ai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - memory_ai.pNext = nullptr; - memory_ai.allocationSize = requirements.size; - memory_ai.memoryTypeIndex = *type; - - memory = dev.AllocateMemory(memory_ai); + const auto memory_properties = device.GetPhysical().GetMemoryProperties(); + const u32 preferred_type = GetMemoryType(memory_properties); + const u32 preferred_heap = memory_properties.memoryTypes[preferred_type].heapIndex; + + // Subtract some bytes from the preferred heap size to avoid running out of memory. + const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size; + // As per DXVK's example, using `heap_size / 2` + const VkDeviceSize allocable_size = heap_size / 2; + buffer = device.GetLogical().CreateBuffer({ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = std::min(PREFERRED_STREAM_BUFFER_SIZE, allocable_size), + .usage = usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + + const auto requirements = device.GetLogical().GetBufferMemoryRequirements(*buffer); + const u32 required_flags = requirements.memoryTypeBits; + stream_buffer_size = static_cast<u64>(requirements.size); + + memory = device.GetLogical().AllocateMemory({ + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = nullptr, + .allocationSize = requirements.size, + .memoryTypeIndex = GetMemoryType(memory_properties, required_flags), + }); buffer.BindMemory(*memory, 0); } @@ -146,7 +157,7 @@ void VKStreamBuffer::WaitPendingOperations(u64 requested_upper_bound) { while (requested_upper_bound < wait_bound && wait_cursor < *invalidation_mark) { auto& watch = previous_watches[wait_cursor]; wait_bound = watch.upper_bound; - watch.fence.Wait(); + scheduler.Wait(watch.tick); ++wait_cursor; } } diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index 58ce8b973..5e15ad78f 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -14,7 +14,6 @@ namespace Vulkan { class VKDevice; -class VKFence; class VKFenceWatch; class VKScheduler; @@ -35,13 +34,17 @@ public: /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. void Unmap(u64 size); - VkBuffer GetHandle() const { + VkBuffer Handle() const noexcept { return *buffer; } + u64 Address() const noexcept { + return 0; + } + private: - struct Watch final { - VKFenceWatch fence; + struct Watch { + u64 tick{}; u64 upper_bound{}; }; @@ -56,8 +59,9 @@ private: const VKDevice& device; ///< Vulkan device manager. VKScheduler& scheduler; ///< Command scheduler. - vk::Buffer buffer; ///< Mapped buffer. - vk::DeviceMemory memory; ///< Memory allocation. + vk::Buffer buffer; ///< Mapped buffer. + vk::DeviceMemory memory; ///< Memory allocation. + u64 stream_buffer_size{}; ///< Stream buffer size. u64 offset{}; ///< Buffer iterator. u64 mapped_size{}; ///< Size reserved for the current copy.
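Editor's note: the stream buffer allocation policy above can be restated outside of yuzu's wrapper types as a minimal sketch against the raw Vulkan C API. The helper names below (FindType, PickStreamBufferMemoryType, ClampStreamBufferSize) are illustrative and do not exist in the patch; the policy itself (prefer DEVICE_LOCAL + HOST_VISIBLE + HOST_COHERENT "pinned" memory, fall back to plain HOST_VISIBLE + HOST_COHERENT, and cap the buffer to half of the backing heap) is taken directly from the diff.

// Minimal standalone sketch, assuming the raw Vulkan C API. Helper names are hypothetical.
#include <algorithm>
#include <cstdint>
#include <optional>
#include <vulkan/vulkan.h>

// Return the first memory type allowed by 'filter' that has all 'wanted' property flags.
static std::optional<uint32_t> FindType(const VkPhysicalDeviceMemoryProperties& props,
                                        VkMemoryPropertyFlags wanted, uint32_t filter) {
    for (uint32_t i = 0; i < props.memoryTypeCount; ++i) {
        const bool allowed = (filter & (1u << i)) != 0;
        const bool matches = (props.memoryTypes[i].propertyFlags & wanted) == wanted;
        if (allowed && matches) {
            return i;
        }
    }
    return std::nullopt;
}

// Prefer device-local host-visible memory; otherwise settle for host-visible coherent memory.
static uint32_t PickStreamBufferMemoryType(VkPhysicalDevice physical, uint32_t filter) {
    VkPhysicalDeviceMemoryProperties props;
    vkGetPhysicalDeviceMemoryProperties(physical, &props);

    constexpr VkMemoryPropertyFlags host =
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
    constexpr VkMemoryPropertyFlags dynamic = host | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;

    if (const auto type = FindType(props, dynamic, filter)) {
        return *type;
    }
    return FindType(props, host, filter).value_or(0);
}

// Cap the requested size to half of the heap backing the chosen memory type,
// mirroring the heap_size / 2 limit (borrowed from DXVK) used in the diff above.
static VkDeviceSize ClampStreamBufferSize(const VkPhysicalDeviceMemoryProperties& props,
                                          uint32_t memory_type, VkDeviceSize preferred_size) {
    const uint32_t heap = props.memoryTypes[memory_type].heapIndex;
    return std::min(preferred_size, props.memoryHeaps[heap].size / 2);
}

As in the patch, the memory type would be re-evaluated after querying the created buffer's memory requirements, so that only types permitted by requirements.memoryTypeBits are considered for the final allocation.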
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index bffd8f32a..9636a7c65 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -12,7 +12,7 @@ #include "core/core.h" #include "core/frontend/framebuffer_layout.h" #include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_swapchain.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -56,8 +56,8 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi } // Anonymous namespace -VKSwapchain::VKSwapchain(VkSurfaceKHR surface, const VKDevice& device) - : surface{surface}, device{device} {} +VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const VKDevice& device_, VKScheduler& scheduler_) + : surface{surface_}, device{device_}, scheduler{scheduler_} {} VKSwapchain::~VKSwapchain() = default; @@ -75,35 +75,33 @@ void VKSwapchain::Create(u32 width, u32 height, bool srgb) { CreateSemaphores(); CreateImageViews(); - fences.resize(image_count, nullptr); + resource_ticks.clear(); + resource_ticks.resize(image_count); } void VKSwapchain::AcquireNextImage() { device.GetLogical().AcquireNextImageKHR(*swapchain, std::numeric_limits<u64>::max(), *present_semaphores[frame_index], {}, &image_index); - if (auto& fence = fences[image_index]; fence) { - fence->Wait(); - fence->Release(); - fence = nullptr; - } + scheduler.Wait(resource_ticks[image_index]); } -bool VKSwapchain::Present(VkSemaphore render_semaphore, VKFence& fence) { +bool VKSwapchain::Present(VkSemaphore render_semaphore) { const VkSemaphore present_semaphore{*present_semaphores[frame_index]}; const std::array<VkSemaphore, 2> semaphores{present_semaphore, render_semaphore}; const auto present_queue{device.GetPresentQueue()}; bool recreated = false; - VkPresentInfoKHR present_info; - present_info.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; - present_info.pNext = nullptr; - present_info.waitSemaphoreCount = render_semaphore ? 2U : 1U; - present_info.pWaitSemaphores = semaphores.data(); - present_info.swapchainCount = 1; - present_info.pSwapchains = swapchain.address(); - present_info.pImageIndices = &image_index; - present_info.pResults = nullptr; + const VkPresentInfoKHR present_info{ + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .pNext = nullptr, + .waitSemaphoreCount = render_semaphore ? 
2U : 1U, + .pWaitSemaphores = semaphores.data(), + .swapchainCount = 1, + .pSwapchains = swapchain.address(), + .pImageIndices = &image_index, + .pResults = nullptr, + }; switch (const VkResult result = present_queue.Present(present_info)) { case VK_SUCCESS: @@ -122,8 +120,7 @@ bool VKSwapchain::Present(VkSemaphore render_semaphore, VKFence& fence) { break; } - ASSERT(fences[image_index] == nullptr); - fences[image_index] = &fence; + resource_ticks[image_index] = scheduler.CurrentTick(); frame_index = (frame_index + 1) % static_cast<u32>(image_count); return recreated; } @@ -147,24 +144,26 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, requested_image_count = capabilities.maxImageCount; } - VkSwapchainCreateInfoKHR swapchain_ci; - swapchain_ci.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; - swapchain_ci.pNext = nullptr; - swapchain_ci.flags = 0; - swapchain_ci.surface = surface; - swapchain_ci.minImageCount = requested_image_count; - swapchain_ci.imageFormat = surface_format.format; - swapchain_ci.imageColorSpace = surface_format.colorSpace; - swapchain_ci.imageArrayLayers = 1; - swapchain_ci.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - swapchain_ci.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; - swapchain_ci.queueFamilyIndexCount = 0; - swapchain_ci.pQueueFamilyIndices = nullptr; - swapchain_ci.preTransform = capabilities.currentTransform; - swapchain_ci.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; - swapchain_ci.presentMode = present_mode; - swapchain_ci.clipped = VK_FALSE; - swapchain_ci.oldSwapchain = nullptr; + VkSwapchainCreateInfoKHR swapchain_ci{ + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .surface = surface, + .minImageCount = requested_image_count, + .imageFormat = surface_format.format, + .imageColorSpace = surface_format.colorSpace, + .imageExtent = {}, + .imageArrayLayers = 1, + .imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .preTransform = capabilities.currentTransform, + .compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + .presentMode = present_mode, + .clipped = VK_FALSE, + .oldSwapchain = nullptr, + }; const u32 graphics_family{device.GetGraphicsFamily()}; const u32 present_family{device.GetPresentFamily()}; @@ -173,8 +172,6 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, swapchain_ci.imageSharingMode = VK_SHARING_MODE_CONCURRENT; swapchain_ci.queueFamilyIndexCount = static_cast<u32>(queue_indices.size()); swapchain_ci.pQueueFamilyIndices = queue_indices.data(); - } else { - swapchain_ci.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; } // Request the size again to reduce the possibility of a TOCTOU race condition. 
@@ -200,20 +197,29 @@ void VKSwapchain::CreateSemaphores() { } void VKSwapchain::CreateImageViews() { - VkImageViewCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - // ci.image - ci.viewType = VK_IMAGE_VIEW_TYPE_2D; - ci.format = image_format; - ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, - VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY}; - ci.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - ci.subresourceRange.baseMipLevel = 0; - ci.subresourceRange.levelCount = 1; - ci.subresourceRange.baseArrayLayer = 0; - ci.subresourceRange.layerCount = 1; + VkImageViewCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = {}, + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = image_format, + .components = + { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }; image_views.resize(image_count); for (std::size_t i = 0; i < image_count; i++) { diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index a35d61345..6b39befdf 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -16,11 +16,11 @@ struct FramebufferLayout; namespace Vulkan { class VKDevice; -class VKFence; +class VKScheduler; class VKSwapchain { public: - explicit VKSwapchain(VkSurfaceKHR surface, const VKDevice& device); + explicit VKSwapchain(VkSurfaceKHR surface, const VKDevice& device, VKScheduler& scheduler); ~VKSwapchain(); /// Creates (or recreates) the swapchain with a given size. @@ -31,7 +31,7 @@ public: /// Presents the rendered image to the swapchain. Returns true when the swapchains had to be /// recreated. Takes responsability for the ownership of fence. - bool Present(VkSemaphore render_semaphore, VKFence& fence); + bool Present(VkSemaphore render_semaphore); /// Returns true when the framebuffer layout has changed. 
bool HasFramebufferChanged(const Layout::FramebufferLayout& framebuffer) const; @@ -74,6 +74,7 @@ private: const VkSurfaceKHR surface; const VKDevice& device; + VKScheduler& scheduler; vk::SwapchainKHR swapchain; @@ -81,7 +82,7 @@ private: std::vector<VkImage> images; std::vector<vk::ImageView> image_views; std::vector<vk::Framebuffer> framebuffers; - std::vector<VKFence*> fences; + std::vector<u64> resource_ticks; std::vector<vk::Semaphore> present_semaphores; u32 image_index{}; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index de4c23120..f2c8f2ae1 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -10,11 +10,9 @@ #include <variant> #include <vector> -#include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" #include "core/core.h" -#include "core/memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/morton.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" @@ -26,7 +24,6 @@ #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/surface.h" -#include "video_core/textures/convert.h" namespace Vulkan { @@ -98,17 +95,18 @@ VkImageViewType GetImageViewType(SurfaceTarget target) { vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params, std::size_t host_memory_size) { // TODO(Rodrigo): Move texture buffer creation to the buffer cache - VkBufferCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.size = static_cast<VkDeviceSize>(host_memory_size); - ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; - return device.GetLogical().CreateBuffer(ci); + return device.GetLogical().CreateBuffer({ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = static_cast<VkDeviceSize>(host_memory_size), + .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); } VkBufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, @@ -116,15 +114,16 @@ VkBufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, std::size_t host_memory_size) { ASSERT(params.IsBuffer()); - VkBufferViewCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.buffer = buffer; - ci.format = MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format; - ci.offset = 0; - ci.range = static_cast<VkDeviceSize>(host_memory_size); - return ci; + return { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .buffer = buffer, + .format = + MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format, + .offset = 0, + .range = static_cast<VkDeviceSize>(host_memory_size), + }; } VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceParams& params) { @@ -133,23 +132,24 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP const auto [format, 
attachable, storage] = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.pixel_format); - VkImageCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.imageType = SurfaceTargetToImage(params.target); - ci.format = format; - ci.mipLevels = params.num_levels; - ci.arrayLayers = static_cast<u32>(params.GetNumLayers()); - ci.samples = VK_SAMPLE_COUNT_1_BIT; - ci.tiling = VK_IMAGE_TILING_OPTIMAL; - ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ci.queueFamilyIndexCount = 0; - ci.pQueueFamilyIndices = nullptr; - ci.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - - ci.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | - VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + VkImageCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .imageType = SurfaceTargetToImage(params.target), + .format = format, + .extent = {}, + .mipLevels = params.num_levels, + .arrayLayers = static_cast<u32>(params.GetNumLayers()), + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }; if (attachable) { ci.usage |= params.IsPixelFormatZeta() ? VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT : VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; @@ -170,6 +170,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP ci.extent = {params.width, params.height, 1}; break; case SurfaceTarget::Texture3D: + ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; ci.extent = {params.width, params.height, params.depth}; break; case SurfaceTarget::TextureBuffer: @@ -179,14 +180,18 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP return ci; } +u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) { + return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | + (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +} + } // Anonymous namespace -CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, +CachedSurface::CachedSurface(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, VKStagingBufferPool& staging_pool, GPUVAddr gpu_addr, const SurfaceParams& params) - : SurfaceBase<View>{gpu_addr, params, device.IsOptimalAstcSupported()}, system{system}, - device{device}, resource_manager{resource_manager}, + : SurfaceBase<View>{gpu_addr, params, device.IsOptimalAstcSupported()}, device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{staging_pool} { if (params.IsBuffer()) { buffer = CreateBuffer(device, params, host_memory_size); @@ -206,9 +211,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, } // TODO(Rodrigo): Move this to a virtual function. 
- main_view = CreateViewInner( - ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels), - true); + u32 num_layers = 1; + if (params.is_layered || params.target == SurfaceTarget::Texture3D) { + num_layers = params.depth; + } + main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels)); } CachedSurface::~CachedSurface() = default; @@ -227,7 +234,7 @@ void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { UNIMPLEMENTED_IF(params.IsBuffer()); - if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5_UNORM) { LOG_WARNING(Render_Vulkan, "A1B5G5R5 flushing is stubbed"); } @@ -256,12 +263,8 @@ void CachedSurface::DecorateSurfaceName() { } View CachedSurface::CreateView(const ViewParams& params) { - return CreateViewInner(params, false); -} - -View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) { // TODO(Rodrigo): Add name decorations - return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy); + return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params); } void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { @@ -279,12 +282,10 @@ void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { VkBufferMemoryBarrier barrier; barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; barrier.pNext = nullptr; - barrier.srcAccessMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - barrier.dstAccessMask = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT; - barrier.srcQueueFamilyIndex = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstQueueFamilyIndex = VK_ACCESS_SHADER_READ_BIT; - barrier.srcQueueFamilyIndex = 0; - barrier.dstQueueFamilyIndex = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; // They'll be ignored anyway + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.buffer = dst_buffer; barrier.offset = 0; barrier.size = size; @@ -321,22 +322,25 @@ void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { } VkBufferImageCopy CachedSurface::GetBufferImageCopy(u32 level) const { - VkBufferImageCopy copy; - copy.bufferOffset = params.GetHostMipmapLevelOffset(level, is_converted); - copy.bufferRowLength = 0; - copy.bufferImageHeight = 0; - copy.imageSubresource.aspectMask = image->GetAspectMask(); - copy.imageSubresource.mipLevel = level; - copy.imageSubresource.baseArrayLayer = 0; - copy.imageSubresource.layerCount = static_cast<u32>(params.GetNumLayers()); - copy.imageOffset.x = 0; - copy.imageOffset.y = 0; - copy.imageOffset.z = 0; - copy.imageExtent.width = params.GetMipWidth(level); - copy.imageExtent.height = params.GetMipHeight(level); - copy.imageExtent.depth = - params.target == SurfaceTarget::Texture3D ? 
params.GetMipDepth(level) : 1; - return copy; + return { + .bufferOffset = params.GetHostMipmapLevelOffset(level, is_converted), + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = + { + .aspectMask = image->GetAspectMask(), + .mipLevel = level, + .baseArrayLayer = 0, + .layerCount = static_cast<u32>(params.GetNumLayers()), + }, + .imageOffset = {.x = 0, .y = 0, .z = 0}, + .imageExtent = + { + .width = params.GetMipWidth(level), + .height = params.GetMipHeight(level), + .depth = params.target == SurfaceTarget::Texture3D ? params.GetMipDepth(level) : 1U, + }, + }; } VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { @@ -345,38 +349,44 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { } CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy) + const ViewParams& params) : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, - base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, - num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) - : VK_IMAGE_VIEW_TYPE_1D} {} + base_level{params.base_level}, num_levels{params.num_levels}, + image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} { + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + base_layer = 0; + num_layers = 1; + base_slice = params.base_layer; + num_slices = params.num_layers; + } else { + base_layer = params.base_layer; + num_layers = params.num_layers; + } +} CachedSurfaceView::~CachedSurfaceView() = default; -VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { - const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (last_image_view && last_swizzle == swizzle) { +VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { + const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (last_image_view && last_swizzle == new_swizzle) { return last_image_view; } - last_swizzle = swizzle; + last_swizzle = new_swizzle; - const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle); + const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); auto& image_view = entry->second; if (!is_cache_miss) { return last_image_view = *image_view; } - auto swizzle_x = MaxwellToVK::SwizzleSource(x_source); - auto swizzle_y = MaxwellToVK::SwizzleSource(y_source); - auto swizzle_z = MaxwellToVK::SwizzleSource(z_source); - auto swizzle_w = MaxwellToVK::SwizzleSource(w_source); - - if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source), + MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)}; + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5_UNORM) { // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here. - std::swap(swizzle_x, swizzle_z); + std::swap(swizzle[0], swizzle[2]); } // Games can sample depth or stencil values on textures. 
This is decided by the swizzle value on @@ -386,11 +396,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); const bool is_first = x_source == SwizzleSource::R; switch (params.pixel_format) { - case VideoCore::Surface::PixelFormat::Z24S8: - case VideoCore::Surface::PixelFormat::Z32FS8: + case VideoCore::Surface::PixelFormat::D24_UNORM_S8_UINT: + case VideoCore::Surface::PixelFormat::D32_FLOAT_S8_UINT: aspect = is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; break; - case VideoCore::Surface::PixelFormat::S8Z24: + case VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM: aspect = is_first ? VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; break; default: @@ -398,44 +408,100 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y UNIMPLEMENTED(); } - // Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity - swizzle_x = VK_COMPONENT_SWIZZLE_R; - swizzle_y = VK_COMPONENT_SWIZZLE_G; - swizzle_z = VK_COMPONENT_SWIZZLE_B; - swizzle_w = VK_COMPONENT_SWIZZLE_A; + // Make sure we sample the first component + std::transform( + swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) { + return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component; + }); } - VkImageViewCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.image = surface.GetImageHandle(); - ci.viewType = image_view_type; - ci.format = surface.GetImage().GetFormat(); - ci.components = {swizzle_x, swizzle_y, swizzle_z, swizzle_w}; - ci.subresourceRange.aspectMask = aspect; - ci.subresourceRange.baseMipLevel = base_level; - ci.subresourceRange.levelCount = num_levels; - ci.subresourceRange.baseArrayLayer = base_layer; - ci.subresourceRange.layerCount = num_layers; - image_view = device.GetLogical().CreateImageView(ci); + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ASSERT(base_slice == 0); + ASSERT(num_slices == params.depth); + } + + image_view = device.GetLogical().CreateImageView({ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = surface.GetImageHandle(), + .viewType = image_view_type, + .format = surface.GetImage().GetFormat(), + .components = + { + .r = swizzle[0], + .g = swizzle[1], + .b = swizzle[2], + .a = swizzle[3], + }, + .subresourceRange = + { + .aspectMask = aspect, + .baseMipLevel = base_level, + .levelCount = num_levels, + .baseArrayLayer = base_layer, + .layerCount = num_layers, + }, + }); return last_image_view = *image_view; } -VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, VKScheduler& scheduler, - VKStagingBufferPool& staging_pool) - : TextureCache(system, rasterizer, device.IsOptimalAstcSupported()), device{device}, - resource_manager{resource_manager}, memory_manager{memory_manager}, scheduler{scheduler}, - staging_pool{staging_pool} {} +VkImageView CachedSurfaceView::GetAttachment() { + if (render_target) { + return *render_target; + } + + VkImageViewCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = surface.GetImageHandle(), + .viewType = VK_IMAGE_VIEW_TYPE_1D, + .format = surface.GetImage().GetFormat(), + .components = + { + .r = 
VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = aspect_mask, + .baseMipLevel = base_level, + .levelCount = num_levels, + .baseArrayLayer = 0, + .layerCount = 0, + }, + }; + if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { + ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D; + ci.subresourceRange.baseArrayLayer = base_slice; + ci.subresourceRange.layerCount = num_slices; + } else { + ci.viewType = image_view_type; + ci.subresourceRange.baseArrayLayer = base_layer; + ci.subresourceRange.layerCount = num_layers; + } + render_target = device.GetLogical().CreateImageView(ci); + return *render_target; +} + +VKTextureCache::VKTextureCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::MemoryManager& gpu_memory, const VKDevice& device_, + VKMemoryManager& memory_manager_, VKScheduler& scheduler_, + VKStagingBufferPool& staging_pool_) + : TextureCache(rasterizer, maxwell3d, gpu_memory, device_.IsOptimalAstcSupported()), + device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_}, staging_pool{ + staging_pool_} {} VKTextureCache::~VKTextureCache() = default; Surface VKTextureCache::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { - return std::make_shared<CachedSurface>(system, device, resource_manager, memory_manager, - scheduler, staging_pool, gpu_addr, params); + return std::make_shared<CachedSurface>(device, memory_manager, scheduler, staging_pool, + gpu_addr, params); } void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, @@ -462,24 +528,40 @@ void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - VkImageCopy copy; - copy.srcSubresource.aspectMask = src_surface->GetAspectMask(); - copy.srcSubresource.mipLevel = copy_params.source_level; - copy.srcSubresource.baseArrayLayer = copy_params.source_z; - copy.srcSubresource.layerCount = num_layers; - copy.srcOffset.x = copy_params.source_x; - copy.srcOffset.y = copy_params.source_y; - copy.srcOffset.z = 0; - copy.dstSubresource.aspectMask = dst_surface->GetAspectMask(); - copy.dstSubresource.mipLevel = copy_params.dest_level; - copy.dstSubresource.baseArrayLayer = dst_base_layer; - copy.dstSubresource.layerCount = num_layers; - copy.dstOffset.x = copy_params.dest_x; - copy.dstOffset.y = copy_params.dest_y; - copy.dstOffset.z = dst_offset_z; - copy.extent.width = copy_params.width; - copy.extent.height = copy_params.height; - copy.extent.depth = extent_z; + const VkImageCopy copy{ + .srcSubresource = + { + .aspectMask = src_surface->GetAspectMask(), + .mipLevel = copy_params.source_level, + .baseArrayLayer = copy_params.source_z, + .layerCount = num_layers, + }, + .srcOffset = + { + .x = static_cast<s32>(copy_params.source_x), + .y = static_cast<s32>(copy_params.source_y), + .z = 0, + }, + .dstSubresource = + { + .aspectMask = dst_surface->GetAspectMask(), + .mipLevel = copy_params.dest_level, + .baseArrayLayer = dst_base_layer, + .layerCount = num_layers, + }, + .dstOffset = + { + .x = static_cast<s32>(copy_params.dest_x), + .y = static_cast<s32>(copy_params.dest_y), + .z = static_cast<s32>(dst_offset_z), + }, + .extent = + { + .width = copy_params.width, + .height = copy_params.height, + .depth = extent_z, + }, + }; const VkImage src_image = 
src_surface->GetImageHandle(); const VkImage dst_image = dst_surface->GetImageHandle(); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 115595f28..39202feba 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -7,23 +7,13 @@ #include <memory> #include <unordered_map> -#include "common/assert.h" #include "common/common_types.h" -#include "common/logging/log.h" -#include "common/math_util.h" -#include "video_core/gpu.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/vk_image.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/texture_cache/surface_base.h" #include "video_core/texture_cache/texture_cache.h" -#include "video_core/textures/decoders.h" - -namespace Core { -class System; -} namespace VideoCore { class RasterizerInterface; @@ -33,7 +23,6 @@ namespace Vulkan { class RasterizerVulkan; class VKDevice; -class VKResourceManager; class VKScheduler; class VKStagingBufferPool; @@ -51,8 +40,7 @@ class CachedSurface final : public VideoCommon::SurfaceBase<View> { friend CachedSurfaceView; public: - explicit CachedSurface(Core::System& system, const VKDevice& device, - VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + explicit CachedSurface(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler, VKStagingBufferPool& staging_pool, GPUVAddr gpu_addr, const SurfaceParams& params); ~CachedSurface(); @@ -97,7 +85,6 @@ protected: void DecorateSurfaceName(); View CreateView(const ViewParams& params) override; - View CreateViewInner(const ViewParams& params, bool is_proxy); private: void UploadBuffer(const std::vector<u8>& staging_buffer); @@ -108,9 +95,7 @@ private: VkImageSubresourceRange GetImageSubresourceRange() const; - Core::System& system; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; VKScheduler& scheduler; VKStagingBufferPool& staging_pool; @@ -126,23 +111,20 @@ private: class CachedSurfaceView final : public VideoCommon::ViewBase { public: explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params, bool is_proxy); + const ViewParams& params); ~CachedSurfaceView(); - VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source); + VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source, + Tegra::Texture::SwizzleSource y_source, + Tegra::Texture::SwizzleSource z_source, + Tegra::Texture::SwizzleSource w_source); + + VkImageView GetAttachment(); bool IsSameSurface(const CachedSurfaceView& rhs) const { return &surface == &rhs.surface; } - VkImageView GetHandle() { - return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G, - Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A); - } - u32 GetWidth() const { return params.GetMipWidth(base_level); } @@ -186,14 +168,6 @@ public: } private: - static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source) { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - 
(static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); - } - // Store a copy of these values to avoid double dereference when reading them const SurfaceParams params; const VkImage image; @@ -202,24 +176,27 @@ private: const VKDevice& device; CachedSurface& surface; - const u32 base_layer; - const u32 num_layers; const u32 base_level; const u32 num_levels; const VkImageViewType image_view_type; + u32 base_layer = 0; + u32 num_layers = 0; + u32 base_slice = 0; + u32 num_slices = 0; VkImageView last_image_view = nullptr; u32 last_swizzle = 0; + vk::ImageView render_target; std::unordered_map<u32, vk::ImageView> view_cache; }; class VKTextureCache final : public TextureCacheBase { public: - explicit VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - const VKDevice& device, VKResourceManager& resource_manager, - VKMemoryManager& memory_manager, VKScheduler& scheduler, - VKStagingBufferPool& staging_pool); + explicit VKTextureCache(VideoCore::RasterizerInterface& rasterizer, + Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, + const VKDevice& device, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool); ~VKTextureCache(); private: @@ -234,7 +211,6 @@ private: void BufferCopy(Surface& src_surface, Surface& dst_surface) override; const VKDevice& device; - VKResourceManager& resource_manager; VKMemoryManager& memory_manager; VKScheduler& scheduler; VKStagingBufferPool& staging_pool; diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 4bfec0077..351c048d2 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -24,34 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() { } void VKUpdateDescriptorQueue::Acquire() { - entries.clear(); -} + // Minimum number of entries required. + // This is the maximum number of entries a single draw call might use.
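The reservation that follows guarantees the payload vector never reallocates between Acquire() recording upload_start and the deferred UpdateDescriptorSet reading it. A minimal standalone sketch of that reserve-then-defer pattern follows; DeferredWriter, Entry and Flush are invented names for illustration, and std::vector plus reserve() stands in for the boost static_vector used by the real queue.

// Sketch only: pointer stability comes from reserving capacity up front and
// flushing before a batch could ever force a reallocation.
#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

struct Entry {
    int value;
};

class DeferredWriter {
public:
    static constexpr std::size_t MIN_ENTRIES = 0x400;
    static constexpr std::size_t CAPACITY = 0x10000;

    DeferredWriter() {
        payload.reserve(CAPACITY);
    }

    // Acquire: make sure the next MIN_ENTRIES pushes cannot reallocate, then
    // remember where this batch starts.
    void Acquire() {
        if (payload.size() + MIN_ENTRIES >= payload.capacity()) {
            Flush(); // stand-in for scheduler.WaitWorker()
            payload.clear();
        }
        batch_start = payload.data() + payload.size();
    }

    void Add(int value) {
        payload.push_back(Entry{value});
    }

    // Send: capture only the stable pointer; the consumer runs later.
    void Send(std::function<void(const Entry*)> consumer) {
        deferred.push_back([ptr = batch_start, consumer] { consumer(ptr); });
    }

    void Flush() {
        for (const auto& func : deferred) {
            func();
        }
        deferred.clear();
    }

private:
    std::vector<Entry> payload;
    const Entry* batch_start = nullptr;
    std::vector<std::function<void()>> deferred;
};

int main() {
    DeferredWriter queue;
    queue.Acquire();
    queue.Add(1);
    queue.Add(2);
    queue.Send([](const Entry* entries) { assert(entries[0].value == 1 && entries[1].value == 2); });
    queue.Flush();
}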
+ static constexpr std::size_t MIN_ENTRIES = 0x400; -void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, - VkDescriptorSet set) { - if (payload.size() + entries.size() >= payload.max_size()) { + if (payload.size() + MIN_ENTRIES >= payload.max_size()) { LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread"); scheduler.WaitWorker(); payload.clear(); } + upload_start = &*payload.end(); +} - const auto payload_start = payload.data() + payload.size(); - for (const auto& entry : entries) { - if (const auto image = std::get_if<VkDescriptorImageInfo>(&entry)) { - payload.push_back(*image); - } else if (const auto buffer = std::get_if<Buffer>(&entry)) { - payload.emplace_back(*buffer->buffer, buffer->offset, buffer->size); - } else if (const auto texel = std::get_if<VkBufferView>(&entry)) { - payload.push_back(*texel); - } else { - UNREACHABLE(); - } - } - - scheduler.Record( - [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) { - logical->UpdateDescriptorSet(set, update_template, payload_start); - }); +void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template, + VkDescriptorSet set) { + const void* const data = upload_start; + const vk::Device* const logical = &device.GetLogical(); + scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) { + logical->UpdateDescriptorSet(set, update_template, data); + }); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index a9e3d5dba..945320c72 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -4,7 +4,6 @@ #pragma once -#include <type_traits> #include <variant> #include <boost/container/static_vector.hpp> @@ -16,18 +15,13 @@ namespace Vulkan { class VKDevice; class VKScheduler; -class DescriptorUpdateEntry { -public: - explicit DescriptorUpdateEntry() : image{} {} - - DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {} +struct DescriptorUpdateEntry { + DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {} - DescriptorUpdateEntry(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size) - : buffer{buffer, offset, size} {} + DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {} - DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {} + DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {} -private: union { VkDescriptorImageInfo image; VkDescriptorBufferInfo buffer; @@ -47,37 +41,34 @@ public: void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set); void AddSampledImage(VkSampler sampler, VkImageView image_view) { - entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); } void AddImage(VkImageView image_view) { - entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); } - void AddBuffer(const VkBuffer* buffer, u64 offset, std::size_t size) { - entries.push_back(Buffer{buffer, offset, size}); + void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) { + payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); } void AddTexelBuffer(VkBufferView texel_buffer) { - entries.emplace_back(texel_buffer); + payload.emplace_back(texel_buffer); } - VkImageLayout* 
GetLastImageLayout() { - return &std::get<VkDescriptorImageInfo>(entries.back()).imageLayout; + VkImageLayout* LastImageLayout() { + return &payload.back().image.imageLayout; } -private: - struct Buffer { - const VkBuffer* buffer = nullptr; - u64 offset = 0; - std::size_t size = 0; - }; - using Variant = std::variant<VkDescriptorImageInfo, Buffer, VkBufferView>; + const VkImageLayout* LastImageLayout() const { + return &payload.back().image.imageLayout; + } +private: const VKDevice& device; VKScheduler& scheduler; - boost::container::static_vector<Variant, 0x400> entries; + const DescriptorUpdateEntry* upload_start = nullptr; boost::container::static_vector<DescriptorUpdateEntry, 0x10000> payload; }; diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp index 9b94dfff1..4e83303d8 100644 --- a/src/video_core/renderer_vulkan/wrapper.cpp +++ b/src/video_core/renderer_vulkan/wrapper.cpp @@ -2,13 +2,16 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <algorithm> #include <exception> #include <memory> #include <optional> +#include <string_view> #include <utility> #include <vector> #include "common/common_types.h" +#include "common/logging/log.h" #include "video_core/renderer_vulkan/wrapper.h" @@ -16,6 +19,44 @@ namespace Vulkan::vk { namespace { +template <typename Func> +void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld, + Func&& func) { + // Calling GetProperties calls Vulkan more than needed. But they are supposed to be cheap + // functions. + std::stable_sort(devices.begin(), devices.end(), + [&dld, &func](VkPhysicalDevice lhs, VkPhysicalDevice rhs) { + return func(vk::PhysicalDevice(lhs, dld).GetProperties(), + vk::PhysicalDevice(rhs, dld).GetProperties()); + }); +} + +void SortPhysicalDevicesPerVendor(std::vector<VkPhysicalDevice>& devices, + const InstanceDispatch& dld, + std::initializer_list<u32> vendor_ids) { + for (auto it = vendor_ids.end(); it != vendor_ids.begin();) { + --it; + SortPhysicalDevices(devices, dld, [id = *it](const auto& lhs, const auto& rhs) { + return lhs.vendorID == id && rhs.vendorID != id; + }); + } +} + +void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld) { + // Sort by name, this will set a base and make GPUs with higher numbers appear first + // (e.g. GTX 1650 will intentionally be listed before a GTX 1080). + SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) { + return std::string_view{lhs.deviceName} > std::string_view{rhs.deviceName}; + }); + // Prefer discrete over non-discrete + SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) { + return lhs.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU && + rhs.deviceType != VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU; + }); + // Prefer Nvidia over AMD, AMD over Intel, Intel over the rest. 
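The vendor pass that follows works because std::stable_sort preserves the order produced by the earlier name and device-type passes: each later pass only partitions by its own criterion, and everything sorted before acts as a tie-breaker, while iterating the vendor list from the back makes the most preferred vendor the last and therefore strongest pass. A self-contained illustration of that layering, with invented device records rather than real VkPhysicalDeviceProperties:

// Layered stable sorts: the last pass wins, earlier passes break ties.
// The Gpu records below are made up for the example.
#include <algorithm>
#include <iostream>
#include <string_view>
#include <vector>

struct Gpu {
    std::string_view name;
    bool discrete;
    unsigned vendor_id;
};

int main() {
    std::vector<Gpu> gpus{
        {"Generic GPU", false, 0x1234},
        {"GTX 1080", true, 0x10DE},
        {"GTX 1650", true, 0x10DE},
        {"Radeon RX 580", true, 0x1002},
    };
    // Pass 1 (weakest): name, descending.
    std::stable_sort(gpus.begin(), gpus.end(),
                     [](const Gpu& lhs, const Gpu& rhs) { return lhs.name > rhs.name; });
    // Pass 2: discrete before integrated.
    std::stable_sort(gpus.begin(), gpus.end(),
                     [](const Gpu& lhs, const Gpu& rhs) { return lhs.discrete && !rhs.discrete; });
    // Pass 3 (strongest): preferred vendors, applied from least to most preferred.
    for (unsigned id : {0x8086u, 0x1002u, 0x10DEu}) {
        std::stable_sort(gpus.begin(), gpus.end(), [id](const Gpu& lhs, const Gpu& rhs) {
            return lhs.vendor_id == id && rhs.vendor_id != id;
        });
    }
    for (const Gpu& gpu : gpus) {
        std::cout << gpu.name << '\n';
    }
    // Expected order: GTX 1650, GTX 1080, Radeon RX 580, Generic GPU.
}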
+ SortPhysicalDevicesPerVendor(devices, dld, {0x10DE, 0x1002, 0x8086}); +} + template <typename T> bool Proc(T& result, const InstanceDispatch& dld, const char* proc_name, VkInstance instance = nullptr) noexcept { @@ -61,14 +102,25 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdPipelineBarrier); X(vkCmdPushConstants); X(vkCmdSetBlendConstants); - X(vkCmdSetCheckpointNV); X(vkCmdSetDepthBias); X(vkCmdSetDepthBounds); + X(vkCmdSetEvent); X(vkCmdSetScissor); X(vkCmdSetStencilCompareMask); X(vkCmdSetStencilReference); X(vkCmdSetStencilWriteMask); X(vkCmdSetViewport); + X(vkCmdWaitEvents); + X(vkCmdBindVertexBuffers2EXT); + X(vkCmdSetCullModeEXT); + X(vkCmdSetDepthBoundsTestEnableEXT); + X(vkCmdSetDepthCompareOpEXT); + X(vkCmdSetDepthTestEnableEXT); + X(vkCmdSetDepthWriteEnableEXT); + X(vkCmdSetFrontFaceEXT); + X(vkCmdSetPrimitiveTopologyEXT); + X(vkCmdSetStencilOpEXT); + X(vkCmdSetStencilTestEnableEXT); X(vkCreateBuffer); X(vkCreateBufferView); X(vkCreateCommandPool); @@ -76,6 +128,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCreateDescriptorPool); X(vkCreateDescriptorSetLayout); X(vkCreateDescriptorUpdateTemplateKHR); + X(vkCreateEvent); X(vkCreateFence); X(vkCreateFramebuffer); X(vkCreateGraphicsPipelines); @@ -94,6 +147,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkDestroyDescriptorPool); X(vkDestroyDescriptorSetLayout); X(vkDestroyDescriptorUpdateTemplateKHR); + X(vkDestroyEvent); X(vkDestroyFence); X(vkDestroyFramebuffer); X(vkDestroyImage); @@ -113,10 +167,11 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkFreeMemory); X(vkGetBufferMemoryRequirements); X(vkGetDeviceQueue); + X(vkGetEventStatus); X(vkGetFenceStatus); X(vkGetImageMemoryRequirements); X(vkGetQueryPoolResults); - X(vkGetQueueCheckpointDataNV); + X(vkGetSemaphoreCounterValueKHR); X(vkMapMemory); X(vkQueueSubmit); X(vkResetFences); @@ -125,6 +180,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkUpdateDescriptorSetWithTemplateKHR); X(vkUpdateDescriptorSets); X(vkWaitForFences); + X(vkWaitSemaphoresKHR); #undef X } @@ -132,7 +188,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { bool Load(InstanceDispatch& dld) noexcept { #define X(name) Proc(dld.name, dld, #name) - return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties); + return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) && + X(vkEnumerateInstanceLayerProperties); #undef X } @@ -230,6 +287,22 @@ const char* ToString(VkResult result) noexcept { return "VK_ERROR_INVALID_DEVICE_ADDRESS_EXT"; case VkResult::VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT: return "VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT"; + case VkResult::VK_ERROR_UNKNOWN: + return "VK_ERROR_UNKNOWN"; + case VkResult::VK_ERROR_INCOMPATIBLE_VERSION_KHR: + return "VK_ERROR_INCOMPATIBLE_VERSION_KHR"; + case VkResult::VK_THREAD_IDLE_KHR: + return "VK_THREAD_IDLE_KHR"; + case VkResult::VK_THREAD_DONE_KHR: + return "VK_THREAD_DONE_KHR"; + case VkResult::VK_OPERATION_DEFERRED_KHR: + return "VK_OPERATION_DEFERRED_KHR"; + case VkResult::VK_OPERATION_NOT_DEFERRED_KHR: + return "VK_OPERATION_NOT_DEFERRED_KHR"; + case VkResult::VK_PIPELINE_COMPILE_REQUIRED_EXT: + return "VK_PIPELINE_COMPILE_REQUIRED_EXT"; + case VkResult::VK_RESULT_MAX_ENUM: + return "VK_RESULT_MAX_ENUM"; } return "Unknown"; } @@ -271,6 +344,10 @@ void Destroy(VkDevice device, VkDeviceMemory handle, const DeviceDispatch& dld) dld.vkFreeMemory(device, handle, nullptr); } +void Destroy(VkDevice device, 
VkEvent handle, const DeviceDispatch& dld) noexcept { + dld.vkDestroyEvent(device, handle, nullptr); +} + void Destroy(VkDevice device, VkFence handle, const DeviceDispatch& dld) noexcept { dld.vkDestroyFence(device, handle, nullptr); } @@ -339,26 +416,27 @@ VkResult Free(VkDevice device, VkCommandPool handle, Span<VkCommandBuffer> buffe return VK_SUCCESS; } -Instance Instance::Create(Span<const char*> layers, Span<const char*> extensions, +Instance Instance::Create(u32 version, Span<const char*> layers, Span<const char*> extensions, InstanceDispatch& dld) noexcept { - VkApplicationInfo application_info; - application_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - application_info.pNext = nullptr; - application_info.pApplicationName = "yuzu Emulator"; - application_info.applicationVersion = VK_MAKE_VERSION(0, 1, 0); - application_info.pEngineName = "yuzu Emulator"; - application_info.engineVersion = VK_MAKE_VERSION(0, 1, 0); - application_info.apiVersion = VK_API_VERSION_1_1; - - VkInstanceCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; - ci.pApplicationInfo = &application_info; - ci.enabledLayerCount = layers.size(); - ci.ppEnabledLayerNames = layers.data(); - ci.enabledExtensionCount = extensions.size(); - ci.ppEnabledExtensionNames = extensions.data(); + const VkApplicationInfo application_info{ + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .pNext = nullptr, + .pApplicationName = "yuzu Emulator", + .applicationVersion = VK_MAKE_VERSION(0, 1, 0), + .pEngineName = "yuzu Emulator", + .engineVersion = VK_MAKE_VERSION(0, 1, 0), + .apiVersion = version, + }; + const VkInstanceCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .pApplicationInfo = &application_info, + .enabledLayerCount = layers.size(), + .ppEnabledLayerNames = layers.data(), + .enabledExtensionCount = extensions.size(), + .ppEnabledExtensionNames = extensions.data(), + }; VkInstance instance; if (dld.vkCreateInstance(&ci, nullptr, &instance) != VK_SUCCESS) { @@ -383,24 +461,26 @@ std::optional<std::vector<VkPhysicalDevice>> Instance::EnumeratePhysicalDevices( if (dld->vkEnumeratePhysicalDevices(handle, &num, physical_devices.data()) != VK_SUCCESS) { return std::nullopt; } - return physical_devices; + SortPhysicalDevices(physical_devices, *dld); + return std::make_optional(std::move(physical_devices)); } DebugCallback Instance::TryCreateDebugCallback( PFN_vkDebugUtilsMessengerCallbackEXT callback) noexcept { - VkDebugUtilsMessengerCreateInfoEXT ci; - ci.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; - ci.pNext = nullptr; - ci.flags = 0; - ci.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT; - ci.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; - ci.pfnUserCallback = callback; - ci.pUserData = nullptr; + const VkDebugUtilsMessengerCreateInfoEXT ci{ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, + .pNext = nullptr, + .flags = 0, + .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT, + .messageType = 
VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + .pfnUserCallback = callback, + .pUserData = nullptr, + }; VkDebugUtilsMessengerEXT messenger; if (dld->vkCreateDebugUtilsMessengerEXT(handle, &ci, nullptr, &messenger) != VK_SUCCESS) { @@ -409,17 +489,6 @@ DebugCallback Instance::TryCreateDebugCallback( return DebugCallback(messenger, handle, *dld); } -std::vector<VkCheckpointDataNV> Queue::GetCheckpointDataNV(const DeviceDispatch& dld) const { - if (!dld.vkGetQueueCheckpointDataNV) { - return {}; - } - u32 num; - dld.vkGetQueueCheckpointDataNV(queue, &num, nullptr); - std::vector<VkCheckpointDataNV> checkpoints(num); - dld.vkGetQueueCheckpointDataNV(queue, &num, checkpoints.data()); - return checkpoints; -} - void Buffer::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { Check(dld->vkBindBufferMemory(owner, handle, memory, offset)); } @@ -442,12 +511,13 @@ DescriptorSets DescriptorPool::Allocate(const VkDescriptorSetAllocateInfo& ai) c } CommandBuffers CommandPool::Allocate(std::size_t num_buffers, VkCommandBufferLevel level) const { - VkCommandBufferAllocateInfo ai; - ai.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - ai.pNext = nullptr; - ai.commandPool = handle; - ai.level = level; - ai.commandBufferCount = static_cast<u32>(num_buffers); + const VkCommandBufferAllocateInfo ai{ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .pNext = nullptr, + .commandPool = handle, + .level = level, + .commandBufferCount = static_cast<u32>(num_buffers), + }; std::unique_ptr buffers = std::make_unique<VkCommandBuffer[]>(num_buffers); switch (const VkResult result = dld->vkAllocateCommandBuffers(owner, &ai, buffers.get())) { @@ -469,20 +539,20 @@ std::vector<VkImage> SwapchainKHR::GetImages() const { } Device Device::Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreateInfo> queues_ci, - Span<const char*> enabled_extensions, - const VkPhysicalDeviceFeatures2& enabled_features, + Span<const char*> enabled_extensions, const void* next, DeviceDispatch& dld) noexcept { - VkDeviceCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - ci.pNext = &enabled_features; - ci.flags = 0; - ci.queueCreateInfoCount = queues_ci.size(); - ci.pQueueCreateInfos = queues_ci.data(); - ci.enabledLayerCount = 0; - ci.ppEnabledLayerNames = nullptr; - ci.enabledExtensionCount = enabled_extensions.size(); - ci.ppEnabledExtensionNames = enabled_extensions.data(); - ci.pEnabledFeatures = nullptr; + const VkDeviceCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .pNext = next, + .flags = 0, + .queueCreateInfoCount = queues_ci.size(), + .pQueueCreateInfos = queues_ci.data(), + .enabledLayerCount = 0, + .ppEnabledLayerNames = nullptr, + .enabledExtensionCount = enabled_extensions.size(), + .ppEnabledExtensionNames = enabled_extensions.data(), + .pEnabledFeatures = nullptr, + }; VkDevice device; if (dld.vkCreateDevice(physical_device, &ci, nullptr, &device) != VK_SUCCESS) { @@ -523,11 +593,15 @@ ImageView Device::CreateImageView(const VkImageViewCreateInfo& ci) const { } Semaphore Device::CreateSemaphore() const { - VkSemaphoreCreateInfo ci; - ci.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - ci.pNext = nullptr; - ci.flags = 0; + static constexpr VkSemaphoreCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + }; + return CreateSemaphore(ci); +} +Semaphore Device::CreateSemaphore(const VkSemaphoreCreateInfo& 
ci) const { VkSemaphore object; Check(dld->vkCreateSemaphore(handle, &ci, nullptr, &object)); return Semaphore(object, handle, *dld); @@ -613,6 +687,18 @@ ShaderModule Device::CreateShaderModule(const VkShaderModuleCreateInfo& ci) cons return ShaderModule(object, handle, *dld); } +Event Device::CreateEvent() const { + static constexpr VkEventCreateInfo ci{ + .sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + }; + + VkEvent object; + Check(dld->vkCreateEvent(handle, &ci, nullptr, &object)); + return Event(object, handle, *dld); +} + SwapchainKHR Device::CreateSwapchainKHR(const VkSwapchainCreateInfoKHR& ci) const { VkSwapchainKHR object; Check(dld->vkCreateSwapchainKHR(handle, &ci, nullptr, &object)); @@ -701,8 +787,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s return supported == VK_TRUE; } -VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const - noexcept { +VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const { VkSurfaceCapabilitiesKHR capabilities; Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities)); return capabilities; @@ -733,6 +818,21 @@ VkPhysicalDeviceMemoryProperties PhysicalDevice::GetMemoryProperties() const noe return properties; } +u32 AvailableVersion(const InstanceDispatch& dld) noexcept { + PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion; + if (!Proc(vkEnumerateInstanceVersion, dld, "vkEnumerateInstanceVersion")) { + // If the procedure is not found, Vulkan 1.0 is assumed + return VK_API_VERSION_1_0; + } + u32 version; + if (const VkResult result = vkEnumerateInstanceVersion(&version); result != VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "vkEnumerateInstanceVersion returned {}, assuming Vulkan 1.1", + ToString(result)); + return VK_API_VERSION_1_1; + } + return version; +} + std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld) { u32 num; @@ -747,4 +847,17 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp return properties; } +std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( + const InstanceDispatch& dld) { + u32 num; + if (dld.vkEnumerateInstanceLayerProperties(&num, nullptr) != VK_SUCCESS) { + return std::nullopt; + } + std::vector<VkLayerProperties> properties(num); + if (dld.vkEnumerateInstanceLayerProperties(&num, properties.data()) != VK_SUCCESS) { + return std::nullopt; + } + return properties; +} + } // namespace Vulkan::vk diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h index fb3657819..f64919623 100644 --- a/src/video_core/renderer_vulkan/wrapper.h +++ b/src/video_core/renderer_vulkan/wrapper.h @@ -141,6 +141,7 @@ struct InstanceDispatch { PFN_vkCreateInstance vkCreateInstance; PFN_vkDestroyInstance vkDestroyInstance; PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties; + PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties; PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT; PFN_vkCreateDevice vkCreateDevice; @@ -197,14 +198,25 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkCmdPipelineBarrier vkCmdPipelineBarrier; PFN_vkCmdPushConstants vkCmdPushConstants; PFN_vkCmdSetBlendConstants vkCmdSetBlendConstants; - PFN_vkCmdSetCheckpointNV vkCmdSetCheckpointNV; PFN_vkCmdSetDepthBias vkCmdSetDepthBias; 
PFN_vkCmdSetDepthBounds vkCmdSetDepthBounds; + PFN_vkCmdSetEvent vkCmdSetEvent; PFN_vkCmdSetScissor vkCmdSetScissor; PFN_vkCmdSetStencilCompareMask vkCmdSetStencilCompareMask; PFN_vkCmdSetStencilReference vkCmdSetStencilReference; PFN_vkCmdSetStencilWriteMask vkCmdSetStencilWriteMask; PFN_vkCmdSetViewport vkCmdSetViewport; + PFN_vkCmdWaitEvents vkCmdWaitEvents; + PFN_vkCmdBindVertexBuffers2EXT vkCmdBindVertexBuffers2EXT; + PFN_vkCmdSetCullModeEXT vkCmdSetCullModeEXT; + PFN_vkCmdSetDepthBoundsTestEnableEXT vkCmdSetDepthBoundsTestEnableEXT; + PFN_vkCmdSetDepthCompareOpEXT vkCmdSetDepthCompareOpEXT; + PFN_vkCmdSetDepthTestEnableEXT vkCmdSetDepthTestEnableEXT; + PFN_vkCmdSetDepthWriteEnableEXT vkCmdSetDepthWriteEnableEXT; + PFN_vkCmdSetFrontFaceEXT vkCmdSetFrontFaceEXT; + PFN_vkCmdSetPrimitiveTopologyEXT vkCmdSetPrimitiveTopologyEXT; + PFN_vkCmdSetStencilOpEXT vkCmdSetStencilOpEXT; + PFN_vkCmdSetStencilTestEnableEXT vkCmdSetStencilTestEnableEXT; PFN_vkCreateBuffer vkCreateBuffer; PFN_vkCreateBufferView vkCreateBufferView; PFN_vkCreateCommandPool vkCreateCommandPool; @@ -212,6 +224,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkCreateDescriptorPool vkCreateDescriptorPool; PFN_vkCreateDescriptorSetLayout vkCreateDescriptorSetLayout; PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR; + PFN_vkCreateEvent vkCreateEvent; PFN_vkCreateFence vkCreateFence; PFN_vkCreateFramebuffer vkCreateFramebuffer; PFN_vkCreateGraphicsPipelines vkCreateGraphicsPipelines; @@ -230,6 +243,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkDestroyDescriptorPool vkDestroyDescriptorPool; PFN_vkDestroyDescriptorSetLayout vkDestroyDescriptorSetLayout; PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR; + PFN_vkDestroyEvent vkDestroyEvent; PFN_vkDestroyFence vkDestroyFence; PFN_vkDestroyFramebuffer vkDestroyFramebuffer; PFN_vkDestroyImage vkDestroyImage; @@ -249,10 +263,11 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkFreeMemory vkFreeMemory; PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements; PFN_vkGetDeviceQueue vkGetDeviceQueue; + PFN_vkGetEventStatus vkGetEventStatus; PFN_vkGetFenceStatus vkGetFenceStatus; PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements; PFN_vkGetQueryPoolResults vkGetQueryPoolResults; - PFN_vkGetQueueCheckpointDataNV vkGetQueueCheckpointDataNV; + PFN_vkGetSemaphoreCounterValueKHR vkGetSemaphoreCounterValueKHR; PFN_vkMapMemory vkMapMemory; PFN_vkQueueSubmit vkQueueSubmit; PFN_vkResetFences vkResetFences; @@ -261,6 +276,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR; PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets; PFN_vkWaitForFences vkWaitForFences; + PFN_vkWaitSemaphoresKHR vkWaitSemaphoresKHR; }; /// Loads instance agnostic function pointers. 
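Each new PFN_* member above is filled the same way as the existing ones: wrapper.cpp resolves every entry point by name through its X() macro, and entry points that belong to extensions the driver does not expose simply stay null. A generic, Vulkan-free sketch of that X-macro dispatch-table loader; Resolve(), Dispatch and the stub functions are stand-ins for vkGetDeviceProcAddr and the real PFN types:

// Generic sketch of the X-macro dispatch-table pattern.
// Resolve() stands in for vkGetDeviceProcAddr / vkGetInstanceProcAddr.
#include <cstdio>
#include <cstring>

using VoidFn = void (*)();

void StubDraw() { std::puts("draw"); }
void StubSubmit() { std::puts("submit"); }

// Returns nullptr for unknown names, like a missing extension would.
VoidFn Resolve(const char* name) {
    if (std::strcmp(name, "vkCmdDraw") == 0) return StubDraw;
    if (std::strcmp(name, "vkQueueSubmit") == 0) return StubSubmit;
    return nullptr;
}

struct Dispatch {
    void (*vkCmdDraw)() = nullptr;
    void (*vkQueueSubmit)() = nullptr;
    void (*vkCmdSetCullModeEXT)() = nullptr; // optional extension, may stay null
};

void Load(Dispatch& dld) {
    // Stringize each member name once; the cast adapts the generic pointer to
    // the member's own function-pointer type.
#define X(name) dld.name = reinterpret_cast<decltype(dld.name)>(Resolve(#name))
    X(vkCmdDraw);
    X(vkQueueSubmit);
    X(vkCmdSetCullModeEXT);
#undef X
}

int main() {
    Dispatch dld;
    Load(dld);
    dld.vkCmdDraw();
    dld.vkQueueSubmit();
    if (!dld.vkCmdSetCullModeEXT) {
        std::puts("extended dynamic state not available");
    }
}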
@@ -281,6 +297,7 @@ void Destroy(VkDevice, VkDescriptorPool, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkDescriptorSetLayout, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkDescriptorUpdateTemplateKHR, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkDeviceMemory, const DeviceDispatch&) noexcept; +void Destroy(VkDevice, VkEvent, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkFence, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkFramebuffer, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkImage, const DeviceDispatch&) noexcept; @@ -535,7 +552,6 @@ using PipelineLayout = Handle<VkPipelineLayout, VkDevice, DeviceDispatch>; using QueryPool = Handle<VkQueryPool, VkDevice, DeviceDispatch>; using RenderPass = Handle<VkRenderPass, VkDevice, DeviceDispatch>; using Sampler = Handle<VkSampler, VkDevice, DeviceDispatch>; -using Semaphore = Handle<VkSemaphore, VkDevice, DeviceDispatch>; using ShaderModule = Handle<VkShaderModule, VkDevice, DeviceDispatch>; using SurfaceKHR = Handle<VkSurfaceKHR, VkInstance, InstanceDispatch>; @@ -548,7 +564,7 @@ class Instance : public Handle<VkInstance, NoOwner, InstanceDispatch> { public: /// Creates a Vulkan instance. Use "operator bool" for error handling. - static Instance Create(Span<const char*> layers, Span<const char*> extensions, + static Instance Create(u32 version, Span<const char*> layers, Span<const char*> extensions, InstanceDispatch& dld) noexcept; /// Enumerates physical devices. @@ -567,12 +583,9 @@ public: /// Construct a queue handle. constexpr Queue(VkQueue queue, const DeviceDispatch& dld) noexcept : queue{queue}, dld{&dld} {} - /// Returns the checkpoint data. - /// @note Returns an empty vector when the function pointer is not present. - std::vector<VkCheckpointDataNV> GetCheckpointDataNV(const DeviceDispatch& dld) const; - - void Submit(Span<VkSubmitInfo> submit_infos, VkFence fence) const { - Check(dld->vkQueueSubmit(queue, submit_infos.size(), submit_infos.data(), fence)); + VkResult Submit(Span<VkSubmitInfo> submit_infos, + VkFence fence = VK_NULL_HANDLE) const noexcept { + return dld->vkQueueSubmit(queue, submit_infos.size(), submit_infos.data(), fence); } VkResult Present(const VkPresentInfoKHR& present_info) const noexcept { @@ -654,13 +667,59 @@ public: std::vector<VkImage> GetImages() const; }; +class Event : public Handle<VkEvent, VkDevice, DeviceDispatch> { + using Handle<VkEvent, VkDevice, DeviceDispatch>::Handle; + +public: + VkResult GetStatus() const noexcept { + return dld->vkGetEventStatus(owner, handle); + } +}; + +class Semaphore : public Handle<VkSemaphore, VkDevice, DeviceDispatch> { + using Handle<VkSemaphore, VkDevice, DeviceDispatch>::Handle; + +public: + [[nodiscard]] u64 GetCounter() const { + u64 value; + Check(dld->vkGetSemaphoreCounterValueKHR(owner, handle, &value)); + return value; + } + + /** + * Waits for a timeline semaphore on the host. 
+ * + * @param value Value to wait + * @param timeout Time in nanoseconds to timeout + * @return True on successful wait, false on timeout + */ + bool Wait(u64 value, u64 timeout = std::numeric_limits<u64>::max()) const { + const VkSemaphoreWaitInfoKHR wait_info{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR, + .pNext = nullptr, + .flags = 0, + .semaphoreCount = 1, + .pSemaphores = &handle, + .pValues = &value, + }; + const VkResult result = dld->vkWaitSemaphoresKHR(owner, &wait_info, timeout); + switch (result) { + case VK_SUCCESS: + return true; + case VK_TIMEOUT: + return false; + default: + throw Exception(result); + } + } +}; + class Device : public Handle<VkDevice, NoOwner, DeviceDispatch> { using Handle<VkDevice, NoOwner, DeviceDispatch>::Handle; public: static Device Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreateInfo> queues_ci, - Span<const char*> enabled_extensions, - const VkPhysicalDeviceFeatures2& enabled_features, + Span<const char*> enabled_extensions, const void* next, DeviceDispatch& dld) noexcept; Queue GetQueue(u32 family_index) const noexcept; @@ -675,6 +734,8 @@ public: Semaphore CreateSemaphore() const; + Semaphore CreateSemaphore(const VkSemaphoreCreateInfo& ci) const; + Fence CreateFence(const VkFenceCreateInfo& ci) const; DescriptorPool CreateDescriptorPool(const VkDescriptorPoolCreateInfo& ci) const; @@ -702,6 +763,8 @@ public: ShaderModule CreateShaderModule(const VkShaderModuleCreateInfo& ci) const; + Event CreateEvent() const; + SwapchainKHR CreateSwapchainKHR(const VkSwapchainCreateInfoKHR& ci) const; DeviceMemory TryAllocateMemory(const VkMemoryAllocateInfo& ai) const noexcept; @@ -734,18 +797,11 @@ public: dld->vkResetQueryPoolEXT(handle, query_pool, first, count); } - void GetQueryResults(VkQueryPool query_pool, u32 first, u32 count, std::size_t data_size, - void* data, VkDeviceSize stride, VkQueryResultFlags flags) const { - Check(dld->vkGetQueryPoolResults(handle, query_pool, first, count, data_size, data, stride, - flags)); - } - - template <typename T> - T GetQueryResult(VkQueryPool query_pool, u32 first, VkQueryResultFlags flags) const { - static_assert(std::is_trivially_copyable_v<T>); - T value; - GetQueryResults(query_pool, first, 1, sizeof(T), &value, sizeof(T), flags); - return value; + VkResult GetQueryResults(VkQueryPool query_pool, u32 first, u32 count, std::size_t data_size, + void* data, VkDeviceSize stride, + VkQueryResultFlags flags) const noexcept { + return dld->vkGetQueryPoolResults(handle, query_pool, first, count, data_size, data, stride, + flags); } }; @@ -776,7 +832,7 @@ public: bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const; - VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept; + VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const; std::vector<VkSurfaceFormatKHR> GetSurfaceFormatsKHR(VkSurfaceKHR) const; @@ -835,8 +891,8 @@ public: dld->vkCmdBindPipeline(handle, bind_point, pipeline); } - void BindIndexBuffer(VkBuffer buffer, VkDeviceSize offset, VkIndexType index_type) const - noexcept { + void BindIndexBuffer(VkBuffer buffer, VkDeviceSize offset, + VkIndexType index_type) const noexcept { dld->vkCmdBindIndexBuffer(handle, buffer, offset, index_type); } @@ -849,8 +905,8 @@ public: BindVertexBuffers(binding, 1, &buffer, &offset); } - void Draw(u32 vertex_count, u32 instance_count, u32 first_vertex, u32 first_instance) const - noexcept { + void Draw(u32 vertex_count, u32 instance_count, u32 first_vertex, + u32 first_instance) const 
noexcept { dld->vkCmdDraw(handle, vertex_count, instance_count, first_vertex, first_instance); } @@ -860,15 +916,15 @@ public: first_instance); } - void ClearAttachments(Span<VkClearAttachment> attachments, Span<VkClearRect> rects) const - noexcept { + void ClearAttachments(Span<VkClearAttachment> attachments, + Span<VkClearRect> rects) const noexcept { dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), rects.data()); } void BlitImage(VkImage src_image, VkImageLayout src_layout, VkImage dst_image, - VkImageLayout dst_layout, Span<VkImageBlit> regions, VkFilter filter) const - noexcept { + VkImageLayout dst_layout, Span<VkImageBlit> regions, + VkFilter filter) const noexcept { dld->vkCmdBlitImage(handle, src_image, src_layout, dst_image, dst_layout, regions.size(), regions.data(), filter); } @@ -893,8 +949,8 @@ public: regions.data()); } - void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, Span<VkBufferCopy> regions) const - noexcept { + void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, + Span<VkBufferCopy> regions) const noexcept { dld->vkCmdCopyBuffer(handle, src_buffer, dst_buffer, regions.size(), regions.data()); } @@ -910,8 +966,8 @@ public: regions.data()); } - void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, u32 data) const - noexcept { + void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, + u32 data) const noexcept { dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); } @@ -920,10 +976,6 @@ public: dld->vkCmdPushConstants(handle, layout, flags, offset, size, values); } - void SetCheckpointNV(const void* checkpoint_marker) const noexcept { - dld->vkCmdSetCheckpointNV(handle, checkpoint_marker); - } - void SetViewport(u32 first, Span<VkViewport> viewports) const noexcept { dld->vkCmdSetViewport(handle, first, viewports.size(), viewports.data()); } @@ -956,6 +1008,63 @@ public: dld->vkCmdSetDepthBounds(handle, min_depth_bounds, max_depth_bounds); } + void SetEvent(VkEvent event, VkPipelineStageFlags stage_flags) const noexcept { + dld->vkCmdSetEvent(handle, event, stage_flags); + } + + void WaitEvents(Span<VkEvent> events, VkPipelineStageFlags src_stage_mask, + VkPipelineStageFlags dst_stage_mask, Span<VkMemoryBarrier> memory_barriers, + Span<VkBufferMemoryBarrier> buffer_barriers, + Span<VkImageMemoryBarrier> image_barriers) const noexcept { + dld->vkCmdWaitEvents(handle, events.size(), events.data(), src_stage_mask, dst_stage_mask, + memory_barriers.size(), memory_barriers.data(), buffer_barriers.size(), + buffer_barriers.data(), image_barriers.size(), image_barriers.data()); + } + + void BindVertexBuffers2EXT(u32 first_binding, u32 binding_count, const VkBuffer* buffers, + const VkDeviceSize* offsets, const VkDeviceSize* sizes, + const VkDeviceSize* strides) const noexcept { + dld->vkCmdBindVertexBuffers2EXT(handle, first_binding, binding_count, buffers, offsets, + sizes, strides); + } + + void SetCullModeEXT(VkCullModeFlags cull_mode) const noexcept { + dld->vkCmdSetCullModeEXT(handle, cull_mode); + } + + void SetDepthBoundsTestEnableEXT(bool enable) const noexcept { + dld->vkCmdSetDepthBoundsTestEnableEXT(handle, enable ? VK_TRUE : VK_FALSE); + } + + void SetDepthCompareOpEXT(VkCompareOp compare_op) const noexcept { + dld->vkCmdSetDepthCompareOpEXT(handle, compare_op); + } + + void SetDepthTestEnableEXT(bool enable) const noexcept { + dld->vkCmdSetDepthTestEnableEXT(handle, enable ? 
VK_TRUE : VK_FALSE); + } + + void SetDepthWriteEnableEXT(bool enable) const noexcept { + dld->vkCmdSetDepthWriteEnableEXT(handle, enable ? VK_TRUE : VK_FALSE); + } + + void SetFrontFaceEXT(VkFrontFace front_face) const noexcept { + dld->vkCmdSetFrontFaceEXT(handle, front_face); + } + + void SetPrimitiveTopologyEXT(VkPrimitiveTopology primitive_topology) const noexcept { + dld->vkCmdSetPrimitiveTopologyEXT(handle, primitive_topology); + } + + void SetStencilOpEXT(VkStencilFaceFlags face_mask, VkStencilOp fail_op, VkStencilOp pass_op, + VkStencilOp depth_fail_op, VkCompareOp compare_op) const noexcept { + dld->vkCmdSetStencilOpEXT(handle, face_mask, fail_op, pass_op, depth_fail_op, compare_op); + } + + void SetStencilTestEnableEXT(bool enable) const noexcept { + dld->vkCmdSetStencilTestEnableEXT(handle, enable ? VK_TRUE : VK_FALSE); + } + void BindTransformFeedbackBuffersEXT(u32 first, u32 count, const VkBuffer* buffers, const VkDeviceSize* offsets, const VkDeviceSize* sizes) const noexcept { @@ -981,7 +1090,12 @@ private: const DeviceDispatch* dld; }; +u32 AvailableVersion(const InstanceDispatch& dld) noexcept; + std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld); +std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( + const InstanceDispatch& dld); + } // namespace Vulkan::vk diff --git a/src/video_core/shader/ast.h b/src/video_core/shader/ast.h index cca13bcde..8e5a22ab3 100644 --- a/src/video_core/shader/ast.h +++ b/src/video_core/shader/ast.h @@ -199,55 +199,48 @@ public: } std::optional<u32> GetGotoLabel() const { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTGoto>(&data)) { return {inner->label}; } - return {}; + return std::nullopt; } Expr GetGotoCondition() const { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTGoto>(&data)) { return inner->condition; } return nullptr; } void MarkLabelUnused() { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (auto* inner = std::get_if<ASTLabel>(&data)) { inner->unused = true; } } bool IsLabelUnused() const { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTLabel>(&data)) { return inner->unused; } return true; } std::optional<u32> GetLabelIndex() const { - auto inner = std::get_if<ASTLabel>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTLabel>(&data)) { return {inner->index}; } - return {}; + return std::nullopt; } Expr GetIfCondition() const { - auto inner = std::get_if<ASTIfThen>(&data); - if (inner) { + if (const auto* inner = std::get_if<ASTIfThen>(&data)) { return inner->condition; } return nullptr; } void SetGotoCondition(Expr new_condition) { - auto inner = std::get_if<ASTGoto>(&data); - if (inner) { + if (auto* inner = std::get_if<ASTGoto>(&data)) { inner->condition = std::move(new_condition); } } diff --git a/src/video_core/shader/async_shaders.cpp b/src/video_core/shader/async_shaders.cpp new file mode 100644 index 000000000..6920afdf2 --- /dev/null +++ b/src/video_core/shader/async_shaders.cpp @@ -0,0 +1,216 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
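The ast.h hunk above rewrites every accessor into the if (const auto* inner = std::get_if<T>(&data)) form, which scopes the pointer to the branch, avoids the throwing std::get, and returns std::nullopt explicitly instead of an empty brace. A standalone illustration of the idiom; Goto, Label and Node are invented stand-ins for the real AST types:

// Standalone illustration of the std::get_if accessor idiom.
#include <cstdio>
#include <optional>
#include <variant>

struct Goto {
    unsigned label;
};
struct Label {
    unsigned index;
    bool unused = false;
};
using Node = std::variant<Goto, Label>;

// Returns the goto label only if the node actually holds a Goto.
std::optional<unsigned> GetGotoLabel(const Node& node) {
    if (const auto* inner = std::get_if<Goto>(&node)) {
        return inner->label;
    }
    return std::nullopt;
}

int main() {
    const Node a = Goto{42};
    const Node b = Label{7};
    if (const auto label = GetGotoLabel(a)) {
        std::printf("a -> goto %u\n", *label);
    }
    std::printf("b has a goto label? %s\n", GetGotoLabel(b) ? "yes" : "no");
}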
+ +#include <condition_variable> +#include <mutex> +#include <thread> +#include <vector> +#include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_base.h" +#include "video_core/renderer_opengl/gl_shader_cache.h" +#include "video_core/shader/async_shaders.h" + +namespace VideoCommon::Shader { + +AsyncShaders::AsyncShaders(Core::Frontend::EmuWindow& emu_window) : emu_window(emu_window) {} + +AsyncShaders::~AsyncShaders() { + KillWorkers(); +} + +void AsyncShaders::AllocateWorkers() { + // Use at least one thread + u32 num_workers = 1; + + // Deduce how many more threads we can use + const u32 thread_count = std::thread::hardware_concurrency(); + if (thread_count >= 8) { + // Increase async workers by 1 for every 2 threads >= 8 + num_workers += 1 + (thread_count - 8) / 2; + } + + // If we already have workers queued, ignore + if (num_workers == worker_threads.size()) { + return; + } + + // If workers already exist, clear them + if (!worker_threads.empty()) { + FreeWorkers(); + } + + // Create workers + for (std::size_t i = 0; i < num_workers; i++) { + context_list.push_back(emu_window.CreateSharedContext()); + worker_threads.emplace_back(&AsyncShaders::ShaderCompilerThread, this, + context_list[i].get()); + } +} + +void AsyncShaders::FreeWorkers() { + // Mark all threads to quit + is_thread_exiting.store(true); + cv.notify_all(); + for (auto& thread : worker_threads) { + thread.join(); + } + // Clear our shared contexts + context_list.clear(); + + // Clear our worker threads + worker_threads.clear(); +} + +void AsyncShaders::KillWorkers() { + is_thread_exiting.store(true); + for (auto& thread : worker_threads) { + thread.detach(); + } + // Clear our shared contexts + context_list.clear(); + + // Clear our worker threads + worker_threads.clear(); +} + +bool AsyncShaders::HasWorkQueued() const { + return !pending_queue.empty(); +} + +bool AsyncShaders::HasCompletedWork() const { + std::shared_lock lock{completed_mutex}; + return !finished_work.empty(); +} + +bool AsyncShaders::IsShaderAsync(const Tegra::GPU& gpu) const { + const auto& regs = gpu.Maxwell3D().regs; + + // If something is using depth, we can assume that games are not rendering anything which will + // be used one time. + if (regs.zeta_enable) { + return true; + } + + // If games are using a small index count, we can assume these are full screen quads. Usually + // these shaders are only used once for building textures so we can assume they can't be built + // async + if (regs.index_array.count <= 6 || regs.vertex_buffer.count <= 6) { + return false; + } + + return true; +} + +std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() { + std::vector<Result> results; + { + std::unique_lock lock{completed_mutex}; + results = std::move(finished_work); + finished_work.clear(); + } + return results; +} + +void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device, + Tegra::Engines::ShaderType shader_type, u64 uid, + std::vector<u64> code, std::vector<u64> code_b, + u32 main_offset, CompilerSettings compiler_settings, + const Registry& registry, VAddr cpu_addr) { + std::unique_lock lock(queue_mutex); + pending_queue.push({ + .backend = device.UseAssemblyShaders() ? 
Backend::GLASM : Backend::OpenGL, + .device = &device, + .shader_type = shader_type, + .uid = uid, + .code = std::move(code), + .code_b = std::move(code_b), + .main_offset = main_offset, + .compiler_settings = compiler_settings, + .registry = registry, + .cpu_address = cpu_addr, + }); + cv.notify_one(); +} + +void AsyncShaders::QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, + const Vulkan::VKDevice& device, Vulkan::VKScheduler& scheduler, + Vulkan::VKDescriptorPool& descriptor_pool, + Vulkan::VKUpdateDescriptorQueue& update_descriptor_queue, + Vulkan::VKRenderPassCache& renderpass_cache, + std::vector<VkDescriptorSetLayoutBinding> bindings, + Vulkan::SPIRVProgram program, + Vulkan::GraphicsPipelineCacheKey key) { + std::unique_lock lock(queue_mutex); + pending_queue.push({ + .backend = Backend::Vulkan, + .pp_cache = pp_cache, + .vk_device = &device, + .scheduler = &scheduler, + .descriptor_pool = &descriptor_pool, + .update_descriptor_queue = &update_descriptor_queue, + .renderpass_cache = &renderpass_cache, + .bindings = std::move(bindings), + .program = std::move(program), + .key = key, + }); + cv.notify_one(); +} + +void AsyncShaders::ShaderCompilerThread(Core::Frontend::GraphicsContext* context) { + while (!is_thread_exiting.load(std::memory_order_relaxed)) { + std::unique_lock lock{queue_mutex}; + cv.wait(lock, [this] { return HasWorkQueued() || is_thread_exiting; }); + if (is_thread_exiting) { + return; + } + + // Partial lock to allow all threads to read at the same time + if (!HasWorkQueued()) { + continue; + } + // Another thread beat us, just unlock and wait for the next load + if (pending_queue.empty()) { + continue; + } + + // Pull work from queue + WorkerParams work = std::move(pending_queue.front()); + pending_queue.pop(); + lock.unlock(); + + if (work.backend == Backend::OpenGL || work.backend == Backend::GLASM) { + const ShaderIR ir(work.code, work.main_offset, work.compiler_settings, *work.registry); + const auto scope = context->Acquire(); + auto program = + OpenGL::BuildShader(*work.device, work.shader_type, work.uid, ir, *work.registry); + Result result{}; + result.backend = work.backend; + result.cpu_address = work.cpu_address; + result.uid = work.uid; + result.code = std::move(work.code); + result.code_b = std::move(work.code_b); + result.shader_type = work.shader_type; + + if (work.backend == Backend::OpenGL) { + result.program.opengl = std::move(program->source_program); + } else if (work.backend == Backend::GLASM) { + result.program.glasm = std::move(program->assembly_program); + } + + { + std::unique_lock complete_lock(completed_mutex); + finished_work.push_back(std::move(result)); + } + } else if (work.backend == Backend::Vulkan) { + auto pipeline = std::make_unique<Vulkan::VKGraphicsPipeline>( + *work.vk_device, *work.scheduler, *work.descriptor_pool, + *work.update_descriptor_queue, *work.renderpass_cache, work.key, work.bindings, + work.program); + + work.pp_cache->EmplacePipeline(std::move(pipeline)); + } + } +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/async_shaders.h b/src/video_core/shader/async_shaders.h new file mode 100644 index 000000000..7a99e1dc5 --- /dev/null +++ b/src/video_core/shader/async_shaders.h @@ -0,0 +1,147 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
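ShaderCompilerThread above is a standard condition-variable worker loop: producers push work under queue_mutex and notify, workers wait on a predicate, and after waking they re-check the queue because another worker may already have taken the item. A reduced sketch of that shape, with a plain integer standing in for the shader work item and printf standing in for the actual shader build:

// Reduced sketch of the worker-queue shape: push + notify on one side,
// predicate wait + re-check on the other.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class WorkerPool {
public:
    explicit WorkerPool(std::size_t num_workers) {
        for (std::size_t i = 0; i < num_workers; ++i) {
            threads.emplace_back(&WorkerPool::WorkerLoop, this);
        }
    }

    ~WorkerPool() {
        exiting.store(true);
        cv.notify_all();
        for (auto& thread : threads) {
            thread.join();
        }
    }

    void Queue(int item) {
        {
            std::scoped_lock lock{queue_mutex};
            pending.push(item);
        }
        cv.notify_one();
    }

private:
    void WorkerLoop() {
        while (!exiting.load(std::memory_order_relaxed)) {
            std::unique_lock lock{queue_mutex};
            cv.wait(lock, [this] { return exiting || !pending.empty(); });
            if (exiting) {
                return;
            }
            if (pending.empty()) {
                continue; // another worker took the item first
            }
            const int item = pending.front();
            pending.pop();
            lock.unlock();
            std::printf("compiled item %d\n", item); // stand-in for building a shader
        }
    }

    std::condition_variable cv;
    std::mutex queue_mutex;
    std::queue<int> pending;
    std::atomic<bool> exiting{false};
    std::vector<std::thread> threads;
};

int main() {
    WorkerPool pool{2};
    for (int i = 0; i < 4; ++i) {
        pool.Queue(i);
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
}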
+ +#pragma once + +#include <condition_variable> +#include <memory> +#include <shared_mutex> +#include <thread> + +// This header includes both Vulkan and OpenGL headers; this has to be fixed +// Unfortunately, including OpenGL will include Windows.h that defines macros that can cause issues. +// Forcefully include glad early and undefine macros +#include <glad/glad.h> +#ifdef CreateEvent +#undef CreateEvent +#endif +#ifdef CreateSemaphore +#undef CreateSemaphore +#endif + +#include "common/common_types.h" +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_shader_decompiler.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" + +namespace Core::Frontend { +class EmuWindow; +class GraphicsContext; +} // namespace Core::Frontend + +namespace Tegra { +class GPU; +} + +namespace Vulkan { +class VKPipelineCache; +} + +namespace VideoCommon::Shader { + +class AsyncShaders { +public: + enum class Backend { + OpenGL, + GLASM, + Vulkan, + }; + + struct ResultPrograms { + OpenGL::OGLProgram opengl; + OpenGL::OGLAssemblyProgram glasm; + }; + + struct Result { + u64 uid; + VAddr cpu_address; + Backend backend; + ResultPrograms program; + std::vector<u64> code; + std::vector<u64> code_b; + Tegra::Engines::ShaderType shader_type; + }; + + explicit AsyncShaders(Core::Frontend::EmuWindow& emu_window); + ~AsyncShaders(); + + /// Start up shader worker threads + void AllocateWorkers(); + + /// Clear the shader queue and kill all worker threads + void FreeWorkers(); + + // Force end all threads + void KillWorkers(); + + /// Check to see if any shaders have actually been compiled + [[nodiscard]] bool HasCompletedWork() const; + + /// Deduce if a shader can be built on another thread or MUST be built in sync. We cannot build + /// every shader async as some shaders are only built and executed once.
We try to "guess" which + /// shader would be used only once + [[nodiscard]] bool IsShaderAsync(const Tegra::GPU& gpu) const; + + /// Pulls completed compiled shaders + [[nodiscard]] std::vector<Result> GetCompletedWork(); + + void QueueOpenGLShader(const OpenGL::Device& device, Tegra::Engines::ShaderType shader_type, + u64 uid, std::vector<u64> code, std::vector<u64> code_b, u32 main_offset, + CompilerSettings compiler_settings, const Registry& registry, + VAddr cpu_addr); + + void QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, const Vulkan::VKDevice& device, + Vulkan::VKScheduler& scheduler, + Vulkan::VKDescriptorPool& descriptor_pool, + Vulkan::VKUpdateDescriptorQueue& update_descriptor_queue, + Vulkan::VKRenderPassCache& renderpass_cache, + std::vector<VkDescriptorSetLayoutBinding> bindings, + Vulkan::SPIRVProgram program, Vulkan::GraphicsPipelineCacheKey key); + +private: + void ShaderCompilerThread(Core::Frontend::GraphicsContext* context); + + /// Check our worker queue to see if we have any work queued already + [[nodiscard]] bool HasWorkQueued() const; + + struct WorkerParams { + Backend backend; + // For OGL + const OpenGL::Device* device; + Tegra::Engines::ShaderType shader_type; + u64 uid; + std::vector<u64> code; + std::vector<u64> code_b; + u32 main_offset; + CompilerSettings compiler_settings; + std::optional<Registry> registry; + VAddr cpu_address; + + // For Vulkan + Vulkan::VKPipelineCache* pp_cache; + const Vulkan::VKDevice* vk_device; + Vulkan::VKScheduler* scheduler; + Vulkan::VKDescriptorPool* descriptor_pool; + Vulkan::VKUpdateDescriptorQueue* update_descriptor_queue; + Vulkan::VKRenderPassCache* renderpass_cache; + std::vector<VkDescriptorSetLayoutBinding> bindings; + Vulkan::SPIRVProgram program; + Vulkan::GraphicsPipelineCacheKey key; + }; + + std::condition_variable cv; + mutable std::mutex queue_mutex; + mutable std::shared_mutex completed_mutex; + std::atomic<bool> is_thread_exiting{}; + std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> context_list; + std::vector<std::thread> worker_threads; + std::queue<WorkerParams> pending_queue; + std::vector<Result> finished_work; + Core::Frontend::EmuWindow& emu_window; +}; + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index 2e2711350..4c8971615 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -13,6 +13,7 @@ #include "common/common_types.h" #include "video_core/shader/ast.h" #include "video_core/shader/control_flow.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" @@ -115,17 +116,6 @@ Pred GetPredicate(u32 index, bool negated) { return static_cast<Pred>(static_cast<u64>(index) + (negated ? 8ULL : 0ULL)); } -/** - * Returns whether the instruction at the specified offset is a 'sched' instruction. - * Sched instructions always appear before a sequence of 3 instructions. 
- */ -constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { - constexpr u32 SchedPeriod = 4; - u32 absolute_offset = offset - main_offset; - - return (absolute_offset % SchedPeriod) == 0; -} - enum class ParseResult : u32 { ControlCaught, BlockEnd, @@ -197,24 +187,26 @@ std::optional<std::pair<BufferInfo, u64>> TrackLDC(const CFGRebuildState& state, std::optional<u64> TrackSHLRegister(const CFGRebuildState& state, u32& pos, u64 ldc_tracked_register) { - return TrackInstruction<u64>(state, pos, - [ldc_tracked_register](auto instr, const auto& opcode) { - return opcode.GetId() == OpCode::Id::SHL_IMM && - instr.gpr0.Value() == ldc_tracked_register; - }, - [](auto instr, const auto&) { return instr.gpr8.Value(); }); + return TrackInstruction<u64>( + state, pos, + [ldc_tracked_register](auto instr, const auto& opcode) { + return opcode.GetId() == OpCode::Id::SHL_IMM && + instr.gpr0.Value() == ldc_tracked_register; + }, + [](auto instr, const auto&) { return instr.gpr8.Value(); }); } std::optional<u32> TrackIMNMXValue(const CFGRebuildState& state, u32& pos, u64 shl_tracked_register) { - return TrackInstruction<u32>(state, pos, - [shl_tracked_register](auto instr, const auto& opcode) { - return opcode.GetId() == OpCode::Id::IMNMX_IMM && - instr.gpr0.Value() == shl_tracked_register; - }, - [](auto instr, const auto&) { - return static_cast<u32>(instr.alu.GetSignedImm20_20() + 1); - }); + return TrackInstruction<u32>( + state, pos, + [shl_tracked_register](auto instr, const auto& opcode) { + return opcode.GetId() == OpCode::Id::IMNMX_IMM && + instr.gpr0.Value() == shl_tracked_register; + }, + [](auto instr, const auto&) { + return static_cast<u32>(instr.alu.GetSignedImm20_20() + 1); + }); } std::optional<BranchIndirectInfo> TrackBranchIndirectInfo(const CFGRebuildState& state, u32 pos) { @@ -484,17 +476,17 @@ bool TryInspectAddress(CFGRebuildState& state) { } case BlockCollision::Inside: { // This case is the tricky one: - // We need to Split the block in 2 sepparate blocks + // We need to split the block into 2 separate blocks const u32 end = state.block_info[block_index].end; BlockInfo& new_block = CreateBlockInfo(state, address, end); BlockInfo& current_block = state.block_info[block_index]; current_block.end = address - 1; - new_block.branch = current_block.branch; + new_block.branch = std::move(current_block.branch); BlockBranchInfo forward_branch = MakeBranchInfo<SingleBranch>(); const auto branch = std::get_if<SingleBranch>(forward_branch.get()); branch->address = address; branch->ignore = true; - current_block.branch = forward_branch; + current_block.branch = std::move(forward_branch); return true; } default: @@ -555,13 +547,13 @@ bool TryQuery(CFGRebuildState& state) { gather_labels(q2.ssy_stack, state.ssy_labels, block); gather_labels(q2.pbk_stack, state.pbk_labels, block); if (std::holds_alternative<SingleBranch>(*block.branch)) { - const auto branch = std::get_if<SingleBranch>(block.branch.get()); + auto* branch = std::get_if<SingleBranch>(block.branch.get()); if (!branch->condition.IsUnconditional()) { q2.address = block.end + 1; state.queries.push_back(q2); } - Query conditional_query{q2}; + auto& conditional_query = state.queries.emplace_back(q2); if (branch->is_sync) { if (branch->address == unassigned_branch) { branch->address = conditional_query.ssy_stack.top(); @@ -575,23 +567,21 @@ bool TryQuery(CFGRebuildState& state) { conditional_query.pbk_stack.pop(); } conditional_query.address = branch->address; - state.queries.push_back(std::move(conditional_query)); return 
true; } - const auto multi_branch = std::get_if<MultiBranch>(block.branch.get()); + + const auto* multi_branch = std::get_if<MultiBranch>(block.branch.get()); for (const auto& branch_case : multi_branch->branches) { - Query conditional_query{q2}; + auto& conditional_query = state.queries.emplace_back(q2); conditional_query.address = branch_case.address; - state.queries.push_back(std::move(conditional_query)); } + return true; } -} // Anonymous namespace - void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { - const auto get_expr = ([&](const Condition& cond) -> Expr { - Expr result{}; + const auto get_expr = [](const Condition& cond) -> Expr { + Expr result; if (cond.cc != ConditionCode::T) { result = MakeExpr<ExprCondCode>(cond.cc); } @@ -604,10 +594,10 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { } Expr extra = MakeExpr<ExprPredicate>(pred); if (negate) { - extra = MakeExpr<ExprNot>(extra); + extra = MakeExpr<ExprNot>(std::move(extra)); } if (result) { - return MakeExpr<ExprAnd>(extra, result); + return MakeExpr<ExprAnd>(std::move(extra), std::move(result)); } return extra; } @@ -615,9 +605,10 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { return result; } return MakeExpr<ExprBoolean>(true); - }); + }; + if (std::holds_alternative<SingleBranch>(*branch_info)) { - const auto branch = std::get_if<SingleBranch>(branch_info.get()); + const auto* branch = std::get_if<SingleBranch>(branch_info.get()); if (branch->address < 0) { if (branch->kill) { mm.InsertReturn(get_expr(branch->condition), true); @@ -629,7 +620,7 @@ void InsertBranch(ASTManager& mm, const BlockBranchInfo& branch_info) { mm.InsertGoto(get_expr(branch->condition), branch->address); return; } - const auto multi_branch = std::get_if<MultiBranch>(branch_info.get()); + const auto* multi_branch = std::get_if<MultiBranch>(branch_info.get()); for (const auto& branch_case : multi_branch->branches) { mm.InsertGoto(MakeExpr<ExprGprEqual>(multi_branch->gpr, branch_case.cmp_value), branch_case.address); @@ -655,6 +646,8 @@ void DecompileShader(CFGRebuildState& state) { state.manager->Decompile(); } +} // Anonymous namespace + std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address, const CompilerSettings& settings, Registry& registry) { diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index 87ac9ac6c..eeac328a6 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -13,6 +13,7 @@ #include "video_core/engines/shader_bytecode.h" #include "video_core/engines/shader_header.h" #include "video_core/shader/control_flow.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/node_helper.h" #include "video_core/shader/shader_ir.h" @@ -23,17 +24,6 @@ using Tegra::Shader::OpCode; namespace { -/** - * Returns whether the instruction at the specified offset is a 'sched' instruction. - * Sched instructions always appear before a sequence of 3 instructions. 
- */ -constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) { - constexpr u32 SchedPeriod = 4; - u32 absolute_offset = offset - main_offset; - - return (absolute_offset % SchedPeriod) == 0; -} - void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, const std::list<Sampler>& used_samplers) { if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) { @@ -42,11 +32,11 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, u32 count{}; std::vector<u32> bound_offsets; for (const auto& sampler : used_samplers) { - if (sampler.IsBindless()) { + if (sampler.is_bindless) { continue; } ++count; - bound_offsets.emplace_back(sampler.GetOffset()); + bound_offsets.emplace_back(sampler.offset); } if (count > 1) { gpu_driver.DeduceTextureHandlerSize(std::move(bound_offsets)); @@ -56,14 +46,14 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, VideoCore::GuestDriverProfile& gpu_driver, const std::list<Sampler>& used_samplers) { - const u32 base_offset = sampler_to_deduce.GetOffset(); + const u32 base_offset = sampler_to_deduce.offset; u32 max_offset{std::numeric_limits<u32>::max()}; for (const auto& sampler : used_samplers) { - if (sampler.IsBindless()) { + if (sampler.is_bindless) { continue; } - if (sampler.GetOffset() > base_offset) { - max_offset = std::min(sampler.GetOffset(), max_offset); + if (sampler.offset > base_offset) { + max_offset = std::min(sampler.offset, max_offset); } } if (max_offset == std::numeric_limits<u32>::max()) { @@ -265,7 +255,7 @@ void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) { Node n = Operation(OperationCode::Branch, Immediate(branch_case.address)); Node op_b = Immediate(branch_case.cmp_value); Node condition = - GetPredicateComparisonInteger(Tegra::Shader::PredCondition::Equal, false, op_a, op_b); + GetPredicateComparisonInteger(Tegra::Shader::PredCondition::EQ, false, op_a, op_b); auto result = Conditional(condition, {n}); bb.push_back(result); global_code.push_back(result); @@ -363,14 +353,14 @@ void ShaderIR::PostDecode() { return; } for (auto& sampler : used_samplers) { - if (!sampler.IsIndexed()) { + if (!sampler.is_indexed) { continue; } if (const auto size = TryDeduceSamplerSize(sampler, gpu_driver, used_samplers)) { - sampler.SetSize(*size); + sampler.size = *size; } else { LOG_CRITICAL(HW_GPU, "Failed to deduce size of indexed sampler"); - sampler.SetSize(1); + sampler.size = 1; } } } diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 4db329fa5..afef5948d 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -137,7 +137,8 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::FCMP_RR: - case OpCode::Id::FCMP_RC: { + case OpCode::Id::FCMP_RC: + case OpCode::Id::FCMP_IMMR: { UNIMPLEMENTED_IF(instr.fcmp.ftz == 0); Node op_c = GetRegister(instr.gpr39); Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f)); diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp index ee7d9a29d..88103fede 100644 --- a/src/video_core/shader/decode/arithmetic_half.cpp +++ b/src/video_core/shader/decode/arithmetic_half.cpp @@ -19,22 +19,49 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = 
OpCode::Decode(instr); - if (opcode->get().GetId() == OpCode::Id::HADD2_C || - opcode->get().GetId() == OpCode::Id::HADD2_R) { + bool negate_a = false; + bool negate_b = false; + bool absolute_a = false; + bool absolute_b = false; + + switch (opcode->get().GetId()) { + case OpCode::Id::HADD2_R: if (instr.alu_half.ftz == 0) { LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); } + negate_a = ((instr.value >> 43) & 1) != 0; + negate_b = ((instr.value >> 31) & 1) != 0; + absolute_a = ((instr.value >> 44) & 1) != 0; + absolute_b = ((instr.value >> 30) & 1) != 0; + break; + case OpCode::Id::HADD2_C: + if (instr.alu_half.ftz == 0) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + } + negate_a = ((instr.value >> 43) & 1) != 0; + negate_b = ((instr.value >> 56) & 1) != 0; + absolute_a = ((instr.value >> 44) & 1) != 0; + absolute_b = ((instr.value >> 54) & 1) != 0; + break; + case OpCode::Id::HMUL2_R: + negate_a = ((instr.value >> 43) & 1) != 0; + absolute_a = ((instr.value >> 44) & 1) != 0; + absolute_b = ((instr.value >> 30) & 1) != 0; + break; + case OpCode::Id::HMUL2_C: + negate_b = ((instr.value >> 31) & 1) != 0; + absolute_a = ((instr.value >> 44) & 1) != 0; + absolute_b = ((instr.value >> 54) & 1) != 0; + break; + default: + UNREACHABLE(); + break; } - const bool negate_a = - opcode->get().GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0; - const bool negate_b = - opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0; - Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a); - op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a); + op_a = GetOperandAbsNegHalf(op_a, absolute_a, negate_a); - auto [type_b, op_b] = [&]() -> std::tuple<HalfType, Node> { + auto [type_b, op_b] = [this, instr, opcode]() -> std::pair<HalfType, Node> { switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HMUL2_C: @@ -48,17 +75,16 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { } }(); op_b = UnpackHalfFloat(op_b, type_b); - // redeclaration to avoid a bug in clang with reusing local bindings in lambdas - Node op_b_alt = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b); + op_b = GetOperandAbsNegHalf(op_b, absolute_b, negate_b); - Node value = [&]() { + Node value = [this, opcode, op_a, op_b = op_b] { switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HADD2_R: - return Operation(OperationCode::HAdd, PRECISE, op_a, op_b_alt); + return Operation(OperationCode::HAdd, PRECISE, op_a, op_b); case OpCode::Id::HMUL2_C: case OpCode::Id::HMUL2_R: - return Operation(OperationCode::HMul, PRECISE, op_a, op_b_alt); + return Operation(OperationCode::HMul, PRECISE, op_a, op_b); default: UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName()); return Immediate(0); diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index 0f4c3103a..73155966f 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -35,15 +35,38 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { case OpCode::Id::IADD_C: case OpCode::Id::IADD_R: case OpCode::Id::IADD_IMM: { - UNIMPLEMENTED_IF_MSG(instr.alu.saturate_d, "IADD saturation not implemented"); + UNIMPLEMENTED_IF_MSG(instr.alu.saturate_d, "IADD.SAT"); + UNIMPLEMENTED_IF_MSG(instr.iadd.x && instr.generates_cc, "IADD.X Rd.CC"); op_a = 
GetOperandAbsNegInteger(op_a, false, instr.alu_integer.negate_a, true); op_b = GetOperandAbsNegInteger(op_b, false, instr.alu_integer.negate_b, true); - const Node value = Operation(OperationCode::IAdd, PRECISE, op_a, op_b); + Node value = Operation(OperationCode::UAdd, op_a, op_b); - SetInternalFlagsFromInteger(bb, value, instr.generates_cc); - SetRegister(bb, instr.gpr0, value); + if (instr.iadd.x) { + Node carry = GetInternalFlag(InternalFlag::Carry); + Node x = Operation(OperationCode::Select, std::move(carry), Immediate(1), Immediate(0)); + value = Operation(OperationCode::UAdd, std::move(value), std::move(x)); + } + + if (instr.generates_cc) { + const Node i0 = Immediate(0); + + Node zero = Operation(OperationCode::LogicalIEqual, value, i0); + Node sign = Operation(OperationCode::LogicalILessThan, value, i0); + Node carry = Operation(OperationCode::LogicalAddCarry, op_a, op_b); + + Node pos_a = Operation(OperationCode::LogicalIGreaterThan, op_a, i0); + Node pos_b = Operation(OperationCode::LogicalIGreaterThan, op_b, i0); + Node pos = Operation(OperationCode::LogicalAnd, std::move(pos_a), std::move(pos_b)); + Node overflow = Operation(OperationCode::LogicalAnd, pos, sign); + + SetInternalFlag(bb, InternalFlag::Zero, std::move(zero)); + SetInternalFlag(bb, InternalFlag::Sign, std::move(sign)); + SetInternalFlag(bb, InternalFlag::Carry, std::move(carry)); + SetInternalFlag(bb, InternalFlag::Overflow, std::move(overflow)); + } + SetRegister(bb, instr.gpr0, std::move(value)); break; } case OpCode::Id::IADD3_C: @@ -75,12 +98,12 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { op_b = GetOperandAbsNegInteger(op_b, false, instr.iadd3.neg_b, true); op_c = GetOperandAbsNegInteger(op_c, false, instr.iadd3.neg_c, true); - const Node value = [&]() { - const Node add_ab = Operation(OperationCode::IAdd, NO_PRECISE, op_a, op_b); + const Node value = [&] { + Node add_ab = Operation(OperationCode::IAdd, NO_PRECISE, op_a, op_b); if (opcode->get().GetId() != OpCode::Id::IADD3_R) { return Operation(OperationCode::IAdd, NO_PRECISE, add_ab, op_c); } - const Node shifted = [&]() { + const Node shifted = [&] { switch (instr.iadd3.mode) { case Tegra::Shader::IAdd3Mode::RightShift: // TODO(tech4me): According to @@ -249,8 +272,8 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { } case OpCode::Id::LEA_IMM: { const bool neg = instr.lea.imm.neg != 0; - return {Immediate(static_cast<u32>(instr.lea.imm.entry_a)), - GetOperandAbsNegInteger(GetRegister(instr.gpr8), false, neg, true), + return {GetOperandAbsNegInteger(GetRegister(instr.gpr8), false, neg, true), + Immediate(static_cast<u32>(instr.lea.imm.entry_a)), Immediate(static_cast<u32>(instr.lea.imm.entry_b))}; } case OpCode::Id::LEA_RZ: { diff --git a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp index 73880db0e..2a30aab2b 100644 --- a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp @@ -28,23 +28,26 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { case OpCode::Id::IADD32I: { UNIMPLEMENTED_IF_MSG(instr.iadd32i.saturate, "IADD32I saturation is not implemented"); - op_a = GetOperandAbsNegInteger(op_a, false, instr.iadd32i.negate_a, true); + op_a = GetOperandAbsNegInteger(std::move(op_a), false, instr.iadd32i.negate_a != 0, true); - const Node value = Operation(OperationCode::IAdd, PRECISE, op_a, op_b); + Node value = Operation(OperationCode::IAdd, 
PRECISE, std::move(op_a), std::move(op_b)); - SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc); - SetRegister(bb, instr.gpr0, value); + SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc != 0); + SetRegister(bb, instr.gpr0, std::move(value)); break; } case OpCode::Id::LOP32I: { - if (instr.alu.lop32i.invert_a) - op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_a); + if (instr.alu.lop32i.invert_a) { + op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_a)); + } - if (instr.alu.lop32i.invert_b) - op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_b); + if (instr.alu.lop32i.invert_b) { + op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_b)); + } - WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, op_a, op_b, - PredicateResultMode::None, Pred::UnusedIndex, instr.op_32.generates_cc); + WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, std::move(op_a), + std::move(op_b), PredicateResultMode::None, Pred::UnusedIndex, + instr.op_32.generates_cc != 0); break; } default: @@ -58,14 +61,14 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation logic_op, Node op_a, Node op_b, PredicateResultMode predicate_mode, Pred predicate, bool sets_cc) { - const Node result = [&]() { + Node result = [&] { switch (logic_op) { case LogicOperation::And: - return Operation(OperationCode::IBitwiseAnd, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseAnd, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Or: - return Operation(OperationCode::IBitwiseOr, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseOr, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Xor: - return Operation(OperationCode::IBitwiseXor, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseXor, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::PassB: return op_b; default: @@ -84,8 +87,8 @@ void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation return; case PredicateResultMode::NotZero: { // Set the predicate to true if the result is not zero. 
- const Node compare = Operation(OperationCode::LogicalINotEqual, result, Immediate(0)); - SetPredicate(bb, static_cast<u64>(predicate), compare); + Node compare = Operation(OperationCode::LogicalINotEqual, std::move(result), Immediate(0)); + SetPredicate(bb, static_cast<u64>(predicate), std::move(compare)); break; } default: diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index 848e46874..b2e88fa20 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -13,55 +13,101 @@ namespace VideoCommon::Shader { +using std::move; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; +using Tegra::Shader::PredCondition; u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - if (instr.hset2.ftz == 0) { - LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + PredCondition cond; + bool bf; + bool ftz; + bool neg_a; + bool abs_a; + bool neg_b; + bool abs_b; + switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_C: + case OpCode::Id::HSET2_IMM: + cond = instr.hsetp2.cbuf_and_imm.cond; + bf = instr.Bit(53); + ftz = instr.Bit(54); + neg_a = instr.Bit(43); + abs_a = instr.Bit(44); + neg_b = instr.Bit(56); + abs_b = instr.Bit(54); + break; + case OpCode::Id::HSET2_R: + cond = instr.hsetp2.reg.cond; + bf = instr.Bit(49); + ftz = instr.Bit(50); + neg_a = instr.Bit(43); + abs_a = instr.Bit(44); + neg_b = instr.Bit(31); + abs_b = instr.Bit(30); + break; + default: + UNREACHABLE(); } - Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); - op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); - - Node op_b = [&]() { + Node op_b = [this, instr, opcode] { switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_C: + // Inform as unimplemented as this is not tested. + UNIMPLEMENTED_MSG("HSET2_C is not implemented"); + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); case OpCode::Id::HSET2_R: return GetRegister(instr.gpr20); + case OpCode::Id::HSET2_IMM: + return UnpackHalfImmediate(instr, true); default: UNREACHABLE(); - return Immediate(0); + return Node{}; } }(); - op_b = UnpackHalfFloat(op_b, instr.hset2.type_b); - op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b); - const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); + if (!ftz) { + LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName()); + } + + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); + op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a); + + switch (opcode->get().GetId()) { + case OpCode::Id::HSET2_R: + op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b); + [[fallthrough]]; + case OpCode::Id::HSET2_C: + op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b); + break; + default: + break; + } - const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b); + Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); + + Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b); const OperationCode combiner = GetPredicateCombiner(instr.hset2.op); // HSET2 operates on each half float in the pack. std::array<Node, 2> values; for (u32 i = 0; i < 2; ++i) { - const u32 raw_value = instr.hset2.bf ? 
0x3c00 : 0xffff; - const Node true_value = Immediate(raw_value << (i * 16)); - const Node false_value = Immediate(0); - - const Node comparison = - Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i)); - const Node predicate = Operation(combiner, comparison, second_pred); + const u32 raw_value = bf ? 0x3c00 : 0xffff; + Node true_value = Immediate(raw_value << (i * 16)); + Node false_value = Immediate(0); + Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i)); + Node predicate = Operation(combiner, comparison, second_pred); values[i] = - Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value); + Operation(OperationCode::Select, predicate, move(true_value), move(false_value)); } - const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]); - SetRegister(bb, instr.gpr0, value); + Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]); + SetRegister(bb, instr.gpr0, move(value)); return pc; } diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index 08ebca38b..1ed4212ee 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -31,11 +31,11 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, std::size_t component) { const TextureFormat format{descriptor.format}; switch (format) { - case TextureFormat::R16_G16_B16_A16: - case TextureFormat::R32_G32_B32_A32: - case TextureFormat::R32_G32_B32: - case TextureFormat::R32_G32: - case TextureFormat::R16_G16: + case TextureFormat::R16G16B16A16: + case TextureFormat::R32G32B32A32: + case TextureFormat::R32G32B32: + case TextureFormat::R32G32: + case TextureFormat::R16G16: case TextureFormat::R32: case TextureFormat::R16: case TextureFormat::R8: @@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, break; case TextureFormat::B5G6R5: case TextureFormat::B6G5R5: + case TextureFormat::B10G11R11: if (component == 0) { return descriptor.b_type; } @@ -107,9 +108,9 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, return descriptor.r_type; } break; - case TextureFormat::G8R24: - case TextureFormat::G24R8: - case TextureFormat::G8R8: + case TextureFormat::R24G8: + case TextureFormat::R8G24: + case TextureFormat::R8G8: case TextureFormat::G4R4: if (component == 0) { return descriptor.g_type; @@ -118,6 +119,8 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor, return descriptor.r_type; } break; + default: + break; } UNIMPLEMENTED_MSG("Texture format not implemented={}", format); return ComponentType::FLOAT; @@ -136,15 +139,15 @@ bool IsComponentEnabled(std::size_t component_mask, std::size_t component) { u32 GetComponentSize(TextureFormat format, std::size_t component) { switch (format) { - case TextureFormat::R32_G32_B32_A32: + case TextureFormat::R32G32B32A32: return 32; - case TextureFormat::R16_G16_B16_A16: + case TextureFormat::R16G16B16A16: return 16; - case TextureFormat::R32_G32_B32: + case TextureFormat::R32G32B32: return component <= 2 ? 32 : 0; - case TextureFormat::R32_G32: + case TextureFormat::R32G32: return component <= 1 ? 32 : 0; - case TextureFormat::R16_G16: + case TextureFormat::R16G16: return component <= 1 ? 16 : 0; case TextureFormat::R32: return component == 0 ? 
32 : 0; @@ -191,7 +194,15 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 6; } return 0; - case TextureFormat::G8R24: + case TextureFormat::B10G11R11: + if (component == 1 || component == 2) { + return 11; + } + if (component == 0) { + return 10; + } + return 0; + case TextureFormat::R24G8: if (component == 0) { return 8; } @@ -199,7 +210,7 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 24; } return 0; - case TextureFormat::G24R8: + case TextureFormat::R8G24: if (component == 0) { return 24; } @@ -207,7 +218,7 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 8; } return 0; - case TextureFormat::G8R8: + case TextureFormat::R8G8: return (component == 0 || component == 1) ? 8 : 0; case TextureFormat::G4R4: return (component == 0 || component == 1) ? 4 : 0; @@ -223,24 +234,25 @@ std::size_t GetImageComponentMask(TextureFormat format) { constexpr u8 B = 0b0100; constexpr u8 A = 0b1000; switch (format) { - case TextureFormat::R32_G32_B32_A32: - case TextureFormat::R16_G16_B16_A16: + case TextureFormat::R32G32B32A32: + case TextureFormat::R16G16B16A16: case TextureFormat::A8R8G8B8: case TextureFormat::A2B10G10R10: case TextureFormat::A4B4G4R4: case TextureFormat::A5B5G5R1: case TextureFormat::A1B5G5R5: return std::size_t{R | G | B | A}; - case TextureFormat::R32_G32_B32: + case TextureFormat::R32G32B32: case TextureFormat::R32_B24G8: case TextureFormat::B5G6R5: case TextureFormat::B6G5R5: + case TextureFormat::B10G11R11: return std::size_t{R | G | B}; - case TextureFormat::R32_G32: - case TextureFormat::R16_G16: - case TextureFormat::G8R24: - case TextureFormat::G24R8: - case TextureFormat::G8R8: + case TextureFormat::R32G32: + case TextureFormat::R16G16: + case TextureFormat::R24G8: + case TextureFormat::R8G24: + case TextureFormat::R8G8: case TextureFormat::G4R4: return std::size_t{R | G}; case TextureFormat::R32: @@ -299,7 +311,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type, return {std::move(original_value), true}; } default: - UNIMPLEMENTED_MSG("Unimplement component type={}", component_type); + UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type); return {std::move(original_value), true}; } } @@ -352,8 +364,10 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { registry.ObtainBoundSampler(static_cast<u32>(instr.image.index.Value())); } else { const Node image_register = GetRegister(instr.gpr39); - const auto [base_image, buffer, offset] = TrackCbuf( - image_register, global_code, static_cast<s64>(global_code.size())); + const auto result = TrackCbuf(image_register, global_code, + static_cast<s64>(global_code.size())); + const auto buffer = std::get<1>(result); + const auto offset = std::get<2>(result); descriptor = registry.ObtainBindlessSampler(buffer, offset); } if (!descriptor) { @@ -453,11 +467,14 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { return OperationCode::AtomicImageXor; case Tegra::Shader::ImageAtomicOperation::Exch: return OperationCode::AtomicImageExchange; + default: + break; } + break; default: break; } - UNIMPLEMENTED_MSG("Unimplemented operation={} type={}", + UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}", static_cast<u64>(instr.suatom_d.operation.Value()), static_cast<u64>(instr.suatom_d.operation_type.Value())); return OperationCode::AtomicImageAdd; @@ -483,11 +500,10 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) { 
const auto offset = static_cast<u32>(image.index.Value()); - const auto it = - std::find_if(std::begin(used_images), std::end(used_images), - [offset](const Image& entry) { return entry.GetOffset() == offset; }); + const auto it = std::find_if(std::begin(used_images), std::end(used_images), + [offset](const Image& entry) { return entry.offset == offset; }); if (it != std::end(used_images)) { - ASSERT(!it->IsBindless() && it->GetType() == it->GetType()); + ASSERT(!it->is_bindless && it->type == type); return *it; } @@ -497,16 +513,18 @@ Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType t Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type) { const Node image_register = GetRegister(reg); - const auto [base_image, buffer, offset] = + const auto result = TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size())); - const auto it = - std::find_if(std::begin(used_images), std::end(used_images), - [buffer = buffer, offset = offset](const Image& entry) { - return entry.GetBuffer() == buffer && entry.GetOffset() == offset; - }); + const auto buffer = std::get<1>(result); + const auto offset = std::get<2>(result); + + const auto it = std::find_if(std::begin(used_images), std::end(used_images), + [buffer, offset](const Image& entry) { + return entry.buffer == buffer && entry.offset == offset; + }); if (it != std::end(used_images)) { - ASSERT(it->IsBindless() && it->GetType() == it->GetType()); + ASSERT(it->is_bindless && it->type == type); return *it; } diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index 8112ead3e..e2bba88dd 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -386,8 +386,8 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::RED: { - UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32); - UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add); + UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32, "type={}", + static_cast<int>(instr.red.type.Value())); const auto [real_address, base_address, descriptor] = TrackGlobalMemory(bb, instr, true, true); if (!real_address || !base_address) { @@ -396,7 +396,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor); Node value = GetRegister(instr.gpr0); - bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value))); + bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value))); break; } case OpCode::Id::ATOM: { @@ -472,14 +472,14 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock& const auto [base_address, index, offset] = TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size())); - ASSERT_OR_EXECUTE_MSG(base_address != nullptr, - { return std::make_tuple(nullptr, nullptr, GlobalMemoryBase{}); }, - "Global memory tracking failed"); + ASSERT_OR_EXECUTE_MSG( + base_address != nullptr, { return std::make_tuple(nullptr, nullptr, GlobalMemoryBase{}); }, + "Global memory tracking failed"); bb.push_back(Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", index, offset))); const GlobalMemoryBase descriptor{index, offset}; - const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor); + const auto& entry = used_global_memory.try_emplace(descriptor).first; auto& usage = entry->second; usage.is_written |= is_write; usage.is_read |= is_read; 
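For reference, the condition-code handling that the IADD hunk in arithmetic_integer.cpp now emits (explicit Zero/Sign/Carry/Overflow updates so IADD.X can later consume the carry) can be summarized in plain C++. This is a minimal sketch of the flag arithmetic only; the struct and function names are illustrative and not part of the patch, and the carry test below is the conventional unsigned wrap-around check standing in for the IR's LogicalAddCarry operation.

// Sketch: host-side equivalent of the Zero/Sign/Carry/Overflow flags the
// decoder emits for IADD when generates_cc is set. Illustrative only.
#include <cstdint>

struct AluFlags {
    bool zero;
    bool sign;
    bool carry;
    bool overflow;
};

AluFlags AddWithFlags(std::uint32_t a, std::uint32_t b, std::uint32_t& result) {
    result = a + b; // wraps modulo 2^32
    AluFlags flags{};
    flags.zero = result == 0;
    flags.sign = static_cast<std::int32_t>(result) < 0;
    flags.carry = result < a; // unsigned wrap-around occurred
    // Mirrors the emitted IR: overflow is flagged when two positive operands
    // produce a negative sum (the all-negative overflow case is not covered,
    // matching the hunk above).
    flags.overflow = static_cast<std::int32_t>(a) > 0 &&
                     static_cast<std::int32_t>(b) > 0 && flags.sign;
    return flags;
}

IADD.X then folds the saved carry back in by selecting 1 or 0 from the Carry internal flag and adding it to the unsigned sum, as the same hunk shows.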
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index d4f95b18c..29a7cfbfe 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -75,15 +75,14 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { const Node value = [this, instr] { switch (instr.sys20) { case SystemVariable::LaneId: - LOG_WARNING(HW_GPU, "S2R instruction with LaneId is incomplete"); - return Immediate(0U); + return Operation(OperationCode::ThreadId); case SystemVariable::InvocationId: return Operation(OperationCode::InvocationId); case SystemVariable::Ydirection: return Operation(OperationCode::YNegate); case SystemVariable::InvocationInfo: LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); - return Immediate(0U); + return Immediate(0x00ff'0000U); case SystemVariable::WscaleFactorXY: UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented"); return Immediate(0U); @@ -109,6 +108,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { return Operation(OperationCode::WorkGroupIdY); case SystemVariable::CtaIdZ: return Operation(OperationCode::WorkGroupIdZ); + case SystemVariable::EqMask: + case SystemVariable::LtMask: + case SystemVariable::LeMask: + case SystemVariable::GtMask: + case SystemVariable::GeMask: + uses_warps = true; + switch (instr.sys20) { + case SystemVariable::EqMask: + return Operation(OperationCode::ThreadEqMask); + case SystemVariable::LtMask: + return Operation(OperationCode::ThreadLtMask); + case SystemVariable::LeMask: + return Operation(OperationCode::ThreadLeMask); + case SystemVariable::GtMask: + return Operation(OperationCode::ThreadGtMask); + case SystemVariable::GeMask: + return Operation(OperationCode::ThreadGeMask); + default: + UNREACHABLE(); + return Immediate(0u); + } default: UNIMPLEMENTED_MSG("Unhandled system move: {}", static_cast<u32>(instr.sys20.Value())); @@ -272,10 +292,25 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8)); break; } + case OpCode::Id::BAR: { + UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0"); + bb.push_back(Operation(OperationCode::Barrier)); + break; + } case OpCode::Id::MEMBAR: { - UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL); UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default); - bb.push_back(Operation(OperationCode::MemoryBarrierGL)); + const OperationCode type = [instr] { + switch (instr.membar.type) { + case Tegra::Shader::MembarType::CTA: + return OperationCode::MemoryBarrierGroup; + case Tegra::Shader::MembarType::GL: + return OperationCode::MemoryBarrierGlobal; + default: + UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value())); + return OperationCode::MemoryBarrierGlobal; + } + }(); + bb.push_back(Operation(type)); break; } case OpCode::Id::DEPBAR: { diff --git a/src/video_core/shader/decode/register_set_predicate.cpp b/src/video_core/shader/decode/register_set_predicate.cpp index 8d54cce34..6116c31aa 100644 --- a/src/video_core/shader/decode/register_set_predicate.cpp +++ b/src/video_core/shader/decode/register_set_predicate.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <utility> + #include "common/assert.h" #include "common/common_types.h" #include "video_core/engines/shader_bytecode.h" @@ -10,20 +12,20 @@ namespace VideoCommon::Shader { +using std::move; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; namespace { -constexpr u64 NUM_PROGRAMMABLE_PREDICATES = 7; -} +constexpr u64 NUM_CONDITION_CODES = 4; +constexpr u64 NUM_PREDICATES = 7; +} // namespace u32 ShaderIR::DecodeRegisterSetPredicate(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.p2r_r2p.mode != Tegra::Shader::R2pMode::Pr); - - const Node apply_mask = [&] { + Node apply_mask = [this, opcode, instr] { switch (opcode->get().GetId()) { case OpCode::Id::R2P_IMM: case OpCode::Id::P2R_IMM: @@ -34,39 +36,43 @@ u32 ShaderIR::DecodeRegisterSetPredicate(NodeBlock& bb, u32 pc) { } }(); - const auto offset = static_cast<u32>(instr.p2r_r2p.byte) * 8; + const u32 offset = static_cast<u32>(instr.p2r_r2p.byte) * 8; + + const bool cc = instr.p2r_r2p.mode == Tegra::Shader::R2pMode::Cc; + const u64 num_entries = cc ? NUM_CONDITION_CODES : NUM_PREDICATES; + const auto get_entry = [this, cc](u64 entry) { + return cc ? GetInternalFlag(static_cast<InternalFlag>(entry)) : GetPredicate(entry); + }; switch (opcode->get().GetId()) { case OpCode::Id::R2P_IMM: { - const Node mask = GetRegister(instr.gpr8); + Node mask = GetRegister(instr.gpr8); - for (u64 pred = 0; pred < NUM_PROGRAMMABLE_PREDICATES; ++pred) { - const auto shift = static_cast<u32>(pred); + for (u64 entry = 0; entry < num_entries; ++entry) { + const u32 shift = static_cast<u32>(entry); - const Node apply_compare = BitfieldExtract(apply_mask, shift, 1); - const Node condition = - Operation(OperationCode::LogicalUNotEqual, apply_compare, Immediate(0)); + Node apply = BitfieldExtract(apply_mask, shift, 1); + Node condition = Operation(OperationCode::LogicalUNotEqual, apply, Immediate(0)); - const Node value_compare = BitfieldExtract(mask, offset + shift, 1); - const Node value = - Operation(OperationCode::LogicalUNotEqual, value_compare, Immediate(0)); + Node compare = BitfieldExtract(mask, offset + shift, 1); + Node value = Operation(OperationCode::LogicalUNotEqual, move(compare), Immediate(0)); - const Node code = Operation(OperationCode::LogicalAssign, GetPredicate(pred), value); - bb.push_back(Conditional(condition, {code})); + Node code = Operation(OperationCode::LogicalAssign, get_entry(entry), move(value)); + bb.push_back(Conditional(condition, {move(code)})); } break; } case OpCode::Id::P2R_IMM: { Node value = Immediate(0); - for (u64 pred = 0; pred < NUM_PROGRAMMABLE_PREDICATES; ++pred) { - Node bit = Operation(OperationCode::Select, GetPredicate(pred), Immediate(1U << pred), + for (u64 entry = 0; entry < num_entries; ++entry) { + Node bit = Operation(OperationCode::Select, get_entry(entry), Immediate(1U << entry), Immediate(0)); - value = Operation(OperationCode::UBitwiseOr, std::move(value), std::move(bit)); + value = Operation(OperationCode::UBitwiseOr, move(value), move(bit)); } - value = Operation(OperationCode::UBitwiseAnd, std::move(value), apply_mask); - value = BitfieldInsert(GetRegister(instr.gpr8), std::move(value), offset, 8); + value = Operation(OperationCode::UBitwiseAnd, move(value), apply_mask); + value = BitfieldInsert(GetRegister(instr.gpr8), move(value), offset, 8); - SetRegister(bb, instr.gpr0, std::move(value)); + SetRegister(bb, instr.gpr0, move(value)); break; } default: diff --git 
a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp index 3b391d3e6..d4ffa8014 100644 --- a/src/video_core/shader/decode/shift.cpp +++ b/src/video_core/shader/decode/shift.cpp @@ -23,7 +23,6 @@ Node IsFull(Node shift) { } Node Shift(OperationCode opcode, Node value, Node shift) { - Node is_full = Operation(OperationCode::LogicalIEqual, shift, Immediate(32)); Node shifted = Operation(opcode, move(value), shift); return Operation(OperationCode::Select, IsFull(move(shift)), Immediate(0), move(shifted)); } diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 6c4a1358b..02fdccd86 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -139,15 +139,15 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { } const Node component = Immediate(static_cast<u32>(instr.tld4s.component)); - const SamplerInfo info{TextureType::Texture2D, false, is_depth_compare}; - const Sampler& sampler = *GetSampler(instr.sampler, info); + SamplerInfo info; + info.is_shadow = is_depth_compare; + const std::optional<Sampler> sampler = GetSampler(instr.sampler, info); Node4 values; for (u32 element = 0; element < values.size(); ++element) { - auto coords_copy = coords; - MetaTexture meta{sampler, {}, depth_compare, aoffi, {}, {}, - {}, {}, component, element, {}}; - values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy)); + MetaTexture meta{*sampler, {}, depth_compare, aoffi, {}, {}, + {}, {}, component, element, {}}; + values[element] = Operation(OperationCode::TextureGather, meta, coords); } if (instr.tld4s.fp16_flag) { @@ -165,19 +165,20 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { "AOFFI is not implemented"); const bool is_array = instr.txd.is_array != 0; - u64 base_reg = instr.gpr8.Value(); const auto derivate_reg = instr.gpr20.Value(); const auto texture_type = instr.txd.texture_type.Value(); const auto coord_count = GetCoordCount(texture_type); - Node index_var{}; - const Sampler* sampler = - is_bindless ? GetBindlessSampler(base_reg, index_var, {{texture_type, is_array, false}}) - : GetSampler(instr.sampler, {{texture_type, is_array, false}}); + u64 base_reg = instr.gpr8.Value(); + Node index_var; + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + const std::optional<Sampler> sampler = is_bindless + ? GetBindlessSampler(base_reg, info, index_var) + : GetSampler(instr.sampler, info); Node4 values; - if (sampler == nullptr) { - for (u32 element = 0; element < values.size(); ++element) { - values[element] = Immediate(0); - } + if (!sampler) { + std::generate(values.begin(), values.end(), [this] { return Immediate(0); }); WriteTexInstructionFloat(bb, instr, values); break; } @@ -215,14 +216,12 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { is_bindless = true; [[fallthrough]]; case OpCode::Id::TXQ: { - // TODO: The new commits on the texture refactor, change the way samplers work. - // Sadly, not all texture instructions specify the type of texture their sampler - // uses. This must be fixed at a later instance. - Node index_var{}; - const Sampler* sampler = - is_bindless ? GetBindlessSampler(instr.gpr8, index_var) : GetSampler(instr.sampler); - - if (sampler == nullptr) { + Node index_var; + const std::optional<Sampler> sampler = is_bindless + ? 
GetBindlessSampler(instr.gpr8, {}, index_var) + : GetSampler(instr.sampler, {}); + + if (!sampler) { u32 indexer = 0; for (u32 element = 0; element < 4; ++element) { if (!instr.txq.IsComponentEnabled(element)) { @@ -268,13 +267,17 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), "NDV is not implemented"); - auto texture_type = instr.tmml.texture_type.Value(); + const auto texture_type = instr.tmml.texture_type.Value(); const bool is_array = instr.tmml.array != 0; - Node index_var{}; - const Sampler* sampler = - is_bindless ? GetBindlessSampler(instr.gpr20, index_var) : GetSampler(instr.sampler); - - if (sampler == nullptr) { + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + Node index_var; + const std::optional<Sampler> sampler = + is_bindless ? GetBindlessSampler(instr.gpr20, info, index_var) + : GetSampler(instr.sampler, info); + + if (!sampler) { u32 indexer = 0; for (u32 element = 0; element < 2; ++element) { if (!instr.tmml.IsComponentEnabled(element)) { @@ -289,34 +292,36 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { break; } - std::vector<Node> coords; - - // TODO: Add coordinates for different samplers once other texture types are implemented. - switch (texture_type) { - case TextureType::Texture1D: - coords.push_back(GetRegister(instr.gpr8)); - break; - case TextureType::Texture2D: - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); - break; - default: - UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<u32>(texture_type)); + const u64 base_index = is_array ? 1 : 0; + const u64 num_components = [texture_type] { + switch (texture_type) { + case TextureType::Texture1D: + return 1; + case TextureType::Texture2D: + return 2; + case TextureType::TextureCube: + return 3; + default: + UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<int>(texture_type)); + return 2; + } + }(); + // TODO: What's the array component used for? - // Fallback to interpreting as a 2D texture for now - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); - texture_type = TextureType::Texture2D; + std::vector<Node> coords; + coords.reserve(num_components); + for (u64 component = 0; component < num_components; ++component) { + coords.push_back(GetRegister(instr.gpr8.Value() + base_index + component)); } + u32 indexer = 0; for (u32 element = 0; element < 2; ++element) { if (!instr.tmml.IsComponentEnabled(element)) { continue; } - auto params = coords; MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var}; - const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); - SetTemporary(bb, indexer++, value); + Node value = Operation(OperationCode::TextureQueryLod, meta, coords); + SetTemporary(bb, indexer++, std::move(value)); } for (u32 i = 0; i < indexer; ++i) { SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i)); @@ -355,98 +360,122 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { return pc; } -ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sampler_info, u32 offset, - std::optional<u32> buffer) { - if (sampler_info) { - return *sampler_info; +ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( + SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) { + if (info.IsComplete()) { + return info; } - const auto sampler = buffer ? 
registry.ObtainBindlessSampler(*buffer, offset) - : registry.ObtainBoundSampler(offset); if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); - return SamplerInfo{TextureType::Texture2D, false, false, false}; - } - return SamplerInfo{sampler->texture_type, sampler->is_array != 0, sampler->is_shadow != 0, - sampler->is_buffer != 0}; + info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D); + info.is_array = info.is_array.value_or(false); + info.is_shadow = info.is_shadow.value_or(false); + info.is_buffer = info.is_buffer.value_or(false); + return info; + } + info.type = info.type.value_or(sampler->texture_type); + info.is_array = info.is_array.value_or(sampler->is_array != 0); + info.is_shadow = info.is_shadow.value_or(sampler->is_shadow != 0); + info.is_buffer = info.is_buffer.value_or(sampler->is_buffer != 0); + return info; } -const Sampler* ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, - std::optional<SamplerInfo> sampler_info) { - const auto offset = static_cast<u32>(sampler.index.Value()); - const auto info = GetSamplerInfo(sampler_info, offset); +std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, + SamplerInfo sampler_info) { + const u32 offset = static_cast<u32>(sampler.index.Value()); + const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset)); // If this sampler has already been used, return the existing mapping. - const auto it = - std::find_if(used_samplers.begin(), used_samplers.end(), - [offset](const Sampler& entry) { return entry.GetOffset() == offset; }); + const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), + [offset](const Sampler& entry) { return entry.offset == offset; }); if (it != used_samplers.end()) { - ASSERT(!it->IsBindless() && it->GetType() == info.type && it->IsArray() == info.is_array && - it->IsShadow() == info.is_shadow && it->IsBuffer() == info.is_buffer); - return &*it; + ASSERT(!it->is_bindless && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); + return *it; } // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); - return &used_samplers.emplace_back(next_index, offset, info.type, info.is_array, info.is_shadow, - info.is_buffer, false); + return used_samplers.emplace_back(next_index, offset, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer, false); } -const Sampler* ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, Node& index_var, - std::optional<SamplerInfo> sampler_info) { +std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, + Node& index_var) { const Node sampler_register = GetRegister(reg); const auto [base_node, tracked_sampler_info] = TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); - ASSERT(base_node != nullptr); - if (base_node == nullptr) { - return nullptr; + if (!base_node) { + UNREACHABLE(); + return std::nullopt; } - if (const auto bindless_sampler_info = - std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { - const u32 buffer = bindless_sampler_info->GetIndex(); - const u32 offset = bindless_sampler_info->GetOffset(); - const auto info = GetSamplerInfo(sampler_info, offset, buffer); + if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { + const u32 buffer = sampler_info->index; + const u32 offset = sampler_info->offset; + info = 
GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset)); // If this sampler has already been used, return the existing mapping. - const auto it = - std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer = buffer, offset = offset](const Sampler& entry) { - return entry.GetBuffer() == buffer && entry.GetOffset() == offset; - }); + const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), + [buffer, offset](const Sampler& entry) { + return entry.buffer == buffer && entry.offset == offset; + }); if (it != used_samplers.end()) { - ASSERT(it->IsBindless() && it->GetType() == info.type && - it->IsArray() == info.is_array && it->IsShadow() == info.is_shadow); - return &*it; + ASSERT(it->is_bindless && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow); + return *it; } // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); - return &used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array, - info.is_shadow, info.is_buffer, false); - } else if (const auto array_sampler_info = - std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { - const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; - index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); - const auto info = GetSamplerInfo(sampler_info, base_offset); + return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer, false); + } + if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) { + const std::pair indices = sampler_info->indices; + const std::pair offsets = sampler_info->offsets; + info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); + + // Try to use an already created sampler if it exists + const auto it = std::find_if( + used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { + return offsets == std::pair{entry.offset, entry.secondary_offset} && + indices == std::pair{entry.buffer, entry.secondary_buffer}; + }); + if (it != used_samplers.end()) { + ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); + return *it; + } + + // Otherwise create a new mapping for this sampler + const u32 next_index = static_cast<u32>(used_samplers.size()); + return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer); + } + if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { + const u32 base_offset = sampler_info->base_offset / 4; + index_var = GetCustomVariable(sampler_info->bindless_var); + info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset)); // If this sampler has already been used, return the existing mapping. 
const auto it = std::find_if( used_samplers.begin(), used_samplers.end(), - [base_offset](const Sampler& entry) { return entry.GetOffset() == base_offset; }); + [base_offset](const Sampler& entry) { return entry.offset == base_offset; }); if (it != used_samplers.end()) { - ASSERT(!it->IsBindless() && it->GetType() == info.type && - it->IsArray() == info.is_array && it->IsShadow() == info.is_shadow && - it->IsBuffer() == info.is_buffer && it->IsIndexed()); - return &*it; + ASSERT(!it->is_bindless && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer && + it->is_indexed); + return *it; } uses_indexed_samplers = true; // Otherwise create a new mapping for this sampler const auto next_index = static_cast<u32>(used_samplers.size()); - return &used_samplers.emplace_back(next_index, base_offset, info.type, info.is_array, - info.is_shadow, info.is_buffer, true); + return used_samplers.emplace_back(next_index, base_offset, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer, true); } - return nullptr; + return std::nullopt; } void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) { @@ -527,14 +556,19 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, const bool is_shadow = depth_compare != nullptr; const bool is_bindless = bindless_reg.has_value(); - UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow, "Illegal texture type"); - const SamplerInfo info{texture_type, is_array, is_shadow, false}; + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + info.is_shadow = is_shadow; + info.is_buffer = false; + Node index_var; - const Sampler* sampler = is_bindless ? GetBindlessSampler(*bindless_reg, index_var, info) - : GetSampler(instr.sampler, info); + const std::optional<Sampler> sampler = is_bindless + ? 
GetBindlessSampler(*bindless_reg, info, index_var) + : GetSampler(instr.sampler, info); if (!sampler) { return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)}; } @@ -593,8 +627,9 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type, ++parameter_register; } - const auto [coord_count, total_coord_count] = ValidateAndGetCoordinateElement( - texture_type, depth_compare, is_array, lod_bias_enabled, 4, 5); + const auto coord_counts = ValidateAndGetCoordinateElement(texture_type, depth_compare, is_array, + lod_bias_enabled, 4, 5); + const auto coord_count = std::get<0>(coord_counts); // If enabled arrays index is always stored in the gpr8 field const u64 array_register = instr.gpr8.Value(); // First coordinate index is the gpr8 or gpr8 + 1 when arrays are used @@ -632,8 +667,10 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, const bool lod_bias_enabled = (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ); - const auto [coord_count, total_coord_count] = ValidateAndGetCoordinateElement( - texture_type, depth_compare, is_array, lod_bias_enabled, 4, 4); + const auto coord_counts = ValidateAndGetCoordinateElement(texture_type, depth_compare, is_array, + lod_bias_enabled, 4, 4); + const auto coord_count = std::get<0>(coord_counts); + // If enabled arrays index is always stored in the gpr8 field const u64 array_register = instr.gpr8.Value(); // First coordinate index is stored in gpr8 field or (gpr8 + 1) when arrays are used @@ -682,12 +719,17 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de u64 parameter_register = instr.gpr20.Value(); - const SamplerInfo info{texture_type, is_array, depth_compare, false}; - Node index_var{}; - const Sampler* sampler = is_bindless ? GetBindlessSampler(parameter_register++, index_var, info) - : GetSampler(instr.sampler, info); + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + info.is_shadow = depth_compare; + + Node index_var; + const std::optional<Sampler> sampler = + is_bindless ? GetBindlessSampler(parameter_register++, info, index_var) + : GetSampler(instr.sampler, info); Node4 values; - if (sampler == nullptr) { + if (!sampler) { for (u32 element = 0; element < values.size(); ++element) { values[element] = Immediate(0); } @@ -723,7 +765,7 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { const auto texture_type{instr.tld.texture_type}; - const bool is_array{instr.tld.is_array}; + const bool is_array{instr.tld.is_array != 0}; const bool lod_enabled{instr.tld.GetTextureProcessMode() == TextureProcessMode::LL}; const std::size_t coord_count{GetCoordCount(texture_type)}; @@ -742,12 +784,12 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { // const Node aoffi_register{is_aoffi ? GetRegister(gpr20_cursor++) : nullptr}; // const Node multisample{is_multisample ? 
GetRegister(gpr20_cursor++) : nullptr}; - const auto& sampler = *GetSampler(instr.sampler); + const std::optional<Sampler> sampler = GetSampler(instr.sampler, {}); Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element, {}}; + MetaTexture meta{*sampler, array_register, {}, {}, {}, {}, {}, lod, {}, element, {}}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } @@ -755,7 +797,11 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { } Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is_array) { - const Sampler& sampler = *GetSampler(instr.sampler); + SamplerInfo info; + info.type = texture_type; + info.is_array = is_array; + info.is_shadow = false; + const std::optional<Sampler> sampler = GetSampler(instr.sampler, info); const std::size_t type_coord_count = GetCoordCount(texture_type); const bool lod_enabled = instr.tlds.GetTextureProcessMode() == TextureProcessMode::LL; @@ -783,7 +829,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is Node4 values; for (u32 element = 0; element < values.size(); ++element) { auto coords_copy = coords; - MetaTexture meta{sampler, array, {}, {}, {}, {}, {}, lod, {}, element, {}}; + MetaTexture meta{*sampler, array, {}, {}, {}, {}, {}, lod, {}, element, {}}; values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy)); } return values; diff --git a/src/video_core/shader/decode/video.cpp b/src/video_core/shader/decode/video.cpp index 64ba60ea2..1c0957277 100644 --- a/src/video_core/shader/decode/video.cpp +++ b/src/video_core/shader/decode/video.cpp @@ -91,29 +91,28 @@ u32 ShaderIR::DecodeVideo(NodeBlock& bb, u32 pc) { return pc; } -Node ShaderIR::GetVideoOperand(Node op, bool is_chunk, bool is_signed, - Tegra::Shader::VideoType type, u64 byte_height) { +Node ShaderIR::GetVideoOperand(Node op, bool is_chunk, bool is_signed, VideoType type, + u64 byte_height) { if (!is_chunk) { return BitfieldExtract(op, static_cast<u32>(byte_height * 8), 8); } - const Node zero = Immediate(0); switch (type) { - case Tegra::Shader::VideoType::Size16_Low: + case VideoType::Size16_Low: return BitfieldExtract(op, 0, 16); - case Tegra::Shader::VideoType::Size16_High: + case VideoType::Size16_High: return BitfieldExtract(op, 16, 16); - case Tegra::Shader::VideoType::Size32: + case VideoType::Size32: // TODO(Rodrigo): From my hardware tests it becomes a bit "mad" when this type is used // (1 * 1 + 0 == 0x5b800000). Until a better explanation is found: abort. 
UNIMPLEMENTED(); - return zero; - case Tegra::Shader::VideoType::Invalid: + return Immediate(0); + case VideoType::Invalid: UNREACHABLE_MSG("Invalid instruction encoding"); - return zero; + return Immediate(0); default: UNREACHABLE(); - return zero; + return Immediate(0); } } diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp index 6191ffba1..233b8fa42 100644 --- a/src/video_core/shader/decode/xmad.cpp +++ b/src/video_core/shader/decode/xmad.cpp @@ -81,35 +81,36 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { SetTemporary(bb, 0, product); product = GetTemporary(0); - const Node original_c = op_c; + Node original_c = op_c; const Tegra::Shader::XmadMode set_mode = mode; // Workaround to clang compile error - op_c = [&]() { + op_c = [&] { switch (set_mode) { case Tegra::Shader::XmadMode::None: return original_c; case Tegra::Shader::XmadMode::CLo: - return BitfieldExtract(original_c, 0, 16); + return BitfieldExtract(std::move(original_c), 0, 16); case Tegra::Shader::XmadMode::CHi: - return BitfieldExtract(original_c, 16, 16); + return BitfieldExtract(std::move(original_c), 16, 16); case Tegra::Shader::XmadMode::CBcc: { - const Node shifted_b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, - original_b, Immediate(16)); - return SignedOperation(OperationCode::IAdd, is_signed_c, original_c, shifted_b); + Node shifted_b = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed_b, + original_b, Immediate(16)); + return SignedOperation(OperationCode::IAdd, is_signed_c, std::move(original_c), + std::move(shifted_b)); } case Tegra::Shader::XmadMode::CSfu: { - const Node comp_a = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_a, - op_a, Immediate(0)); - const Node comp_b = GetPredicateComparisonInteger(PredCondition::Equal, is_signed_b, - op_b, Immediate(0)); + const Node comp_a = + GetPredicateComparisonInteger(PredCondition::EQ, is_signed_a, op_a, Immediate(0)); + const Node comp_b = + GetPredicateComparisonInteger(PredCondition::EQ, is_signed_b, op_b, Immediate(0)); const Node comp = Operation(OperationCode::LogicalOr, comp_a, comp_b); const Node comp_minus_a = GetPredicateComparisonInteger( - PredCondition::NotEqual, is_signed_a, + PredCondition::NE, is_signed_a, SignedOperation(OperationCode::IBitwiseAnd, is_signed_a, op_a, Immediate(0x80000000)), Immediate(0)); const Node comp_minus_b = GetPredicateComparisonInteger( - PredCondition::NotEqual, is_signed_b, + PredCondition::NE, is_signed_b, SignedOperation(OperationCode::IBitwiseAnd, is_signed_b, op_b, Immediate(0x80000000)), Immediate(0)); diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp new file mode 100644 index 000000000..e18ccba8e --- /dev/null +++ b/src/video_core/shader/memory_util.cpp @@ -0,0 +1,76 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <algorithm> +#include <cstddef> + +#include <boost/container_hash/hash.hpp> + +#include "common/common_types.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/shader/memory_util.h" +#include "video_core/shader/shader_ir.h" + +namespace VideoCommon::Shader { + +GPUVAddr GetShaderAddress(Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::Maxwell3D::Regs::ShaderProgram program) { + const auto& shader_config{maxwell3d.regs.shader_config[static_cast<std::size_t>(program)]}; + return maxwell3d.regs.code_address.CodeAddress() + shader_config.offset; +} + +bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) { + // Sched instructions appear once every 4 instructions. + constexpr std::size_t SchedPeriod = 4; + const std::size_t absolute_offset = offset - main_offset; + return (absolute_offset % SchedPeriod) == 0; +} + +std::size_t CalculateProgramSize(const ProgramCode& program, bool is_compute) { + // This is the encoded version of BRA that jumps to itself. All Nvidia + // shaders end with one. + static constexpr u64 SELF_JUMPING_BRANCH = 0xE2400FFFFF07000FULL; + static constexpr u64 MASK = 0xFFFFFFFFFF7FFFFFULL; + + const std::size_t start_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET; + std::size_t offset = start_offset; + while (offset < program.size()) { + const u64 instruction = program[offset]; + if (!IsSchedInstruction(offset, start_offset)) { + if ((instruction & MASK) == SELF_JUMPING_BRANCH) { + // End on Maxwell's "nop" instruction + break; + } + if (instruction == 0) { + break; + } + } + ++offset; + } + // The last instruction is included in the program size + return std::min(offset + 1, program.size()); +} + +ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_addr, + const u8* host_ptr, bool is_compute) { + ProgramCode code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); + ASSERT_OR_EXECUTE(host_ptr != nullptr, { return code; }); + memory_manager.ReadBlockUnsafe(gpu_addr, code.data(), code.size() * sizeof(u64)); + code.resize(CalculateProgramSize(code, is_compute)); + return code; +} + +u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code, + const ProgramCode& code_b) { + size_t unique_identifier = boost::hash_value(code); + if (is_a) { + // VertexA programs include two programs + boost::hash_combine(unique_identifier, boost::hash_value(code_b)); + } + return static_cast<u64>(unique_identifier); +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/memory_util.h b/src/video_core/shader/memory_util.h new file mode 100644 index 000000000..4624d38e6 --- /dev/null +++ b/src/video_core/shader/memory_util.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <cstddef> +#include <vector> + +#include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/shader_type.h" + +namespace Tegra { +class MemoryManager; +} + +namespace VideoCommon::Shader { + +using ProgramCode = std::vector<u64>; + +constexpr u32 STAGE_MAIN_OFFSET = 10; +constexpr u32 KERNEL_MAIN_OFFSET = 0; + +/// Gets the address for the specified shader stage program +GPUVAddr GetShaderAddress(Tegra::Engines::Maxwell3D& maxwell3d, + Tegra::Engines::Maxwell3D::Regs::ShaderProgram program); + +/// Gets if the current instruction offset is a scheduler instruction +bool IsSchedInstruction(std::size_t offset, std::size_t main_offset); + +/// Calculates the size of a program stream +std::size_t CalculateProgramSize(const ProgramCode& program, bool is_compute); + +/// Gets the shader program code from memory for the specified address +ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_addr, + const u8* host_ptr, bool is_compute); + +/// Hashes one (or two) program streams +u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code, + const ProgramCode& code_b = {}); + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 3eee961f5..8f230d57a 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -110,13 +110,20 @@ enum class OperationCode { LogicalPick2, /// (bool2 pair, uint index) -> bool LogicalAnd2, /// (bool2 a) -> bool - LogicalFLessThan, /// (float a, float b) -> bool - LogicalFEqual, /// (float a, float b) -> bool - LogicalFLessEqual, /// (float a, float b) -> bool - LogicalFGreaterThan, /// (float a, float b) -> bool - LogicalFNotEqual, /// (float a, float b) -> bool - LogicalFGreaterEqual, /// (float a, float b) -> bool - LogicalFIsNan, /// (float a) -> bool + LogicalFOrdLessThan, /// (float a, float b) -> bool + LogicalFOrdEqual, /// (float a, float b) -> bool + LogicalFOrdLessEqual, /// (float a, float b) -> bool + LogicalFOrdGreaterThan, /// (float a, float b) -> bool + LogicalFOrdNotEqual, /// (float a, float b) -> bool + LogicalFOrdGreaterEqual, /// (float a, float b) -> bool + LogicalFOrdered, /// (float a, float b) -> bool + LogicalFUnordered, /// (float a, float b) -> bool + LogicalFUnordLessThan, /// (float a, float b) -> bool + LogicalFUnordEqual, /// (float a, float b) -> bool + LogicalFUnordLessEqual, /// (float a, float b) -> bool + LogicalFUnordGreaterThan, /// (float a, float b) -> bool + LogicalFUnordNotEqual, /// (float a, float b) -> bool + LogicalFUnordGreaterEqual, /// (float a, float b) -> bool LogicalILessThan, /// (int a, int b) -> bool LogicalIEqual, /// (int a, int b) -> bool @@ -132,6 +139,8 @@ enum class OperationCode { LogicalUNotEqual, /// (uint a, uint b) -> bool LogicalUGreaterEqual, /// (uint a, uint b) -> bool + LogicalAddCarry, /// (uint a, uint b) -> bool + Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 @@ -217,9 +226,16 @@ enum class OperationCode { VoteEqual, /// (bool) -> bool ThreadId, /// () -> uint + ThreadEqMask, /// () -> uint + ThreadGeMask, /// () -> uint + ThreadGtMask, /// () -> uint + ThreadLeMask, /// () -> uint + ThreadLtMask, /// () -> uint ShuffleIndexed, /// (uint value, uint index) -> uint - MemoryBarrierGL, /// () -> void + Barrier, /// () 
-> void + MemoryBarrierGroup, /// () -> void + MemoryBarrierGlobal, /// () -> void Amount, }; @@ -259,133 +275,76 @@ using Node = std::shared_ptr<NodeData>; using Node4 = std::array<Node, 4>; using NodeBlock = std::vector<Node>; -class BindlessSamplerNode; -class ArraySamplerNode; +struct ArraySamplerNode; +struct BindlessSamplerNode; +struct SeparateSamplerNode; -using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>; +using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>; using TrackSampler = std::shared_ptr<TrackSamplerData>; -class Sampler { -public: - /// This constructor is for bound samplers +struct Sampler { + /// Bound samplers constructor constexpr explicit Sampler(u32 index, u32 offset, Tegra::Shader::TextureType type, bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_indexed{is_indexed} {} - /// This constructor is for bindless samplers + /// Separate sampler constructor + constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, + Tegra::Shader::TextureType type, bool is_array, bool is_shadow, + bool is_buffer) + : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, + buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, + is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} + + /// Bindless samplers constructor constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } - - constexpr u32 GetBuffer() const { - return buffer; - } - - constexpr Tegra::Shader::TextureType GetType() const { - return type; - } - - constexpr bool IsArray() const { - return is_array; - } - - constexpr bool IsShadow() const { - return is_shadow; - } - - constexpr bool IsBuffer() const { - return is_buffer; - } - - constexpr bool IsBindless() const { - return is_bindless; - } - - constexpr bool IsIndexed() const { - return is_indexed; - } - - constexpr u32 Size() const { - return size; - } - - constexpr void SetSize(u32 new_size) { - size = new_size; - } - -private: - u32 index{}; ///< Emulated index given for the this sampler. - u32 offset{}; ///< Offset in the const buffer from where the sampler is being read. - u32 buffer{}; ///< Buffer where the bindless sampler is being read (unused on bound samplers). - u32 size{1}; ///< Size of the sampler. + u32 index = 0; ///< Emulated index given for the this sampler. + u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. + u32 secondary_offset = 0; ///< Secondary offset in the const buffer. + u32 buffer = 0; ///< Buffer where the bindless sampler is read. + u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read. + u32 size = 1; ///< Size of the sampler. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) - bool is_array{}; ///< Whether the texture is being sampled as an array texture or not. - bool is_shadow{}; ///< Whether the texture is being sampled as a depth texture or not. 
- bool is_buffer{}; ///< Whether the texture is a texture buffer without sampler. - bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not. - bool is_indexed{}; ///< Whether this sampler is an indexed array of textures. + bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. + bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. + bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. + bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. + bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_separated = false; ///< Whether the image and sampler is separated or not. }; /// Represents a tracked bindless sampler into a direct const buffer -class ArraySamplerNode final { -public: - explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) - : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetBaseOffset() const { - return base_offset; - } - - constexpr u32 GetIndexVar() const { - return bindless_var; - } - -private: +struct ArraySamplerNode { u32 index; u32 base_offset; u32 bindless_var; }; -/// Represents a tracked bindless sampler into a direct const buffer -class BindlessSamplerNode final { -public: - explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } +/// Represents a tracked separate sampler image pair that was folded statically +struct SeparateSamplerNode { + std::pair<u32, u32> indices; + std::pair<u32, u32> offsets; +}; -private: +/// Represents a tracked bindless sampler into a direct const buffer +struct BindlessSamplerNode { u32 index; u32 offset; }; -class Image final { +struct Image { public: - /// This constructor is for bound images + /// Bound images constructor constexpr explicit Image(u32 index, u32 offset, Tegra::Shader::ImageType type) : index{index}, offset{offset}, type{type} {} - /// This constructor is for bindless samplers + /// Bindless samplers constructor constexpr explicit Image(u32 index, u32 offset, u32 buffer, Tegra::Shader::ImageType type) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_bindless{true} {} @@ -403,53 +362,20 @@ public: is_atomic = true; } - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } - - constexpr u32 GetBuffer() const { - return buffer; - } - - constexpr Tegra::Shader::ImageType GetType() const { - return type; - } - - constexpr bool IsBindless() const { - return is_bindless; - } - - constexpr bool IsWritten() const { - return is_written; - } - - constexpr bool IsRead() const { - return is_read; - } - - constexpr bool IsAtomic() const { - return is_atomic; - } - -private: - u32 index{}; - u32 offset{}; - u32 buffer{}; + u32 index = 0; + u32 offset = 0; + u32 buffer = 0; Tegra::Shader::ImageType type{}; - bool is_bindless{}; - bool is_written{}; - bool is_read{}; - bool is_atomic{}; + bool is_bindless = false; + bool is_written = false; + bool is_read = false; + bool is_atomic = false; }; struct GlobalMemoryBase { - u32 cbuf_index{}; - u32 cbuf_offset{}; + u32 cbuf_index = 0; + u32 cbuf_offset = 0; bool operator<(const GlobalMemoryBase& rhs) const { return std::tie(cbuf_index, cbuf_offset) < 
std::tie(rhs.cbuf_index, rhs.cbuf_offset); @@ -463,7 +389,7 @@ struct MetaArithmetic { /// Parameters describing a texture sampler struct MetaTexture { - const Sampler& sampler; + Sampler sampler; Node array; Node depth_compare; std::vector<Node> aoffi; diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 11231bbea..1e0886185 100644 --- a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) { template <typename T, typename... Args> TrackSampler MakeTrackSampler(Args&&... args) { static_assert(std::is_convertible_v<T, TrackSamplerData>); - return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...)); + return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...}); } template <typename... Args> diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index af70b3f35..148d91fcb 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -24,44 +24,45 @@ GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterfac if (shader_stage == ShaderType::Compute) { return {}; } - auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine); - - GraphicsInfo info; - info.tfb_layouts = graphics.regs.tfb_layouts; - info.tfb_varying_locs = graphics.regs.tfb_varying_locs; - info.primitive_topology = graphics.regs.draw.topology; - info.tessellation_primitive = graphics.regs.tess_mode.prim; - info.tessellation_spacing = graphics.regs.tess_mode.spacing; - info.tfb_enabled = graphics.regs.tfb_enabled; - info.tessellation_clockwise = graphics.regs.tess_mode.cw; - return info; + + auto& graphics = dynamic_cast<Tegra::Engines::Maxwell3D&>(engine); + + return { + .tfb_layouts = graphics.regs.tfb_layouts, + .tfb_varying_locs = graphics.regs.tfb_varying_locs, + .primitive_topology = graphics.regs.draw.topology, + .tessellation_primitive = graphics.regs.tess_mode.prim, + .tessellation_spacing = graphics.regs.tess_mode.spacing, + .tfb_enabled = graphics.regs.tfb_enabled != 0, + .tessellation_clockwise = graphics.regs.tess_mode.cw.Value() != 0, + }; } ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) { if (shader_stage != ShaderType::Compute) { return {}; } - auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine); + + auto& compute = dynamic_cast<Tegra::Engines::KeplerCompute&>(engine); const auto& launch = compute.launch_description; - ComputeInfo info; - info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}; - info.local_memory_size_in_words = launch.local_pos_alloc; - info.shared_memory_size_in_words = launch.shared_alloc; - return info; + return { + .workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}, + .shared_memory_size_in_words = launch.shared_alloc, + .local_memory_size_in_words = launch.local_pos_alloc, + }; } } // Anonymous namespace -Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info) +Registry::Registry(ShaderType shader_stage, const SerializedRegistryInfo& info) : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile}, bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {} -Registry::Registry(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine) - : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()}, - 
graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo( - shader_stage, engine)} {} +Registry::Registry(ShaderType shader_stage, ConstBufferEngineInterface& engine_) + : stage{shader_stage}, engine{&engine_}, bound_buffer{engine_.GetBoundBuffer()}, + graphics_info{MakeGraphicsInfo(shader_stage, engine_)}, compute_info{MakeComputeInfo( + shader_stage, engine_)} {} Registry::~Registry() = default; @@ -93,8 +94,27 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) { return value; } -std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, - u32 offset) { +std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler( + std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) { + SeparateSamplerKey key; + key.buffers = buffers; + key.offsets = offsets; + const auto iter = separate_samplers.find(key); + if (iter != separate_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + + const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first); + const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second); + const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2); + separate_samplers.emplace(key, value); + return value; +} + +std::optional<SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, u32 offset) { const std::pair key = {buffer, offset}; const auto iter = bindless_samplers.find(key); if (iter != bindless_samplers.end()) { diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 0c80d35fd..4bebefdde 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h @@ -19,8 +19,39 @@ namespace VideoCommon::Shader { +struct SeparateSamplerKey { + std::pair<u32, u32> buffers; + std::pair<u32, u32> offsets; +}; + +} // namespace VideoCommon::Shader + +namespace std { + +template <> +struct hash<VideoCommon::Shader::SeparateSamplerKey> { + std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept { + return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^ + key.offsets.second); + } +}; + +template <> +struct equal_to<VideoCommon::Shader::SeparateSamplerKey> { + bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs, + const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept { + return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets; + } +}; + +} // namespace std + +namespace VideoCommon::Shader { + using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; +using SeparateSamplerMap = + std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>; using BindlessSamplerMap = std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; @@ -63,7 +94,7 @@ public: explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info); explicit Registry(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine); + Tegra::Engines::ConstBufferEngineInterface& engine_); ~Registry(); @@ -73,6 +104,9 @@ public: std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); + std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler( + std::pair<u32, u32> buffers, std::pair<u32, u32> offsets); + 
std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); /// Inserts a key. @@ -128,6 +162,7 @@ private: Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; KeyMap keys; BoundSamplerMap bound_samplers; + SeparateSamplerMap separate_samplers; BindlessSamplerMap bindless_samplers; u32 bound_buffer; GraphicsInfo graphics_info; diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 8852c8a1b..29d794b34 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -10,6 +10,7 @@ #include "common/common_types.h" #include "common/logging/log.h" #include "video_core/engines/shader_bytecode.h" +#include "video_core/shader/node.h" #include "video_core/shader/node_helper.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" @@ -56,8 +57,7 @@ Node ShaderIR::GetConstBuffer(u64 index_, u64 offset_) { const auto index = static_cast<u32>(index_); const auto offset = static_cast<u32>(offset_); - const auto [entry, is_new] = used_cbufs.try_emplace(index); - entry->second.MarkAsUsed(offset); + used_cbufs.try_emplace(index).first->second.MarkAsUsed(offset); return MakeNode<CbufNode>(index, Immediate(offset)); } @@ -66,8 +66,7 @@ Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) { const auto index = static_cast<u32>(index_); const auto offset = static_cast<u32>(offset_); - const auto [entry, is_new] = used_cbufs.try_emplace(index); - entry->second.MarkAsUsedIndirect(); + used_cbufs.try_emplace(index).first->second.MarkAsUsedIndirect(); Node final_offset = [&] { // Attempt to inline constant buffer without a variable offset. This is done to allow @@ -113,9 +112,9 @@ Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buff } Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) const { - const Node node = MakeNode<InternalFlagNode>(flag); + Node node = MakeNode<InternalFlagNode>(flag); if (negated) { - return Operation(OperationCode::LogicalNegate, node); + return Operation(OperationCode::LogicalNegate, std::move(node)); } return node; } @@ -166,6 +165,7 @@ Node ShaderIR::ConvertIntegerSize(Node value, Register::Size size, bool is_signe std::move(value), Immediate(16)); value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, std::move(value), Immediate(16)); + return value; case Register::Size::Word: // Default - do nothing return value; @@ -244,56 +244,44 @@ Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) { } Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) { + if (condition == PredCondition::T) { + return GetPredicate(true); + } else if (condition == PredCondition::F) { + return GetPredicate(false); + } + static constexpr std::array comparison_table{ - std::pair{PredCondition::LessThan, OperationCode::LogicalFLessThan}, - std::pair{PredCondition::Equal, OperationCode::LogicalFEqual}, - std::pair{PredCondition::LessEqual, OperationCode::LogicalFLessEqual}, - std::pair{PredCondition::GreaterThan, OperationCode::LogicalFGreaterThan}, - std::pair{PredCondition::NotEqual, OperationCode::LogicalFNotEqual}, - std::pair{PredCondition::GreaterEqual, OperationCode::LogicalFGreaterEqual}, - std::pair{PredCondition::LessThanWithNan, OperationCode::LogicalFLessThan}, - std::pair{PredCondition::NotEqualWithNan, OperationCode::LogicalFNotEqual}, - std::pair{PredCondition::LessEqualWithNan, OperationCode::LogicalFLessEqual}, - 
std::pair{PredCondition::GreaterThanWithNan, OperationCode::LogicalFGreaterThan}, - std::pair{PredCondition::GreaterEqualWithNan, OperationCode::LogicalFGreaterEqual}, + OperationCode(0), + OperationCode::LogicalFOrdLessThan, // LT + OperationCode::LogicalFOrdEqual, // EQ + OperationCode::LogicalFOrdLessEqual, // LE + OperationCode::LogicalFOrdGreaterThan, // GT + OperationCode::LogicalFOrdNotEqual, // NE + OperationCode::LogicalFOrdGreaterEqual, // GE + OperationCode::LogicalFOrdered, // NUM + OperationCode::LogicalFUnordered, // NAN + OperationCode::LogicalFUnordLessThan, // LTU + OperationCode::LogicalFUnordEqual, // EQU + OperationCode::LogicalFUnordLessEqual, // LEU + OperationCode::LogicalFUnordGreaterThan, // GTU + OperationCode::LogicalFUnordNotEqual, // NEU + OperationCode::LogicalFUnordGreaterEqual, // GEU }; + const std::size_t index = static_cast<std::size_t>(condition); + ASSERT_MSG(index < std::size(comparison_table), "Invalid condition={}", index); - const auto comparison = - std::find_if(comparison_table.cbegin(), comparison_table.cend(), - [condition](const auto entry) { return condition == entry.first; }); - UNIMPLEMENTED_IF_MSG(comparison == comparison_table.cend(), - "Unknown predicate comparison operation"); - - Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b); - - if (condition == PredCondition::LessThanWithNan || - condition == PredCondition::NotEqualWithNan || - condition == PredCondition::LessEqualWithNan || - condition == PredCondition::GreaterThanWithNan || - condition == PredCondition::GreaterEqualWithNan) { - predicate = Operation(OperationCode::LogicalOr, predicate, - Operation(OperationCode::LogicalFIsNan, op_a)); - predicate = Operation(OperationCode::LogicalOr, predicate, - Operation(OperationCode::LogicalFIsNan, op_b)); - } - - return predicate; + return Operation(comparison_table[index], op_a, op_b); } Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a, Node op_b) { static constexpr std::array comparison_table{ - std::pair{PredCondition::LessThan, OperationCode::LogicalILessThan}, - std::pair{PredCondition::Equal, OperationCode::LogicalIEqual}, - std::pair{PredCondition::LessEqual, OperationCode::LogicalILessEqual}, - std::pair{PredCondition::GreaterThan, OperationCode::LogicalIGreaterThan}, - std::pair{PredCondition::NotEqual, OperationCode::LogicalINotEqual}, - std::pair{PredCondition::GreaterEqual, OperationCode::LogicalIGreaterEqual}, - std::pair{PredCondition::LessThanWithNan, OperationCode::LogicalILessThan}, - std::pair{PredCondition::NotEqualWithNan, OperationCode::LogicalINotEqual}, - std::pair{PredCondition::LessEqualWithNan, OperationCode::LogicalILessEqual}, - std::pair{PredCondition::GreaterThanWithNan, OperationCode::LogicalIGreaterThan}, - std::pair{PredCondition::GreaterEqualWithNan, OperationCode::LogicalIGreaterEqual}, + std::pair{PredCondition::LT, OperationCode::LogicalILessThan}, + std::pair{PredCondition::EQ, OperationCode::LogicalIEqual}, + std::pair{PredCondition::LE, OperationCode::LogicalILessEqual}, + std::pair{PredCondition::GT, OperationCode::LogicalIGreaterThan}, + std::pair{PredCondition::NE, OperationCode::LogicalINotEqual}, + std::pair{PredCondition::GE, OperationCode::LogicalIGreaterEqual}, }; const auto comparison = @@ -302,32 +290,24 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si UNIMPLEMENTED_IF_MSG(comparison == comparison_table.cend(), "Unknown predicate comparison operation"); - Node predicate = 
SignedOperation(comparison->second, is_signed, NO_PRECISE, std::move(op_a), - std::move(op_b)); - - UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan || - condition == PredCondition::NotEqualWithNan || - condition == PredCondition::LessEqualWithNan || - condition == PredCondition::GreaterThanWithNan || - condition == PredCondition::GreaterEqualWithNan, - "NaN comparisons for integers are not implemented"); - return predicate; + return SignedOperation(comparison->second, is_signed, NO_PRECISE, std::move(op_a), + std::move(op_b)); } Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, Node op_b) { static constexpr std::array comparison_table{ - std::pair{PredCondition::LessThan, OperationCode::Logical2HLessThan}, - std::pair{PredCondition::Equal, OperationCode::Logical2HEqual}, - std::pair{PredCondition::LessEqual, OperationCode::Logical2HLessEqual}, - std::pair{PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan}, - std::pair{PredCondition::NotEqual, OperationCode::Logical2HNotEqual}, - std::pair{PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual}, - std::pair{PredCondition::LessThanWithNan, OperationCode::Logical2HLessThanWithNan}, - std::pair{PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqualWithNan}, - std::pair{PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqualWithNan}, - std::pair{PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThanWithNan}, - std::pair{PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqualWithNan}, + std::pair{PredCondition::LT, OperationCode::Logical2HLessThan}, + std::pair{PredCondition::EQ, OperationCode::Logical2HEqual}, + std::pair{PredCondition::LE, OperationCode::Logical2HLessEqual}, + std::pair{PredCondition::GT, OperationCode::Logical2HGreaterThan}, + std::pair{PredCondition::NE, OperationCode::Logical2HNotEqual}, + std::pair{PredCondition::GE, OperationCode::Logical2HGreaterEqual}, + std::pair{PredCondition::LTU, OperationCode::Logical2HLessThanWithNan}, + std::pair{PredCondition::LEU, OperationCode::Logical2HLessEqualWithNan}, + std::pair{PredCondition::GTU, OperationCode::Logical2HGreaterThanWithNan}, + std::pair{PredCondition::NEU, OperationCode::Logical2HNotEqualWithNan}, + std::pair{PredCondition::GEU, OperationCode::Logical2HGreaterEqualWithNan}, }; const auto comparison = @@ -398,7 +378,7 @@ void ShaderIR::SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc if (!sets_cc) { return; } - Node zerop = Operation(OperationCode::LogicalFEqual, std::move(value), Immediate(0.0f)); + Node zerop = Operation(OperationCode::LogicalFOrdEqual, std::move(value), Immediate(0.0f)); SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop)); LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete"); } diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index c6e7bdf50..3a98b2104 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -18,6 +18,7 @@ #include "video_core/engines/shader_header.h" #include "video_core/shader/ast.h" #include "video_core/shader/compiler_settings.h" +#include "video_core/shader/memory_util.h" #include "video_core/shader/node.h" #include "video_core/shader/registry.h" @@ -25,16 +26,13 @@ namespace VideoCommon::Shader { struct ShaderBlock; -using ProgramCode = std::vector<u64>; - constexpr u32 MAX_PROGRAM_LENGTH = 0x1000; -class ConstBuffer { -public: - explicit ConstBuffer(u32 max_offset, bool is_indirect) 
+struct ConstBuffer { + constexpr explicit ConstBuffer(u32 max_offset, bool is_indirect) : max_offset{max_offset}, is_indirect{is_indirect} {} - ConstBuffer() = default; + constexpr ConstBuffer() = default; void MarkAsUsed(u64 offset) { max_offset = std::max(max_offset, static_cast<u32>(offset)); @@ -57,8 +55,8 @@ public: } private: - u32 max_offset{}; - bool is_indirect{}; + u32 max_offset = 0; + bool is_indirect = false; }; struct GlobalMemoryUsage { @@ -192,10 +190,14 @@ private: friend class ASTDecoder; struct SamplerInfo { - Tegra::Shader::TextureType type; - bool is_array; - bool is_shadow; - bool is_buffer; + std::optional<Tegra::Shader::TextureType> type; + std::optional<bool> is_array; + std::optional<bool> is_shadow; + std::optional<bool> is_buffer; + + constexpr bool IsComplete() const noexcept { + return type && is_array && is_shadow && is_buffer; + } }; void Decode(); @@ -328,16 +330,15 @@ private: OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); /// Queries the missing sampler info from the execution context. - SamplerInfo GetSamplerInfo(std::optional<SamplerInfo> sampler_info, u32 offset, - std::optional<u32> buffer = std::nullopt); + SamplerInfo GetSamplerInfo(SamplerInfo info, + std::optional<Tegra::Engines::SamplerDescriptor> sampler); - /// Accesses a texture sampler - const Sampler* GetSampler(const Tegra::Shader::Sampler& sampler, - std::optional<SamplerInfo> sampler_info = std::nullopt); + /// Accesses a texture sampler. + std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); /// Accesses a texture sampler for a bindless texture. - const Sampler* GetBindlessSampler(Tegra::Shader::Register reg, Node& index_var, - std::optional<SamplerInfo> sampler_info = std::nullopt); + std::optional<Sampler> GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, + Node& index_var); /// Accesses an image. 
Image& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type); @@ -408,8 +409,14 @@ private: std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; - std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor); + std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor); + + std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf, + const OperationNode& operation, + Node gpr, Node base_offset, + Node tracked, const NodeBlock& code, + s64 cursor); std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index 10739b37d..6be3ea92b 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -14,6 +14,7 @@ namespace VideoCommon::Shader { namespace { + std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, OperationCode operation_code) { for (; cursor >= 0; --cursor) { @@ -27,8 +28,9 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { const auto& conditional_code = conditional->GetCode(); - auto [found, internal_cursor] = FindOperation( + auto result = FindOperation( conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code); + auto& found = result.first; if (found) { return {std::move(found), cursor}; } @@ -62,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { if (const auto operation = std::get_if<OperationNode>(&*node)) { operation->SetAmendIndex(amend_index); return true; - } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + } + if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { conditional->SetAmendIndex(amend_index); return true; } @@ -71,39 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { } // Anonymous namespace -std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor) { +std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor) { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { + const u32 cbuf_index = cbuf->GetIndex(); + // Constant buffer found, test if it's an immediate - const auto offset = cbuf->GetOffset(); + const auto& offset = cbuf->GetOffset(); if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { - auto track = - MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); + auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue()); return {tracked, track}; - } else if (const auto operation = std::get_if<OperationNode>(&*offset)) { + } + if (const auto operation = std::get_if<OperationNode>(&*offset)) { const u32 bound_buffer = registry.GetBoundBuffer(); - if (bound_buffer != cbuf->GetIndex()) { + if (bound_buffer != cbuf_index) { return {}; } - const auto pair = DecoupleIndirectRead(*operation); - if (!pair) { - return {}; + if (const std::optional pair = DecoupleIndirectRead(*operation)) { + auto [gpr, base_offset] = *pair; + return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked, + code, cursor); } - auto [gpr, base_offset] = *pair; - const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); - const auto& gpu_driver = registry.AccessGuestDriverProfile(); - const u32 
bindless_cv = NewCustomVariable(); - const Node op = - Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize())); - - const Node cv_node = GetCustomVariable(bindless_cv); - Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); - const std::size_t amend_index = DeclareAmend(amend_op); - AmendNodeCv(amend_index, code[cursor]); - // TODO Implement Bindless Index custom variable - auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(), - offset_inm->GetValue(), bindless_cv); - return {tracked, track}; } return {}; } @@ -120,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons return TrackBindlessSampler(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { - for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { - if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); - std::get<0>(found)) { - // Cbuf found in operand. + const OperationNode& op = *operation; + + const OperationCode opcode = operation->GetCode(); + if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) { + ASSERT(op.GetOperandsCount() == 2); + auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor); + auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor); + if (node_a && node_b) { + auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b}, + std::pair{offset_a, offset_b}); + return {tracked, std::move(track)}; + } + } + std::size_t i = op.GetOperandsCount(); + while (i--) { + if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) { + // Constant buffer found in operand. return found; } } @@ -137,11 +141,31 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons return {}; } +std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead( + const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked, + const NodeBlock& code, s64 cursor) { + const auto offset_imm = std::get<ImmediateNode>(*base_offset); + const auto& gpu_driver = registry.AccessGuestDriverProfile(); + const u32 bindless_cv = NewCustomVariable(); + const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize(); + Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size)); + + Node cv_node = GetCustomVariable(bindless_cv); + Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op)); + const std::size_t amend_index = DeclareAmend(std::move(amend_op)); + AmendNodeCv(amend_index, code[cursor]); + + // TODO: Implement bindless index custom variable + auto track = + MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv); + return {tracked, track}; +} + std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { // Constant buffer found, test if it's an immediate - const auto offset = cbuf->GetOffset(); + const auto& offset = cbuf->GetOffset(); if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { return {tracked, cbuf->GetIndex(), immediate->GetValue()}; } @@ -151,21 +175,13 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) { return {}; } - s64 current_cursor = cursor; - while (current_cursor > 0) { - // Reduce the cursor in one to avoid infinite loops 
when the instruction sets the same - // register that it uses as operand - const auto [source, new_cursor] = TrackRegister(gpr, code, current_cursor - 1); - current_cursor = new_cursor; - if (!source) { - continue; - } - const auto [base_address, index, offset] = TrackCbuf(source, code, current_cursor); - if (base_address != nullptr) { - return {base_address, index, offset}; - } + // Reduce the cursor in one to avoid infinite loops when the instruction sets the same + // register that it uses as operand + const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1); + if (!source) { + return {}; } - return {}; + return TrackCbuf(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { @@ -186,15 +202,15 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const { // Reduce the cursor in one to avoid infinite loops when the instruction sets the same register // that it uses as operand - const auto [found, found_cursor] = - TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1); + const auto result = TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1); + const auto& found = result.first; if (!found) { - return {}; + return std::nullopt; } if (const auto immediate = std::get_if<ImmediateNode>(&*found)) { return immediate->GetValue(); } - return {}; + return std::nullopt; } std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code, diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h new file mode 100644 index 000000000..015a789d6 --- /dev/null +++ b/src/video_core/shader_cache.h @@ -0,0 +1,240 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
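As an illustrative aside (not part of the patch), the sketch below shows the page arithmetic the new ShaderCache relies on: an entry covering [addr, addr + size) is registered under every 16 KiB page it overlaps (PAGE_BITS = 14), so an invalidation only has to visit the pages touched by the CPU write. The address and size used here are made-up example values.

// Illustrative sketch of ShaderCache's page indexing; addr/size are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
    constexpr std::uint64_t PAGE_BITS = 14;                      // as in ShaderCache below
    constexpr std::uint64_t PAGE_SIZE = std::uint64_t{1} << PAGE_BITS;

    const std::uint64_t addr = 0x8003f00;  // hypothetical shader start address
    const std::uint64_t size = 0x9000;     // hypothetical shader size (36 KiB)
    const std::uint64_t addr_end = addr + size;

    // Same arithmetic as Register()/InvalidatePagesInRegion(): round the end up
    // to the next page so partially covered pages are included.
    const std::uint64_t page_end = (addr_end + PAGE_SIZE - 1) >> PAGE_BITS;
    for (std::uint64_t page = addr >> PAGE_BITS; page < page_end; ++page) {
        std::printf("registered under page 0x%llx\n",
                    static_cast<unsigned long long>(page));
    }
    return 0;
}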
+ +#pragma once + +#include <algorithm> +#include <memory> +#include <mutex> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class T> +class ShaderCache { + static constexpr u64 PAGE_BITS = 14; + static constexpr u64 PAGE_SIZE = u64(1) << PAGE_BITS; + + struct Entry { + VAddr addr_start; + VAddr addr_end; + T* data; + + bool is_memory_marked = true; + + constexpr bool Overlaps(VAddr start, VAddr end) const noexcept { + return start < addr_end && addr_start < end; + } + }; + +public: + virtual ~ShaderCache() = default; + + /// @brief Removes shaders inside a given region + /// @note Checks for ranges + /// @param addr Start address of the invalidation + /// @param size Number of bytes of the invalidation + void InvalidateRegion(VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + RemovePendingShaders(); + } + + /// @brief Unmarks a memory region as cached and marks it for removal + /// @param addr Start address of the CPU write operation + /// @param size Number of bytes of the CPU write operation + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + } + + /// @brief Flushes delayed removal operations + void SyncGuestHost() { + std::scoped_lock lock{invalidation_mutex}; + RemovePendingShaders(); + } + + /// @brief Tries to obtain a cached shader starting in a given address + /// @note Doesn't check for ranges, the given address has to be the start of the shader + /// @param addr Start address of the shader, this doesn't cache for region + /// @return Pointer to a valid shader, nullptr when nothing is found + T* TryGet(VAddr addr) const { + std::scoped_lock lock{lookup_mutex}; + + const auto it = lookup_cache.find(addr); + if (it == lookup_cache.end()) { + return nullptr; + } + return it->second->data; + } + +protected: + explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {} + + /// @brief Register in the cache a given entry + /// @param data Shader to store in the cache + /// @param addr Start address of the shader that will be registered + /// @param size Size in bytes of the shader + void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex, lookup_mutex}; + + const VAddr addr_end = addr + size; + Entry* const entry = NewEntry(addr, addr_end, data.get()); + + const u64 page_end = (addr_end + PAGE_SIZE - 1) >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page < page_end; ++page) { + invalidation_cache[page].push_back(entry); + } + + storage.push_back(std::move(data)); + + rasterizer.UpdatePagesCachedCount(addr, size, 1); + } + + /// @brief Called when a shader is going to be removed + /// @param shader Shader that will be removed + /// @pre invalidation_cache is locked + /// @pre lookup_mutex is locked + virtual void OnShaderRemoval([[maybe_unused]] T* shader) {} + +private: + /// @brief Invalidate pages in a given region + /// @pre invalidation_mutex is locked + void InvalidatePagesInRegion(VAddr addr, std::size_t size) { + const VAddr addr_end = addr + size; + const u64 page_end = (addr_end + PAGE_SIZE - 1) >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page < page_end; ++page) { + auto it = invalidation_cache.find(page); + if (it == invalidation_cache.end()) { + continue; + } + 
InvalidatePageEntries(it->second, addr, addr_end); + } + } + + /// @brief Remove shaders marked for deletion + /// @pre invalidation_mutex is locked + void RemovePendingShaders() { + if (marked_for_removal.empty()) { + return; + } + // Remove duplicates + std::sort(marked_for_removal.begin(), marked_for_removal.end()); + marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()), + marked_for_removal.end()); + + std::vector<T*> removed_shaders; + removed_shaders.reserve(marked_for_removal.size()); + + std::scoped_lock lock{lookup_mutex}; + + for (Entry* const entry : marked_for_removal) { + removed_shaders.push_back(entry->data); + + const auto it = lookup_cache.find(entry->addr_start); + ASSERT(it != lookup_cache.end()); + lookup_cache.erase(it); + } + marked_for_removal.clear(); + + if (!removed_shaders.empty()) { + RemoveShadersFromStorage(std::move(removed_shaders)); + } + } + + /// @brief Invalidates entries in a given range for the passed page + /// @param entries Vector of entries in the page, it will be modified on overlaps + /// @param addr Start address of the invalidation + /// @param addr_end Non-inclusive end address of the invalidation + /// @pre invalidation_mutex is locked + void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) { + std::size_t index = 0; + while (index < entries.size()) { + Entry* const entry = entries[index]; + if (!entry->Overlaps(addr, addr_end)) { + ++index; + continue; + } + + UnmarkMemory(entry); + RemoveEntryFromInvalidationCache(entry); + marked_for_removal.push_back(entry); + } + } + + /// @brief Removes all references to an entry in the invalidation cache + /// @param entry Entry to remove from the invalidation cache + /// @pre invalidation_mutex is locked + void RemoveEntryFromInvalidationCache(const Entry* entry) { + const u64 page_end = (entry->addr_end + PAGE_SIZE - 1) >> PAGE_BITS; + for (u64 page = entry->addr_start >> PAGE_BITS; page < page_end; ++page) { + const auto entries_it = invalidation_cache.find(page); + ASSERT(entries_it != invalidation_cache.end()); + std::vector<Entry*>& entries = entries_it->second; + + const auto entry_it = std::find(entries.begin(), entries.end(), entry); + ASSERT(entry_it != entries.end()); + entries.erase(entry_it); + } + } + + /// @brief Unmarks an entry from the rasterizer cache + /// @param entry Entry to unmark from memory + void UnmarkMemory(Entry* entry) { + if (!entry->is_memory_marked) { + return; + } + entry->is_memory_marked = false; + + const VAddr addr = entry->addr_start; + const std::size_t size = entry->addr_end - addr; + rasterizer.UpdatePagesCachedCount(addr, size, -1); + } + + /// @brief Removes a vector of shaders from a list + /// @param removed_shaders Shaders to be removed from the storage + /// @pre invalidation_mutex is locked + /// @pre lookup_mutex is locked + void RemoveShadersFromStorage(std::vector<T*> removed_shaders) { + // Notify removals + for (T* const shader : removed_shaders) { + OnShaderRemoval(shader); + } + + // Remove them from the cache + const auto is_removed = [&removed_shaders](const std::unique_ptr<T>& shader) { + return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) != + removed_shaders.end(); + }; + std::erase_if(storage, is_removed); + } + + /// @brief Creates a new entry in the lookup cache and returns its pointer + /// @pre lookup_mutex is locked + Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) { + auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data}); + 
Entry* const entry_pointer = entry.get(); + + lookup_cache.emplace(addr, std::move(entry)); + return entry_pointer; + } + + VideoCore::RasterizerInterface& rasterizer; + + mutable std::mutex lookup_mutex; + std::mutex invalidation_mutex; + + std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache; + std::unordered_map<u64, std::vector<Entry*>> invalidation_cache; + std::vector<std::unique_ptr<T>> storage; + std::vector<Entry*> marked_for_removal; +}; + +} // namespace VideoCommon diff --git a/src/video_core/shader_notify.cpp b/src/video_core/shader_notify.cpp new file mode 100644 index 000000000..c3c71657d --- /dev/null +++ b/src/video_core/shader_notify.cpp @@ -0,0 +1,42 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "video_core/shader_notify.h" + +using namespace std::chrono_literals; + +namespace VideoCore { +namespace { +constexpr auto UPDATE_TICK = 32ms; +} + +ShaderNotify::ShaderNotify() = default; +ShaderNotify::~ShaderNotify() = default; + +std::size_t ShaderNotify::GetShadersBuilding() { + const auto now = std::chrono::high_resolution_clock::now(); + const auto diff = now - last_update; + if (diff > UPDATE_TICK) { + std::shared_lock lock(mutex); + last_updated_count = accurate_count; + } + return last_updated_count; +} + +std::size_t ShaderNotify::GetShadersBuildingAccurate() { + std::shared_lock lock{mutex}; + return accurate_count; +} + +void ShaderNotify::MarkShaderComplete() { + std::unique_lock lock{mutex}; + accurate_count--; +} + +void ShaderNotify::MarkSharderBuilding() { + std::unique_lock lock{mutex}; + accurate_count++; +} + +} // namespace VideoCore diff --git a/src/video_core/shader_notify.h b/src/video_core/shader_notify.h new file mode 100644 index 000000000..a9c92d179 --- /dev/null +++ b/src/video_core/shader_notify.h @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
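As an illustrative aside (not part of the patch), a hedged usage sketch for the counter declared below; the worker and polling code here is assumed for illustration only, the real call sites are elsewhere in this change set.

// Hedged usage sketch for ShaderNotify; the surrounding code is illustrative.
#include <cstdio>
#include "video_core/shader_notify.h"

void BuildOneShader(VideoCore::ShaderNotify& notify) {
    notify.MarkSharderBuilding();  // spelled as declared in this header
    // ... compile the shader, typically on a worker thread ...
    notify.MarkShaderComplete();
}

int main() {
    VideoCore::ShaderNotify notify;
    BuildOneShader(notify);
    // A frontend can poll the cheaper, throttled count here,
    // or GetShadersBuildingAccurate() for the exact value.
    std::printf("shaders building: %zu\n", notify.GetShadersBuilding());
    return 0;
}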
+ +#pragma once + +#include <chrono> +#include <shared_mutex> +#include "common/common_types.h" + +namespace VideoCore { +class ShaderNotify { +public: + ShaderNotify(); + ~ShaderNotify(); + + std::size_t GetShadersBuilding(); + std::size_t GetShadersBuildingAccurate(); + + void MarkShaderComplete(); + void MarkSharderBuilding(); + +private: + std::size_t last_updated_count{}; + std::size_t accurate_count{}; + std::shared_mutex mutex; + std::chrono::high_resolution_clock::time_point last_update{}; +}; +} // namespace VideoCore diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index cc7181229..1688267bb 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -74,115 +74,131 @@ bool SurfaceTargetIsArray(SurfaceTarget target) { PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) { switch (format) { - case Tegra::DepthFormat::S8_Z24_UNORM: - return PixelFormat::S8Z24; - case Tegra::DepthFormat::Z24_S8_UNORM: - return PixelFormat::Z24S8; - case Tegra::DepthFormat::Z32_FLOAT: - return PixelFormat::Z32F; - case Tegra::DepthFormat::Z16_UNORM: - return PixelFormat::Z16; - case Tegra::DepthFormat::Z32_S8_X24_FLOAT: - return PixelFormat::Z32FS8; + case Tegra::DepthFormat::S8_UINT_Z24_UNORM: + return PixelFormat::S8_UINT_D24_UNORM; + case Tegra::DepthFormat::D24S8_UNORM: + return PixelFormat::D24_UNORM_S8_UINT; + case Tegra::DepthFormat::D32_FLOAT: + return PixelFormat::D32_FLOAT; + case Tegra::DepthFormat::D16_UNORM: + return PixelFormat::D16_UNORM; + case Tegra::DepthFormat::D32_FLOAT_S8X24_UINT: + return PixelFormat::D32_FLOAT_S8_UINT; default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format)); - UNREACHABLE(); - return PixelFormat::S8Z24; + UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format)); + return PixelFormat::S8_UINT_D24_UNORM; } } PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) { switch (format) { - case Tegra::RenderTargetFormat::RGBA8_SRGB: - return PixelFormat::RGBA8_SRGB; - case Tegra::RenderTargetFormat::RGBA8_UNORM: - return PixelFormat::ABGR8U; - case Tegra::RenderTargetFormat::RGBA8_SNORM: - return PixelFormat::ABGR8S; - case Tegra::RenderTargetFormat::RGBA8_UINT: - return PixelFormat::ABGR8UI; - case Tegra::RenderTargetFormat::BGRA8_SRGB: - return PixelFormat::BGRA8_SRGB; - case Tegra::RenderTargetFormat::BGRA8_UNORM: - return PixelFormat::BGRA8; - case Tegra::RenderTargetFormat::RGB10_A2_UNORM: - return PixelFormat::A2B10G10R10U; - case Tegra::RenderTargetFormat::RGBA16_FLOAT: - return PixelFormat::RGBA16F; - case Tegra::RenderTargetFormat::RGBA16_UNORM: - return PixelFormat::RGBA16U; - case Tegra::RenderTargetFormat::RGBA16_SNORM: - return PixelFormat::RGBA16S; - case Tegra::RenderTargetFormat::RGBA16_UINT: - return PixelFormat::RGBA16UI; - case Tegra::RenderTargetFormat::RGBA32_FLOAT: - return PixelFormat::RGBA32F; - case Tegra::RenderTargetFormat::RG32_FLOAT: - return PixelFormat::RG32F; - case Tegra::RenderTargetFormat::R11G11B10_FLOAT: - return PixelFormat::R11FG11FB10F; - case Tegra::RenderTargetFormat::B5G6R5_UNORM: - return PixelFormat::B5G6R5U; - case Tegra::RenderTargetFormat::BGR5A1_UNORM: - return PixelFormat::A1B5G5R5U; - case Tegra::RenderTargetFormat::RGBA32_UINT: - return PixelFormat::RGBA32UI; - case Tegra::RenderTargetFormat::R8_UNORM: - return PixelFormat::R8U; - case Tegra::RenderTargetFormat::R8_UINT: - return PixelFormat::R8UI; - case Tegra::RenderTargetFormat::RG16_FLOAT: - return PixelFormat::RG16F; - case 
Tegra::RenderTargetFormat::RG16_UINT:
- return PixelFormat::RG16UI;
- case Tegra::RenderTargetFormat::RG16_SINT:
- return PixelFormat::RG16I;
- case Tegra::RenderTargetFormat::RG16_UNORM:
- return PixelFormat::RG16;
- case Tegra::RenderTargetFormat::RG16_SNORM:
- return PixelFormat::RG16S;
- case Tegra::RenderTargetFormat::RG8_UNORM:
- return PixelFormat::RG8U;
- case Tegra::RenderTargetFormat::RG8_SNORM:
- return PixelFormat::RG8S;
- case Tegra::RenderTargetFormat::R16_FLOAT:
- return PixelFormat::R16F;
+ case Tegra::RenderTargetFormat::R32B32G32A32_FLOAT:
+ return PixelFormat::R32G32B32A32_FLOAT;
+ case Tegra::RenderTargetFormat::R32G32B32A32_SINT:
+ return PixelFormat::R32G32B32A32_SINT;
+ case Tegra::RenderTargetFormat::R32G32B32A32_UINT:
+ return PixelFormat::R32G32B32A32_UINT;
+ case Tegra::RenderTargetFormat::R16G16B16A16_UNORM:
+ return PixelFormat::R16G16B16A16_UNORM;
+ case Tegra::RenderTargetFormat::R16G16B16A16_SNORM:
+ return PixelFormat::R16G16B16A16_SNORM;
+ case Tegra::RenderTargetFormat::R16G16B16A16_SINT:
+ return PixelFormat::R16G16B16A16_SINT;
+ case Tegra::RenderTargetFormat::R16G16B16A16_UINT:
+ return PixelFormat::R16G16B16A16_UINT;
+ case Tegra::RenderTargetFormat::R16G16B16A16_FLOAT:
+ return PixelFormat::R16G16B16A16_FLOAT;
+ case Tegra::RenderTargetFormat::R32G32_FLOAT:
+ return PixelFormat::R32G32_FLOAT;
+ case Tegra::RenderTargetFormat::R32G32_SINT:
+ return PixelFormat::R32G32_SINT;
+ case Tegra::RenderTargetFormat::R32G32_UINT:
+ return PixelFormat::R32G32_UINT;
+ case Tegra::RenderTargetFormat::R16G16B16X16_FLOAT:
+ return PixelFormat::R16G16B16X16_FLOAT;
+ case Tegra::RenderTargetFormat::B8G8R8A8_UNORM:
+ return PixelFormat::B8G8R8A8_UNORM;
+ case Tegra::RenderTargetFormat::B8G8R8A8_SRGB:
+ return PixelFormat::B8G8R8A8_SRGB;
+ case Tegra::RenderTargetFormat::A2B10G10R10_UNORM:
+ return PixelFormat::A2B10G10R10_UNORM;
+ case Tegra::RenderTargetFormat::A2B10G10R10_UINT:
+ return PixelFormat::A2B10G10R10_UINT;
+ case Tegra::RenderTargetFormat::A8B8G8R8_UNORM:
+ return PixelFormat::A8B8G8R8_UNORM;
+ case Tegra::RenderTargetFormat::A8B8G8R8_SRGB:
+ return PixelFormat::A8B8G8R8_SRGB;
+ case Tegra::RenderTargetFormat::A8B8G8R8_SNORM:
+ return PixelFormat::A8B8G8R8_SNORM;
+ case Tegra::RenderTargetFormat::A8B8G8R8_SINT:
+ return PixelFormat::A8B8G8R8_SINT;
+ case Tegra::RenderTargetFormat::A8B8G8R8_UINT:
+ return PixelFormat::A8B8G8R8_UINT;
+ case Tegra::RenderTargetFormat::R16G16_UNORM:
+ return PixelFormat::R16G16_UNORM;
+ case Tegra::RenderTargetFormat::R16G16_SNORM:
+ return PixelFormat::R16G16_SNORM;
+ case Tegra::RenderTargetFormat::R16G16_SINT:
+ return PixelFormat::R16G16_SINT;
+ case Tegra::RenderTargetFormat::R16G16_UINT:
+ return PixelFormat::R16G16_UINT;
+ case Tegra::RenderTargetFormat::R16G16_FLOAT:
+ return PixelFormat::R16G16_FLOAT;
+ case Tegra::RenderTargetFormat::B10G11R11_FLOAT:
+ return PixelFormat::B10G11R11_FLOAT;
+ case Tegra::RenderTargetFormat::R32_SINT:
+ return PixelFormat::R32_SINT;
+ case Tegra::RenderTargetFormat::R32_UINT:
+ return PixelFormat::R32_UINT;
+ case Tegra::RenderTargetFormat::R32_FLOAT:
+ return PixelFormat::R32_FLOAT;
+ case Tegra::RenderTargetFormat::R5G6B5_UNORM:
+ return PixelFormat::R5G6B5_UNORM;
+ case Tegra::RenderTargetFormat::A1R5G5B5_UNORM:
+ return PixelFormat::A1R5G5B5_UNORM;
+ case Tegra::RenderTargetFormat::R8G8_UNORM:
+ return PixelFormat::R8G8_UNORM;
+ case Tegra::RenderTargetFormat::R8G8_SNORM:
+ return PixelFormat::R8G8_SNORM;
+ case Tegra::RenderTargetFormat::R8G8_SINT:
+ return PixelFormat::R8G8_SINT;
+ case Tegra::RenderTargetFormat::R8G8_UINT:
+ return PixelFormat::R8G8_UINT;
case Tegra::RenderTargetFormat::R16_UNORM:
- return PixelFormat::R16U;
+ return PixelFormat::R16_UNORM;
case Tegra::RenderTargetFormat::R16_SNORM:
- return PixelFormat::R16S;
- case Tegra::RenderTargetFormat::R16_UINT:
- return PixelFormat::R16UI;
+ return PixelFormat::R16_SNORM;
case Tegra::RenderTargetFormat::R16_SINT:
- return PixelFormat::R16I;
- case Tegra::RenderTargetFormat::R32_FLOAT:
- return PixelFormat::R32F;
- case Tegra::RenderTargetFormat::R32_SINT:
- return PixelFormat::R32I;
- case Tegra::RenderTargetFormat::R32_UINT:
- return PixelFormat::R32UI;
- case Tegra::RenderTargetFormat::RG32_UINT:
- return PixelFormat::RG32UI;
- case Tegra::RenderTargetFormat::RGBX16_FLOAT:
- return PixelFormat::RGBX16F;
+ return PixelFormat::R16_SINT;
+ case Tegra::RenderTargetFormat::R16_UINT:
+ return PixelFormat::R16_UINT;
+ case Tegra::RenderTargetFormat::R16_FLOAT:
+ return PixelFormat::R16_FLOAT;
+ case Tegra::RenderTargetFormat::R8_UNORM:
+ return PixelFormat::R8_UNORM;
+ case Tegra::RenderTargetFormat::R8_SNORM:
+ return PixelFormat::R8_SNORM;
+ case Tegra::RenderTargetFormat::R8_SINT:
+ return PixelFormat::R8_SINT;
+ case Tegra::RenderTargetFormat::R8_UINT:
+ return PixelFormat::R8_UINT;
default:
- LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
- UNREACHABLE();
- return PixelFormat::RGBA8_SRGB;
+ UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<int>(format));
+ return PixelFormat::A8B8G8R8_UNORM;
}
}
PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
switch (format) {
- case Tegra::FramebufferConfig::PixelFormat::ABGR8:
- return PixelFormat::ABGR8U;
- case Tegra::FramebufferConfig::PixelFormat::RGB565:
- return PixelFormat::B5G6R5U;
- case Tegra::FramebufferConfig::PixelFormat::BGRA8:
- return PixelFormat::BGRA8;
+ case Tegra::FramebufferConfig::PixelFormat::A8B8G8R8_UNORM:
+ return PixelFormat::A8B8G8R8_UNORM;
+ case Tegra::FramebufferConfig::PixelFormat::RGB565_UNORM:
+ return PixelFormat::R5G6B5_UNORM;
+ case Tegra::FramebufferConfig::PixelFormat::B8G8R8A8_UNORM:
+ return PixelFormat::B8G8R8A8_UNORM;
default:
UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format));
- return PixelFormat::ABGR8U;
+ return PixelFormat::A8B8G8R8_UNORM;
}
}
@@ -210,27 +226,27 @@ SurfaceType GetFormatType(PixelFormat pixel_format) {
bool IsPixelFormatASTC(PixelFormat format) {
switch (format) {
- case PixelFormat::ASTC_2D_4X4:
- case PixelFormat::ASTC_2D_5X4:
- case PixelFormat::ASTC_2D_5X5:
- case PixelFormat::ASTC_2D_8X8:
- case PixelFormat::ASTC_2D_8X5:
+ case PixelFormat::ASTC_2D_4X4_UNORM:
+ case PixelFormat::ASTC_2D_5X4_UNORM:
+ case PixelFormat::ASTC_2D_5X5_UNORM:
+ case PixelFormat::ASTC_2D_8X8_UNORM:
+ case PixelFormat::ASTC_2D_8X5_UNORM:
case PixelFormat::ASTC_2D_4X4_SRGB:
case PixelFormat::ASTC_2D_5X4_SRGB:
case PixelFormat::ASTC_2D_5X5_SRGB:
case PixelFormat::ASTC_2D_8X8_SRGB:
case PixelFormat::ASTC_2D_8X5_SRGB:
- case PixelFormat::ASTC_2D_10X8:
+ case PixelFormat::ASTC_2D_10X8_UNORM:
case PixelFormat::ASTC_2D_10X8_SRGB:
- case PixelFormat::ASTC_2D_6X6:
+ case PixelFormat::ASTC_2D_6X6_UNORM:
case PixelFormat::ASTC_2D_6X6_SRGB:
- case PixelFormat::ASTC_2D_10X10:
+ case PixelFormat::ASTC_2D_10X10_UNORM:
case PixelFormat::ASTC_2D_10X10_SRGB:
- case PixelFormat::ASTC_2D_12X12:
+ case PixelFormat::ASTC_2D_12X12_UNORM:
case PixelFormat::ASTC_2D_12X12_SRGB:
- case PixelFormat::ASTC_2D_8X6:
+ case
PixelFormat::ASTC_2D_8X6_SRGB: - case PixelFormat::ASTC_2D_6X5: + case PixelFormat::ASTC_2D_6X5_UNORM: case PixelFormat::ASTC_2D_6X5_SRGB: return true; default: @@ -240,12 +256,12 @@ bool IsPixelFormatASTC(PixelFormat format) { bool IsPixelFormatSRGB(PixelFormat format) { switch (format) { - case PixelFormat::RGBA8_SRGB: - case PixelFormat::BGRA8_SRGB: - case PixelFormat::DXT1_SRGB: - case PixelFormat::DXT23_SRGB: - case PixelFormat::DXT45_SRGB: - case PixelFormat::BC7U_SRGB: + case PixelFormat::A8B8G8R8_SRGB: + case PixelFormat::B8G8R8A8_SRGB: + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC2_SRGB: + case PixelFormat::BC3_SRGB: + case PixelFormat::BC7_SRGB: case PixelFormat::ASTC_2D_4X4_SRGB: case PixelFormat::ASTC_2D_8X8_SRGB: case PixelFormat::ASTC_2D_8X5_SRGB: @@ -267,25 +283,4 @@ std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { return {GetDefaultBlockWidth(format), GetDefaultBlockHeight(format)}; } -bool IsFormatBCn(PixelFormat format) { - switch (format) { - case PixelFormat::DXT1: - case PixelFormat::DXT23: - case PixelFormat::DXT45: - case PixelFormat::DXN1: - case PixelFormat::DXN2SNORM: - case PixelFormat::DXN2UNORM: - case PixelFormat::BC7U: - case PixelFormat::BC6H_UF16: - case PixelFormat::BC6H_SF16: - case PixelFormat::DXT1_SRGB: - case PixelFormat::DXT23_SRGB: - case PixelFormat::DXT45_SRGB: - case PixelFormat::BC7U_SRGB: - return true; - default: - return false; - } -} - } // namespace VideoCore::Surface diff --git a/src/video_core/surface.h b/src/video_core/surface.h index e0acd44d3..cfd12fa61 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -15,93 +15,105 @@ namespace VideoCore::Surface { enum class PixelFormat { - ABGR8U = 0, - ABGR8S = 1, - ABGR8UI = 2, - B5G6R5U = 3, - A2B10G10R10U = 4, - A1B5G5R5U = 5, - R8U = 6, - R8UI = 7, - RGBA16F = 8, - RGBA16U = 9, - RGBA16S = 10, - RGBA16UI = 11, - R11FG11FB10F = 12, - RGBA32UI = 13, - DXT1 = 14, - DXT23 = 15, - DXT45 = 16, - DXN1 = 17, // This is also known as BC4 - DXN2UNORM = 18, - DXN2SNORM = 19, - BC7U = 20, - BC6H_UF16 = 21, - BC6H_SF16 = 22, - ASTC_2D_4X4 = 23, - BGRA8 = 24, - RGBA32F = 25, - RG32F = 26, - R32F = 27, - R16F = 28, - R16U = 29, - R16S = 30, - R16UI = 31, - R16I = 32, - RG16 = 33, - RG16F = 34, - RG16UI = 35, - RG16I = 36, - RG16S = 37, - RGB32F = 38, - RGBA8_SRGB = 39, - RG8U = 40, - RG8S = 41, - RG32UI = 42, - RGBX16F = 43, - R32UI = 44, - R32I = 45, - ASTC_2D_8X8 = 46, - ASTC_2D_8X5 = 47, - ASTC_2D_5X4 = 48, - BGRA8_SRGB = 49, - DXT1_SRGB = 50, - DXT23_SRGB = 51, - DXT45_SRGB = 52, - BC7U_SRGB = 53, - R4G4B4A4U = 54, - ASTC_2D_4X4_SRGB = 55, - ASTC_2D_8X8_SRGB = 56, - ASTC_2D_8X5_SRGB = 57, - ASTC_2D_5X4_SRGB = 58, - ASTC_2D_5X5 = 59, - ASTC_2D_5X5_SRGB = 60, - ASTC_2D_10X8 = 61, - ASTC_2D_10X8_SRGB = 62, - ASTC_2D_6X6 = 63, - ASTC_2D_6X6_SRGB = 64, - ASTC_2D_10X10 = 65, - ASTC_2D_10X10_SRGB = 66, - ASTC_2D_12X12 = 67, - ASTC_2D_12X12_SRGB = 68, - ASTC_2D_8X6 = 69, - ASTC_2D_8X6_SRGB = 70, - ASTC_2D_6X5 = 71, - ASTC_2D_6X5_SRGB = 72, - E5B9G9R9F = 73, + A8B8G8R8_UNORM, + A8B8G8R8_SNORM, + A8B8G8R8_SINT, + A8B8G8R8_UINT, + R5G6B5_UNORM, + B5G6R5_UNORM, + A1R5G5B5_UNORM, + A2B10G10R10_UNORM, + A2B10G10R10_UINT, + A1B5G5R5_UNORM, + R8_UNORM, + R8_SNORM, + R8_SINT, + R8_UINT, + R16G16B16A16_FLOAT, + R16G16B16A16_UNORM, + R16G16B16A16_SNORM, + R16G16B16A16_SINT, + R16G16B16A16_UINT, + B10G11R11_FLOAT, + R32G32B32A32_UINT, + BC1_RGBA_UNORM, + BC2_UNORM, + BC3_UNORM, + BC4_UNORM, + BC4_SNORM, + BC5_UNORM, + BC5_SNORM, + BC7_UNORM, + BC6H_UFLOAT, + BC6H_SFLOAT, + 
ASTC_2D_4X4_UNORM, + B8G8R8A8_UNORM, + R32G32B32A32_FLOAT, + R32G32B32A32_SINT, + R32G32_FLOAT, + R32G32_SINT, + R32_FLOAT, + R16_FLOAT, + R16_UNORM, + R16_SNORM, + R16_UINT, + R16_SINT, + R16G16_UNORM, + R16G16_FLOAT, + R16G16_UINT, + R16G16_SINT, + R16G16_SNORM, + R32G32B32_FLOAT, + A8B8G8R8_SRGB, + R8G8_UNORM, + R8G8_SNORM, + R8G8_SINT, + R8G8_UINT, + R32G32_UINT, + R16G16B16X16_FLOAT, + R32_UINT, + R32_SINT, + ASTC_2D_8X8_UNORM, + ASTC_2D_8X5_UNORM, + ASTC_2D_5X4_UNORM, + B8G8R8A8_SRGB, + BC1_RGBA_SRGB, + BC2_SRGB, + BC3_SRGB, + BC7_SRGB, + A4B4G4R4_UNORM, + ASTC_2D_4X4_SRGB, + ASTC_2D_8X8_SRGB, + ASTC_2D_8X5_SRGB, + ASTC_2D_5X4_SRGB, + ASTC_2D_5X5_UNORM, + ASTC_2D_5X5_SRGB, + ASTC_2D_10X8_UNORM, + ASTC_2D_10X8_SRGB, + ASTC_2D_6X6_UNORM, + ASTC_2D_6X6_SRGB, + ASTC_2D_10X10_UNORM, + ASTC_2D_10X10_SRGB, + ASTC_2D_12X12_UNORM, + ASTC_2D_12X12_SRGB, + ASTC_2D_8X6_UNORM, + ASTC_2D_8X6_SRGB, + ASTC_2D_6X5_UNORM, + ASTC_2D_6X5_SRGB, + E5B9G9R9_FLOAT, MaxColorFormat, // Depth formats - Z32F = 74, - Z16 = 75, + D32_FLOAT = MaxColorFormat, + D16_UNORM, MaxDepthFormat, // DepthStencil formats - Z24S8 = 76, - S8Z24 = 77, - Z32FS8 = 78, + D24_UNORM_S8_UINT = MaxDepthFormat, + S8_UINT_D24_UNORM, + D32_FLOAT_S8_UINT, MaxDepthStencilFormat, @@ -129,85 +141,97 @@ enum class SurfaceTarget { }; constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ - 0, // ABGR8U - 0, // ABGR8S - 0, // ABGR8UI - 0, // B5G6R5U - 0, // A2B10G10R10U - 0, // A1B5G5R5U - 0, // R8U - 0, // R8UI - 0, // RGBA16F - 0, // RGBA16U - 0, // RGBA16S - 0, // RGBA16UI - 0, // R11FG11FB10F - 0, // RGBA32UI - 2, // DXT1 - 2, // DXT23 - 2, // DXT45 - 2, // DXN1 - 2, // DXN2UNORM - 2, // DXN2SNORM - 2, // BC7U - 2, // BC6H_UF16 - 2, // BC6H_SF16 - 2, // ASTC_2D_4X4 - 0, // BGRA8 - 0, // RGBA32F - 0, // RG32F - 0, // R32F - 0, // R16F - 0, // R16U - 0, // R16S - 0, // R16UI - 0, // R16I - 0, // RG16 - 0, // RG16F - 0, // RG16UI - 0, // RG16I - 0, // RG16S - 0, // RGB32F - 0, // RGBA8_SRGB - 0, // RG8U - 0, // RG8S - 0, // RG32UI - 0, // RGBX16F - 0, // R32UI - 0, // R32I - 2, // ASTC_2D_8X8 - 2, // ASTC_2D_8X5 - 2, // ASTC_2D_5X4 - 0, // BGRA8_SRGB - 2, // DXT1_SRGB - 2, // DXT23_SRGB - 2, // DXT45_SRGB - 2, // BC7U_SRGB - 0, // R4G4B4A4U + 0, // A8B8G8R8_UNORM + 0, // A8B8G8R8_SNORM + 0, // A8B8G8R8_SINT + 0, // A8B8G8R8_UINT + 0, // R5G6B5_UNORM + 0, // B5G6R5_UNORM + 0, // A1R5G5B5_UNORM + 0, // A2B10G10R10_UNORM + 0, // A2B10G10R10_UINT + 0, // A1B5G5R5_UNORM + 0, // R8_UNORM + 0, // R8_SNORM + 0, // R8_SINT + 0, // R8_UINT + 0, // R16G16B16A16_FLOAT + 0, // R16G16B16A16_UNORM + 0, // R16G16B16A16_SNORM + 0, // R16G16B16A16_SINT + 0, // R16G16B16A16_UINT + 0, // B10G11R11_FLOAT + 0, // R32G32B32A32_UINT + 2, // BC1_RGBA_UNORM + 2, // BC2_UNORM + 2, // BC3_UNORM + 2, // BC4_UNORM + 2, // BC4_SNORM + 2, // BC5_UNORM + 2, // BC5_SNORM + 2, // BC7_UNORM + 2, // BC6H_UFLOAT + 2, // BC6H_SFLOAT + 2, // ASTC_2D_4X4_UNORM + 0, // B8G8R8A8_UNORM + 0, // R32G32B32A32_FLOAT + 0, // R32G32B32A32_SINT + 0, // R32G32_FLOAT + 0, // R32G32_SINT + 0, // R32_FLOAT + 0, // R16_FLOAT + 0, // R16_UNORM + 0, // R16_SNORM + 0, // R16_UINT + 0, // R16_SINT + 0, // R16G16_UNORM + 0, // R16G16_FLOAT + 0, // R16G16_UINT + 0, // R16G16_SINT + 0, // R16G16_SNORM + 0, // R32G32B32_FLOAT + 0, // A8B8G8R8_SRGB + 0, // R8G8_UNORM + 0, // R8G8_SNORM + 0, // R8G8_SINT + 0, // R8G8_UINT + 0, // R32G32_UINT + 0, // R16G16B16X16_FLOAT + 0, // R32_UINT + 0, // R32_SINT + 2, // ASTC_2D_8X8_UNORM + 2, // ASTC_2D_8X5_UNORM + 2, // ASTC_2D_5X4_UNORM + 0, // 
B8G8R8A8_SRGB + 2, // BC1_RGBA_SRGB + 2, // BC2_SRGB + 2, // BC3_SRGB + 2, // BC7_SRGB + 0, // A4B4G4R4_UNORM 2, // ASTC_2D_4X4_SRGB 2, // ASTC_2D_8X8_SRGB 2, // ASTC_2D_8X5_SRGB 2, // ASTC_2D_5X4_SRGB - 2, // ASTC_2D_5X5 + 2, // ASTC_2D_5X5_UNORM 2, // ASTC_2D_5X5_SRGB - 2, // ASTC_2D_10X8 + 2, // ASTC_2D_10X8_UNORM 2, // ASTC_2D_10X8_SRGB - 2, // ASTC_2D_6X6 + 2, // ASTC_2D_6X6_UNORM 2, // ASTC_2D_6X6_SRGB - 2, // ASTC_2D_10X10 + 2, // ASTC_2D_10X10_UNORM 2, // ASTC_2D_10X10_SRGB - 2, // ASTC_2D_12X12 + 2, // ASTC_2D_12X12_UNORM 2, // ASTC_2D_12X12_SRGB - 2, // ASTC_2D_8X6 + 2, // ASTC_2D_8X6_UNORM 2, // ASTC_2D_8X6_SRGB - 2, // ASTC_2D_6X5 + 2, // ASTC_2D_6X5_UNORM 2, // ASTC_2D_6X5_SRGB - 0, // E5B9G9R9F - 0, // Z32F - 0, // Z16 - 0, // Z24S8 - 0, // S8Z24 - 0, // Z32FS8 + 0, // E5B9G9R9_FLOAT + 0, // D32_FLOAT + 0, // D16_UNORM + 0, // D24_UNORM_S8_UINT + 0, // S8_UINT_D24_UNORM + 0, // D32_FLOAT_S8_UINT }}; /** @@ -227,85 +251,97 @@ inline constexpr u32 GetCompressionFactor(PixelFormat format) { } constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ - 1, // ABGR8U - 1, // ABGR8S - 1, // ABGR8UI - 1, // B5G6R5U - 1, // A2B10G10R10U - 1, // A1B5G5R5U - 1, // R8U - 1, // R8UI - 1, // RGBA16F - 1, // RGBA16U - 1, // RGBA16S - 1, // RGBA16UI - 1, // R11FG11FB10F - 1, // RGBA32UI - 4, // DXT1 - 4, // DXT23 - 4, // DXT45 - 4, // DXN1 - 4, // DXN2UNORM - 4, // DXN2SNORM - 4, // BC7U - 4, // BC6H_UF16 - 4, // BC6H_SF16 - 4, // ASTC_2D_4X4 - 1, // BGRA8 - 1, // RGBA32F - 1, // RG32F - 1, // R32F - 1, // R16F - 1, // R16U - 1, // R16S - 1, // R16UI - 1, // R16I - 1, // RG16 - 1, // RG16F - 1, // RG16UI - 1, // RG16I - 1, // RG16S - 1, // RGB32F - 1, // RGBA8_SRGB - 1, // RG8U - 1, // RG8S - 1, // RG32UI - 1, // RGBX16F - 1, // R32UI - 1, // R32I - 8, // ASTC_2D_8X8 - 8, // ASTC_2D_8X5 - 5, // ASTC_2D_5X4 - 1, // BGRA8_SRGB - 4, // DXT1_SRGB - 4, // DXT23_SRGB - 4, // DXT45_SRGB - 4, // BC7U_SRGB - 1, // R4G4B4A4U + 1, // A8B8G8R8_UNORM + 1, // A8B8G8R8_SNORM + 1, // A8B8G8R8_SINT + 1, // A8B8G8R8_UINT + 1, // R5G6B5_UNORM + 1, // B5G6R5_UNORM + 1, // A1R5G5B5_UNORM + 1, // A2B10G10R10_UNORM + 1, // A2B10G10R10_UINT + 1, // A1B5G5R5_UNORM + 1, // R8_UNORM + 1, // R8_SNORM + 1, // R8_SINT + 1, // R8_UINT + 1, // R16G16B16A16_FLOAT + 1, // R16G16B16A16_UNORM + 1, // R16G16B16A16_SNORM + 1, // R16G16B16A16_SINT + 1, // R16G16B16A16_UINT + 1, // B10G11R11_FLOAT + 1, // R32G32B32A32_UINT + 4, // BC1_RGBA_UNORM + 4, // BC2_UNORM + 4, // BC3_UNORM + 4, // BC4_UNORM + 4, // BC4_SNORM + 4, // BC5_UNORM + 4, // BC5_SNORM + 4, // BC7_UNORM + 4, // BC6H_UFLOAT + 4, // BC6H_SFLOAT + 4, // ASTC_2D_4X4_UNORM + 1, // B8G8R8A8_UNORM + 1, // R32G32B32A32_FLOAT + 1, // R32G32B32A32_SINT + 1, // R32G32_FLOAT + 1, // R32G32_SINT + 1, // R32_FLOAT + 1, // R16_FLOAT + 1, // R16_UNORM + 1, // R16_SNORM + 1, // R16_UINT + 1, // R16_SINT + 1, // R16G16_UNORM + 1, // R16G16_FLOAT + 1, // R16G16_UINT + 1, // R16G16_SINT + 1, // R16G16_SNORM + 1, // R32G32B32_FLOAT + 1, // A8B8G8R8_SRGB + 1, // R8G8_UNORM + 1, // R8G8_SNORM + 1, // R8G8_SINT + 1, // R8G8_UINT + 1, // R32G32_UINT + 1, // R16G16B16X16_FLOAT + 1, // R32_UINT + 1, // R32_SINT + 8, // ASTC_2D_8X8_UNORM + 8, // ASTC_2D_8X5_UNORM + 5, // ASTC_2D_5X4_UNORM + 1, // B8G8R8A8_SRGB + 4, // BC1_RGBA_SRGB + 4, // BC2_SRGB + 4, // BC3_SRGB + 4, // BC7_SRGB + 1, // A4B4G4R4_UNORM 4, // ASTC_2D_4X4_SRGB 8, // ASTC_2D_8X8_SRGB 8, // ASTC_2D_8X5_SRGB 5, // ASTC_2D_5X4_SRGB - 5, // ASTC_2D_5X5 + 5, // ASTC_2D_5X5_UNORM 5, // ASTC_2D_5X5_SRGB - 10, // ASTC_2D_10X8 + 
10, // ASTC_2D_10X8_UNORM 10, // ASTC_2D_10X8_SRGB - 6, // ASTC_2D_6X6 + 6, // ASTC_2D_6X6_UNORM 6, // ASTC_2D_6X6_SRGB - 10, // ASTC_2D_10X10 + 10, // ASTC_2D_10X10_UNORM 10, // ASTC_2D_10X10_SRGB - 12, // ASTC_2D_12X12 + 12, // ASTC_2D_12X12_UNORM 12, // ASTC_2D_12X12_SRGB - 8, // ASTC_2D_8X6 + 8, // ASTC_2D_8X6_UNORM 8, // ASTC_2D_8X6_SRGB - 6, // ASTC_2D_6X5 + 6, // ASTC_2D_6X5_UNORM 6, // ASTC_2D_6X5_SRGB - 1, // E5B9G9R9F - 1, // Z32F - 1, // Z16 - 1, // Z24S8 - 1, // S8Z24 - 1, // Z32FS8 + 1, // E5B9G9R9_FLOAT + 1, // D32_FLOAT + 1, // D16_UNORM + 1, // D24_UNORM_S8_UINT + 1, // S8_UINT_D24_UNORM + 1, // D32_FLOAT_S8_UINT }}; static constexpr u32 GetDefaultBlockWidth(PixelFormat format) { @@ -317,85 +353,97 @@ static constexpr u32 GetDefaultBlockWidth(PixelFormat format) { } constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ - 1, // ABGR8U - 1, // ABGR8S - 1, // ABGR8UI - 1, // B5G6R5U - 1, // A2B10G10R10U - 1, // A1B5G5R5U - 1, // R8U - 1, // R8UI - 1, // RGBA16F - 1, // RGBA16U - 1, // RGBA16S - 1, // RGBA16UI - 1, // R11FG11FB10F - 1, // RGBA32UI - 4, // DXT1 - 4, // DXT23 - 4, // DXT45 - 4, // DXN1 - 4, // DXN2UNORM - 4, // DXN2SNORM - 4, // BC7U - 4, // BC6H_UF16 - 4, // BC6H_SF16 - 4, // ASTC_2D_4X4 - 1, // BGRA8 - 1, // RGBA32F - 1, // RG32F - 1, // R32F - 1, // R16F - 1, // R16U - 1, // R16S - 1, // R16UI - 1, // R16I - 1, // RG16 - 1, // RG16F - 1, // RG16UI - 1, // RG16I - 1, // RG16S - 1, // RGB32F - 1, // RGBA8_SRGB - 1, // RG8U - 1, // RG8S - 1, // RG32UI - 1, // RGBX16F - 1, // R32UI - 1, // R32I - 8, // ASTC_2D_8X8 - 5, // ASTC_2D_8X5 - 4, // ASTC_2D_5X4 - 1, // BGRA8_SRGB - 4, // DXT1_SRGB - 4, // DXT23_SRGB - 4, // DXT45_SRGB - 4, // BC7U_SRGB - 1, // R4G4B4A4U + 1, // A8B8G8R8_UNORM + 1, // A8B8G8R8_SNORM + 1, // A8B8G8R8_SINT + 1, // A8B8G8R8_UINT + 1, // R5G6B5_UNORM + 1, // B5G6R5_UNORM + 1, // A1R5G5B5_UNORM + 1, // A2B10G10R10_UNORM + 1, // A2B10G10R10_UINT + 1, // A1B5G5R5_UNORM + 1, // R8_UNORM + 1, // R8_SNORM + 1, // R8_SINT + 1, // R8_UINT + 1, // R16G16B16A16_FLOAT + 1, // R16G16B16A16_UNORM + 1, // R16G16B16A16_SNORM + 1, // R16G16B16A16_SINT + 1, // R16G16B16A16_UINT + 1, // B10G11R11_FLOAT + 1, // R32G32B32A32_UINT + 4, // BC1_RGBA_UNORM + 4, // BC2_UNORM + 4, // BC3_UNORM + 4, // BC4_UNORM + 4, // BC4_SNORM + 4, // BC5_UNORM + 4, // BC5_SNORM + 4, // BC7_UNORM + 4, // BC6H_UFLOAT + 4, // BC6H_SFLOAT + 4, // ASTC_2D_4X4_UNORM + 1, // B8G8R8A8_UNORM + 1, // R32G32B32A32_FLOAT + 1, // R32G32B32A32_SINT + 1, // R32G32_FLOAT + 1, // R32G32_SINT + 1, // R32_FLOAT + 1, // R16_FLOAT + 1, // R16_UNORM + 1, // R16_SNORM + 1, // R16_UINT + 1, // R16_SINT + 1, // R16G16_UNORM + 1, // R16G16_FLOAT + 1, // R16G16_UINT + 1, // R16G16_SINT + 1, // R16G16_SNORM + 1, // R32G32B32_FLOAT + 1, // A8B8G8R8_SRGB + 1, // R8G8_UNORM + 1, // R8G8_SNORM + 1, // R8G8_SINT + 1, // R8G8_UINT + 1, // R32G32_UINT + 1, // R16G16B16X16_FLOAT + 1, // R32_UINT + 1, // R32_SINT + 8, // ASTC_2D_8X8_UNORM + 5, // ASTC_2D_8X5_UNORM + 4, // ASTC_2D_5X4_UNORM + 1, // B8G8R8A8_SRGB + 4, // BC1_RGBA_SRGB + 4, // BC2_SRGB + 4, // BC3_SRGB + 4, // BC7_SRGB + 1, // A4B4G4R4_UNORM 4, // ASTC_2D_4X4_SRGB 8, // ASTC_2D_8X8_SRGB 5, // ASTC_2D_8X5_SRGB 4, // ASTC_2D_5X4_SRGB - 5, // ASTC_2D_5X5 + 5, // ASTC_2D_5X5_UNORM 5, // ASTC_2D_5X5_SRGB - 8, // ASTC_2D_10X8 + 8, // ASTC_2D_10X8_UNORM 8, // ASTC_2D_10X8_SRGB - 6, // ASTC_2D_6X6 + 6, // ASTC_2D_6X6_UNORM 6, // ASTC_2D_6X6_SRGB - 10, // ASTC_2D_10X10 + 10, // ASTC_2D_10X10_UNORM 10, // ASTC_2D_10X10_SRGB - 12, // ASTC_2D_12X12 + 12, 
// ASTC_2D_12X12_UNORM 12, // ASTC_2D_12X12_SRGB - 6, // ASTC_2D_8X6 + 6, // ASTC_2D_8X6_UNORM 6, // ASTC_2D_8X6_SRGB - 5, // ASTC_2D_6X5 + 5, // ASTC_2D_6X5_UNORM 5, // ASTC_2D_6X5_SRGB - 1, // E5B9G9R9F - 1, // Z32F - 1, // Z16 - 1, // Z24S8 - 1, // S8Z24 - 1, // Z32FS8 + 1, // E5B9G9R9_FLOAT + 1, // D32_FLOAT + 1, // D16_UNORM + 1, // D24_UNORM_S8_UINT + 1, // S8_UINT_D24_UNORM + 1, // D32_FLOAT_S8_UINT }}; static constexpr u32 GetDefaultBlockHeight(PixelFormat format) { @@ -407,85 +455,97 @@ static constexpr u32 GetDefaultBlockHeight(PixelFormat format) { } constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ - 32, // ABGR8U - 32, // ABGR8S - 32, // ABGR8UI - 16, // B5G6R5U - 32, // A2B10G10R10U - 16, // A1B5G5R5U - 8, // R8U - 8, // R8UI - 64, // RGBA16F - 64, // RGBA16U - 64, // RGBA16S - 64, // RGBA16UI - 32, // R11FG11FB10F - 128, // RGBA32UI - 64, // DXT1 - 128, // DXT23 - 128, // DXT45 - 64, // DXN1 - 128, // DXN2UNORM - 128, // DXN2SNORM - 128, // BC7U - 128, // BC6H_UF16 - 128, // BC6H_SF16 - 128, // ASTC_2D_4X4 - 32, // BGRA8 - 128, // RGBA32F - 64, // RG32F - 32, // R32F - 16, // R16F - 16, // R16U - 16, // R16S - 16, // R16UI - 16, // R16I - 32, // RG16 - 32, // RG16F - 32, // RG16UI - 32, // RG16I - 32, // RG16S - 96, // RGB32F - 32, // RGBA8_SRGB - 16, // RG8U - 16, // RG8S - 64, // RG32UI - 64, // RGBX16F - 32, // R32UI - 32, // R32I - 128, // ASTC_2D_8X8 - 128, // ASTC_2D_8X5 - 128, // ASTC_2D_5X4 - 32, // BGRA8_SRGB - 64, // DXT1_SRGB - 128, // DXT23_SRGB - 128, // DXT45_SRGB - 128, // BC7U - 16, // R4G4B4A4U + 32, // A8B8G8R8_UNORM + 32, // A8B8G8R8_SNORM + 32, // A8B8G8R8_SINT + 32, // A8B8G8R8_UINT + 16, // R5G6B5_UNORM + 16, // B5G6R5_UNORM + 16, // A1R5G5B5_UNORM + 32, // A2B10G10R10_UNORM + 32, // A2B10G10R10_UINT + 16, // A1B5G5R5_UNORM + 8, // R8_UNORM + 8, // R8_SNORM + 8, // R8_SINT + 8, // R8_UINT + 64, // R16G16B16A16_FLOAT + 64, // R16G16B16A16_UNORM + 64, // R16G16B16A16_SNORM + 64, // R16G16B16A16_SINT + 64, // R16G16B16A16_UINT + 32, // B10G11R11_FLOAT + 128, // R32G32B32A32_UINT + 64, // BC1_RGBA_UNORM + 128, // BC2_UNORM + 128, // BC3_UNORM + 64, // BC4_UNORM + 64, // BC4_SNORM + 128, // BC5_UNORM + 128, // BC5_SNORM + 128, // BC7_UNORM + 128, // BC6H_UFLOAT + 128, // BC6H_SFLOAT + 128, // ASTC_2D_4X4_UNORM + 32, // B8G8R8A8_UNORM + 128, // R32G32B32A32_FLOAT + 128, // R32G32B32A32_SINT + 64, // R32G32_FLOAT + 64, // R32G32_SINT + 32, // R32_FLOAT + 16, // R16_FLOAT + 16, // R16_UNORM + 16, // R16_SNORM + 16, // R16_UINT + 16, // R16_SINT + 32, // R16G16_UNORM + 32, // R16G16_FLOAT + 32, // R16G16_UINT + 32, // R16G16_SINT + 32, // R16G16_SNORM + 96, // R32G32B32_FLOAT + 32, // A8B8G8R8_SRGB + 16, // R8G8_UNORM + 16, // R8G8_SNORM + 16, // R8G8_SINT + 16, // R8G8_UINT + 64, // R32G32_UINT + 64, // R16G16B16X16_FLOAT + 32, // R32_UINT + 32, // R32_SINT + 128, // ASTC_2D_8X8_UNORM + 128, // ASTC_2D_8X5_UNORM + 128, // ASTC_2D_5X4_UNORM + 32, // B8G8R8A8_SRGB + 64, // BC1_RGBA_SRGB + 128, // BC2_SRGB + 128, // BC3_SRGB + 128, // BC7_UNORM + 16, // A4B4G4R4_UNORM 128, // ASTC_2D_4X4_SRGB 128, // ASTC_2D_8X8_SRGB 128, // ASTC_2D_8X5_SRGB 128, // ASTC_2D_5X4_SRGB - 128, // ASTC_2D_5X5 + 128, // ASTC_2D_5X5_UNORM 128, // ASTC_2D_5X5_SRGB - 128, // ASTC_2D_10X8 + 128, // ASTC_2D_10X8_UNORM 128, // ASTC_2D_10X8_SRGB - 128, // ASTC_2D_6X6 + 128, // ASTC_2D_6X6_UNORM 128, // ASTC_2D_6X6_SRGB - 128, // ASTC_2D_10X10 + 128, // ASTC_2D_10X10_UNORM 128, // ASTC_2D_10X10_SRGB - 128, // ASTC_2D_12X12 + 128, // ASTC_2D_12X12_UNORM 128, // ASTC_2D_12X12_SRGB - 128, 
// ASTC_2D_8X6 + 128, // ASTC_2D_8X6_UNORM 128, // ASTC_2D_8X6_SRGB - 128, // ASTC_2D_6X5 + 128, // ASTC_2D_6X5_UNORM 128, // ASTC_2D_6X5_SRGB - 32, // E5B9G9R9F - 32, // Z32F - 16, // Z16 - 32, // Z24S8 - 32, // S8Z24 - 64, // Z32FS8 + 32, // E5B9G9R9_FLOAT + 32, // D32_FLOAT + 16, // D16_UNORM + 32, // D24_UNORM_S8_UINT + 32, // S8_UINT_D24_UNORM + 64, // D32_FLOAT_S8_UINT }}; static constexpr u32 GetFormatBpp(PixelFormat format) { @@ -524,7 +584,4 @@ bool IsPixelFormatSRGB(PixelFormat format); std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); -/// Returns true if the specified PixelFormat is a BCn format, e.g. DXT or DXN -bool IsFormatBCn(PixelFormat format); - } // namespace VideoCore::Surface diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index e151c26c4..7d5a75648 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -19,8 +19,6 @@ constexpr auto SNORM = ComponentType::SNORM; constexpr auto UNORM = ComponentType::UNORM; constexpr auto SINT = ComponentType::SINT; constexpr auto UINT = ComponentType::UINT; -constexpr auto SNORM_FORCE_FP16 = ComponentType::SNORM_FORCE_FP16; -constexpr auto UNORM_FORCE_FP16 = ComponentType::UNORM_FORCE_FP16; constexpr auto FLOAT = ComponentType::FLOAT; constexpr bool C = false; // Normal color constexpr bool S = true; // Srgb @@ -41,117 +39,126 @@ struct Table { ComponentType alpha_component; bool is_srgb; }; -constexpr std::array<Table, 76> DefinitionTable = {{ - {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, - {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, - {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, - {TextureFormat::A8R8G8B8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA8_SRGB}, +constexpr std::array<Table, 86> DefinitionTable = {{ + {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A8B8G8R8_UNORM}, + {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::A8B8G8R8_SNORM}, + {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::A8B8G8R8_UINT}, + {TextureFormat::A8R8G8B8, C, SINT, SINT, SINT, SINT, PixelFormat::A8B8G8R8_SINT}, + {TextureFormat::A8R8G8B8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::A8B8G8R8_SRGB}, - {TextureFormat::B5G6R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::B5G6R5U}, + {TextureFormat::B5G6R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::B5G6R5_UNORM}, - {TextureFormat::A2B10G10R10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A2B10G10R10U}, + {TextureFormat::A2B10G10R10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A2B10G10R10_UNORM}, + {TextureFormat::A2B10G10R10, C, UINT, UINT, UINT, UINT, PixelFormat::A2B10G10R10_UINT}, - {TextureFormat::A1B5G5R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A1B5G5R5U}, + {TextureFormat::A1B5G5R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A1B5G5R5_UNORM}, - {TextureFormat::A4B4G4R4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R4G4B4A4U}, + {TextureFormat::A4B4G4R4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A4B4G4R4_UNORM}, - {TextureFormat::R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8U}, - {TextureFormat::R8, C, UINT, UINT, UINT, UINT, PixelFormat::R8UI}, + {TextureFormat::R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8_UNORM}, + {TextureFormat::R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R8_SNORM}, + {TextureFormat::R8, C, UINT, UINT, UINT, UINT, PixelFormat::R8_UINT}, + 
{TextureFormat::R8, C, SINT, SINT, SINT, SINT, PixelFormat::R8_SINT}, - {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U}, - {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S}, + {TextureFormat::R8G8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8G8_UNORM}, + {TextureFormat::R8G8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R8G8_SNORM}, + {TextureFormat::R8G8, C, UINT, UINT, UINT, UINT, PixelFormat::R8G8_UINT}, + {TextureFormat::R8G8, C, SINT, SINT, SINT, SINT, PixelFormat::R8G8_SINT}, - {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S}, - {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U}, - {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F}, - {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI}, + {TextureFormat::R16G16B16A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16G16B16A16_SNORM}, + {TextureFormat::R16G16B16A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16G16B16A16_UNORM}, + {TextureFormat::R16G16B16A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16G16B16A16_FLOAT}, + {TextureFormat::R16G16B16A16, C, UINT, UINT, UINT, UINT, PixelFormat::R16G16B16A16_UINT}, + {TextureFormat::R16G16B16A16, C, SINT, SINT, SINT, SINT, PixelFormat::R16G16B16A16_SINT}, - {TextureFormat::R16_G16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RG16F}, - {TextureFormat::R16_G16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG16}, - {TextureFormat::R16_G16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG16S}, - {TextureFormat::R16_G16, C, UINT, UINT, UINT, UINT, PixelFormat::RG16UI}, - {TextureFormat::R16_G16, C, SINT, SINT, SINT, SINT, PixelFormat::RG16I}, + {TextureFormat::R16G16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16G16_FLOAT}, + {TextureFormat::R16G16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16G16_UNORM}, + {TextureFormat::R16G16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16G16_SNORM}, + {TextureFormat::R16G16, C, UINT, UINT, UINT, UINT, PixelFormat::R16G16_UINT}, + {TextureFormat::R16G16, C, SINT, SINT, SINT, SINT, PixelFormat::R16G16_SINT}, - {TextureFormat::R16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16F}, - {TextureFormat::R16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16U}, - {TextureFormat::R16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16S}, - {TextureFormat::R16, C, UINT, UINT, UINT, UINT, PixelFormat::R16UI}, - {TextureFormat::R16, C, SINT, SINT, SINT, SINT, PixelFormat::R16I}, + {TextureFormat::R16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16_FLOAT}, + {TextureFormat::R16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16_UNORM}, + {TextureFormat::R16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16_SNORM}, + {TextureFormat::R16, C, UINT, UINT, UINT, UINT, PixelFormat::R16_UINT}, + {TextureFormat::R16, C, SINT, SINT, SINT, SINT, PixelFormat::R16_SINT}, - {TextureFormat::BF10GF11RF11, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R11FG11FB10F}, + {TextureFormat::B10G11R11, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::B10G11R11_FLOAT}, - {TextureFormat::R32_G32_B32_A32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA32F}, - {TextureFormat::R32_G32_B32_A32, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA32UI}, + {TextureFormat::R32G32B32A32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32B32A32_FLOAT}, + {TextureFormat::R32G32B32A32, C, UINT, UINT, UINT, UINT, PixelFormat::R32G32B32A32_UINT}, + {TextureFormat::R32G32B32A32, C, SINT, SINT, SINT, SINT, PixelFormat::R32G32B32A32_SINT}, - 
{TextureFormat::R32_G32_B32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGB32F}, + {TextureFormat::R32G32B32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32B32_FLOAT}, - {TextureFormat::R32_G32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RG32F}, - {TextureFormat::R32_G32, C, UINT, UINT, UINT, UINT, PixelFormat::RG32UI}, + {TextureFormat::R32G32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32_FLOAT}, + {TextureFormat::R32G32, C, UINT, UINT, UINT, UINT, PixelFormat::R32G32_UINT}, + {TextureFormat::R32G32, C, SINT, SINT, SINT, SINT, PixelFormat::R32G32_SINT}, - {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32F}, - {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32UI}, - {TextureFormat::R32, C, SINT, SINT, SINT, SINT, PixelFormat::R32I}, + {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32_FLOAT}, + {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32_UINT}, + {TextureFormat::R32, C, SINT, SINT, SINT, SINT, PixelFormat::R32_SINT}, - {TextureFormat::E5B9G9R9_SHAREDEXP, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9F}, + {TextureFormat::E5B9G9R9, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9_FLOAT}, - {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F}, - {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16}, - {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, - {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8}, + {TextureFormat::D32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::D32_FLOAT}, + {TextureFormat::D16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::D16_UNORM}, + {TextureFormat::S8D24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8_UINT_D24_UNORM}, + {TextureFormat::R8G24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8_UINT_D24_UNORM}, + {TextureFormat::D32S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::D32_FLOAT_S8_UINT}, - {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1}, - {TextureFormat::DXT1, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1_SRGB}, + {TextureFormat::BC1_RGBA, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC1_RGBA_UNORM}, + {TextureFormat::BC1_RGBA, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC1_RGBA_SRGB}, - {TextureFormat::DXT23, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT23}, - {TextureFormat::DXT23, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT23_SRGB}, + {TextureFormat::BC2, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC2_UNORM}, + {TextureFormat::BC2, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC2_SRGB}, - {TextureFormat::DXT45, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT45}, - {TextureFormat::DXT45, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT45_SRGB}, + {TextureFormat::BC3, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC3_UNORM}, + {TextureFormat::BC3, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC3_SRGB}, - // TODO: Use a different pixel format for SNORM - {TextureFormat::DXN1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXN1}, - {TextureFormat::DXN1, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::DXN1}, + {TextureFormat::BC4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC4_UNORM}, + {TextureFormat::BC4, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::BC4_SNORM}, - {TextureFormat::DXN2, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXN2UNORM}, - {TextureFormat::DXN2, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::DXN2SNORM}, + {TextureFormat::BC5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC5_UNORM}, + {TextureFormat::BC5, C, SNORM, SNORM, SNORM, SNORM, 
PixelFormat::BC5_SNORM}, - {TextureFormat::BC7U, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7U}, - {TextureFormat::BC7U, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7U_SRGB}, + {TextureFormat::BC7, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7_UNORM}, + {TextureFormat::BC7, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7_SRGB}, - {TextureFormat::BC6H_SF16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_SF16}, - {TextureFormat::BC6H_UF16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_UF16}, + {TextureFormat::BC6H_SFLOAT, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_SFLOAT}, + {TextureFormat::BC6H_UFLOAT, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_UFLOAT}, - {TextureFormat::ASTC_2D_4X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4}, + {TextureFormat::ASTC_2D_4X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4_UNORM}, {TextureFormat::ASTC_2D_4X4, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4_SRGB}, - {TextureFormat::ASTC_2D_5X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4}, + {TextureFormat::ASTC_2D_5X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4_UNORM}, {TextureFormat::ASTC_2D_5X4, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4_SRGB}, - {TextureFormat::ASTC_2D_5X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5}, + {TextureFormat::ASTC_2D_5X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5_UNORM}, {TextureFormat::ASTC_2D_5X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5_SRGB}, - {TextureFormat::ASTC_2D_8X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8}, + {TextureFormat::ASTC_2D_8X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8_UNORM}, {TextureFormat::ASTC_2D_8X8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8_SRGB}, - {TextureFormat::ASTC_2D_8X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5}, + {TextureFormat::ASTC_2D_8X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5_UNORM}, {TextureFormat::ASTC_2D_8X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5_SRGB}, - {TextureFormat::ASTC_2D_10X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8}, + {TextureFormat::ASTC_2D_10X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8_UNORM}, {TextureFormat::ASTC_2D_10X8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8_SRGB}, - {TextureFormat::ASTC_2D_6X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6}, + {TextureFormat::ASTC_2D_6X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6_UNORM}, {TextureFormat::ASTC_2D_6X6, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6_SRGB}, - {TextureFormat::ASTC_2D_10X10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10}, + {TextureFormat::ASTC_2D_10X10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10_UNORM}, {TextureFormat::ASTC_2D_10X10, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10_SRGB}, - {TextureFormat::ASTC_2D_12X12, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12}, + {TextureFormat::ASTC_2D_12X12, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12_UNORM}, {TextureFormat::ASTC_2D_12X12, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12_SRGB}, - {TextureFormat::ASTC_2D_8X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6}, + {TextureFormat::ASTC_2D_8X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6_UNORM}, {TextureFormat::ASTC_2D_8X6, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6_SRGB}, - {TextureFormat::ASTC_2D_6X5, C, UNORM, UNORM, UNORM, UNORM, 
PixelFormat::ASTC_2D_6X5}, + {TextureFormat::ASTC_2D_6X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X5_UNORM}, {TextureFormat::ASTC_2D_6X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X5_SRGB}, }}; @@ -182,7 +189,7 @@ PixelFormat FormatLookupTable::GetPixelFormat(TextureFormat format, bool is_srgb static_cast<int>(format), is_srgb, static_cast<int>(red_component), static_cast<int>(green_component), static_cast<int>(blue_component), static_cast<int>(alpha_component)); - return PixelFormat::ABGR8U; + return PixelFormat::A8B8G8R8_UNORM; } void FormatLookupTable::Set(TextureFormat format, bool is_srgb, ComponentType red_component, @@ -196,9 +203,9 @@ std::size_t FormatLookupTable::CalculateIndex(TextureFormat format, bool is_srgb ComponentType alpha_component) noexcept { const auto format_index = static_cast<std::size_t>(format); const auto red_index = static_cast<std::size_t>(red_component); - const auto green_index = static_cast<std::size_t>(red_component); - const auto blue_index = static_cast<std::size_t>(red_component); - const auto alpha_index = static_cast<std::size_t>(red_component); + const auto green_index = static_cast<std::size_t>(green_component); + const auto blue_index = static_cast<std::size_t>(blue_component); + const auto alpha_index = static_cast<std::size_t>(alpha_component); const std::size_t srgb_index = is_srgb ? 1 : 0; return format_index * PerFormat + diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 7af0e792c..b44c09d71 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -115,17 +115,24 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap( if (gpu_addr == candidate_gpu_addr) { return {{0, 0}}; } + if (candidate_gpu_addr < gpu_addr) { - return {}; + return std::nullopt; } + const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)}; const auto layer{static_cast<u32>(relative_address / layer_size)}; + if (layer >= params.depth) { + return std::nullopt; + } + const GPUVAddr mipmap_address = relative_address - layer_size * layer; const auto mipmap_it = Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address); if (mipmap_it == mipmap_offsets.end()) { - return {}; + return std::nullopt; } + const auto level{static_cast<u32>(std::distance(mipmap_offsets.begin(), mipmap_it))}; return std::make_pair(layer, level); } @@ -225,7 +232,7 @@ void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager, } } - if (!is_converted && params.pixel_format != PixelFormat::S8Z24) { + if (!is_converted && params.pixel_format != PixelFormat::S8_UINT_D24_UNORM) { return; } @@ -251,6 +258,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, tmp_buffer.resize(guest_memory_size); host_ptr = tmp_buffer.data(); + if (params.target == SurfaceTarget::Texture3D) { + // Special case for 3D texture segments + memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); + } + if (params.is_tiled) { ASSERT_MSG(params.block_width == 0, "Block width is defined as {}", params.block_width); for (u32 level = 0; level < params.num_levels; ++level) { diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h index a39a8661b..173f2edba 100644 --- a/src/video_core/texture_cache/surface_base.h +++ b/src/video_core/texture_cache/surface_base.h @@ -72,9 +72,9 @@ public: return (cpu_addr < end) && (cpu_addr_end > start); } - 
bool IsInside(const GPUVAddr other_start, const GPUVAddr other_end) {
+ bool IsInside(const GPUVAddr other_start, const GPUVAddr other_end) const {
const GPUVAddr gpu_addr_end = gpu_addr + guest_memory_size;
- return (gpu_addr <= other_start && other_end <= gpu_addr_end);
+ return gpu_addr <= other_start && other_end <= gpu_addr_end;
}
// Use only when recycling a surface
@@ -192,6 +192,22 @@ public:
index = index_;
}
+ void SetMemoryMarked(bool is_memory_marked_) {
+ is_memory_marked = is_memory_marked_;
+ }
+
+ bool IsMemoryMarked() const {
+ return is_memory_marked;
+ }
+
+ void SetSyncPending(bool is_sync_pending_) {
+ is_sync_pending = is_sync_pending_;
+ }
+
+ bool IsSyncPending() const {
+ return is_sync_pending;
+ }
+
void MarkAsPicked(bool is_picked_) {
is_picked = is_picked_;
}
@@ -201,8 +217,8 @@ public:
}
bool IsProtected() const {
- // Only 3D Slices are to be protected
- return is_target && params.block_depth > 0;
+ // Only 3D slices are to be protected
+ return is_target && params.target == SurfaceTarget::Texture3D;
}
bool IsRenderTarget() const {
@@ -234,6 +250,11 @@ public:
return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels));
}
+ TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) {
+ return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth,
+ base_level, num_levels));
+ }
+
std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params,
const GPUVAddr view_addr,
const std::size_t candidate_size, const u32 mipmap,
@@ -256,8 +277,8 @@ public:
std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr,
const std::size_t candidate_size) {
if (params.target == SurfaceTarget::Texture3D ||
- (params.num_levels == 1 && !params.is_layered) ||
- view_params.target == SurfaceTarget::Texture3D) {
+ view_params.target == SurfaceTarget::Texture3D ||
+ (params.num_levels == 1 && !params.is_layered)) {
return {};
}
const auto layer_mipmap{GetLayerMipmap(view_addr)};
@@ -303,6 +324,8 @@ private:
bool is_target{};
bool is_registered{};
bool is_picked{};
+ bool is_memory_marked{};
+ bool is_sync_pending{};
u32 index{NO_RT};
u64 modification_tick{};
};
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 6f3ef45be..13dd16356 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -74,21 +74,21 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta
SurfaceParams params;
params.is_tiled = tic.IsTiled();
params.srgb_conversion = tic.IsSrgbConversionEnabled();
- params.block_width = params.is_tiled ? tic.BlockWidth() : 0,
- params.block_height = params.is_tiled ? tic.BlockHeight() : 0,
- params.block_depth = params.is_tiled ? tic.BlockDepth() : 0,
+ params.block_width = params.is_tiled ? tic.BlockWidth() : 0;
+ params.block_height = params.is_tiled ? tic.BlockHeight() : 0;
+ params.block_depth = params.is_tiled ? tic.BlockDepth() : 0;
params.tile_width_spacing = params.is_tiled ? (1 << tic.tile_width_spacing.Value()) : 1;
params.pixel_format = lookup_table.GetPixelFormat(
tic.format, params.srgb_conversion, tic.r_type, tic.g_type, tic.b_type, tic.a_type);
params.type = GetFormatType(params.pixel_format);
- if (entry.IsShadow() && params.type == SurfaceType::ColorTexture) {
+ if (entry.is_shadow && params.type == SurfaceType::ColorTexture) {
switch (params.pixel_format) {
- case PixelFormat::R16U:
- case PixelFormat::R16F:
- params.pixel_format = PixelFormat::Z16;
+ case PixelFormat::R16_UNORM:
+ case PixelFormat::R16_FLOAT:
+ params.pixel_format = PixelFormat::D16_UNORM;
break;
- case PixelFormat::R32F:
- params.pixel_format = PixelFormat::Z32F;
+ case PixelFormat::R32_FLOAT:
+ params.pixel_format = PixelFormat::D32_FLOAT;
break;
default:
UNIMPLEMENTED_MSG("Unimplemented shadow convert format: {}",
@@ -96,7 +96,6 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta
}
params.type = GetFormatType(params.pixel_format);
}
- params.type = GetFormatType(params.pixel_format);
// TODO: on 1DBuffer we should use the tic info.
if (tic.IsBuffer()) {
params.target = SurfaceTarget::TextureBuffer;
params.width = tic.Width();
params.pitch = params.width * params.GetBytesPerPixel();
params.height = 1;
params.depth = 1;
params.num_levels = 1;
params.emulated_levels = 1;
params.is_layered = false;
} else {
- params.target = TextureTypeToSurfaceTarget(entry.GetType(), entry.IsArray());
+ params.target = TextureTypeToSurfaceTarget(entry.type, entry.is_array);
params.width = tic.Width();
params.height = tic.Height();
params.depth = tic.Depth();
@@ -130,15 +129,14 @@ SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_tabl
SurfaceParams params;
params.is_tiled = tic.IsTiled();
params.srgb_conversion = tic.IsSrgbConversionEnabled();
- params.block_width = params.is_tiled ? tic.BlockWidth() : 0,
- params.block_height = params.is_tiled ? tic.BlockHeight() : 0,
- params.block_depth = params.is_tiled ? tic.BlockDepth() : 0,
+ params.block_width = params.is_tiled ? tic.BlockWidth() : 0;
+ params.block_height = params.is_tiled ? tic.BlockHeight() : 0;
+ params.block_depth = params.is_tiled ? tic.BlockDepth() : 0;
params.tile_width_spacing = params.is_tiled ? (1 << tic.tile_width_spacing.Value()) : 1;
params.pixel_format = lookup_table.GetPixelFormat(
tic.format, params.srgb_conversion, tic.r_type, tic.g_type, tic.b_type, tic.a_type);
params.type = GetFormatType(params.pixel_format);
- params.type = GetFormatType(params.pixel_format);
- params.target = ImageTypeToSurfaceTarget(entry.GetType());
+ params.target = ImageTypeToSurfaceTarget(entry.type);
// TODO: on 1DBuffer we should use the tic info.
if (tic.IsBuffer()) { params.target = SurfaceTarget::TextureBuffer; @@ -165,39 +163,40 @@ SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_tabl return params; } -SurfaceParams SurfaceParams::CreateForDepthBuffer(Core::System& system) { - const auto& regs = system.GPU().Maxwell3D().regs; - regs.zeta_width, regs.zeta_height, regs.zeta.format, regs.zeta.memory_layout.type; - SurfaceParams params; - params.is_tiled = regs.zeta.memory_layout.type == - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; - params.srgb_conversion = false; - params.block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U); - params.block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U); - params.block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U); - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); - params.type = GetFormatType(params.pixel_format); - params.width = regs.zeta_width; - params.height = regs.zeta_height; - params.pitch = 0; - params.num_levels = 1; - params.emulated_levels = 1; - - const bool is_layered = regs.zeta_layers > 1 && params.block_depth == 0; - params.is_layered = is_layered; - params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; - params.depth = is_layered ? regs.zeta_layers.Value() : 1U; - return params; +SurfaceParams SurfaceParams::CreateForDepthBuffer(Tegra::Engines::Maxwell3D& maxwell3d) { + const auto& regs = maxwell3d.regs; + const auto block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U); + const bool is_layered = regs.zeta_layers > 1 && block_depth == 0; + const auto pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); + return { + .is_tiled = regs.zeta.memory_layout.type == + Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear, + .srgb_conversion = false, + .is_layered = is_layered, + .block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U), + .block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U), + .block_depth = block_depth, + .tile_width_spacing = 1, + .width = regs.zeta_width, + .height = regs.zeta_height, + .depth = is_layered ? regs.zeta_layers.Value() : 1U, + .pitch = 0, + .num_levels = 1, + .emulated_levels = 1, + .pixel_format = pixel_format, + .type = GetFormatType(pixel_format), + .target = is_layered ? 
SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D, + }; } -SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::size_t index) { - const auto& config{system.GPU().Maxwell3D().regs.rt[index]}; +SurfaceParams SurfaceParams::CreateForFramebuffer(Tegra::Engines::Maxwell3D& maxwell3d, + std::size_t index) { + const auto& config{maxwell3d.regs.rt[index]}; SurfaceParams params; params.is_tiled = config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; - params.srgb_conversion = config.format == Tegra::RenderTargetFormat::BGRA8_SRGB || - config.format == Tegra::RenderTargetFormat::RGBA8_SRGB; + params.srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || + config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB; params.block_width = config.memory_layout.block_width; params.block_height = config.memory_layout.block_height; params.block_depth = config.memory_layout.block_depth; @@ -216,45 +215,60 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz params.num_levels = 1; params.emulated_levels = 1; - const bool is_layered = config.layers > 1 && params.block_depth == 0; - params.is_layered = is_layered; - params.depth = is_layered ? config.layers.Value() : 1; - params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; + if (config.memory_layout.is_3d != 0) { + params.depth = config.layers.Value(); + params.is_layered = false; + params.target = SurfaceTarget::Texture3D; + } else if (config.layers > 1) { + params.depth = config.layers.Value(); + params.is_layered = true; + params.target = SurfaceTarget::Texture2DArray; + } else { + params.depth = 1; + params.is_layered = false; + params.target = SurfaceTarget::Texture2D; + } return params; } SurfaceParams SurfaceParams::CreateForFermiCopySurface( const Tegra::Engines::Fermi2D::Regs::Surface& config) { - SurfaceParams params{}; - params.is_tiled = !config.linear; - params.srgb_conversion = config.format == Tegra::RenderTargetFormat::BGRA8_SRGB || - config.format == Tegra::RenderTargetFormat::RGBA8_SRGB; - params.block_width = params.is_tiled ? std::min(config.BlockWidth(), 5U) : 0, - params.block_height = params.is_tiled ? std::min(config.BlockHeight(), 5U) : 0, - params.block_depth = params.is_tiled ? std::min(config.BlockDepth(), 5U) : 0, - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); - params.type = GetFormatType(params.pixel_format); - params.width = config.width; - params.height = config.height; - params.pitch = config.pitch; - // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters - params.target = SurfaceTarget::Texture2D; - params.depth = 1; - params.num_levels = 1; - params.emulated_levels = 1; + const bool is_tiled = !config.linear; + const auto pixel_format = PixelFormatFromRenderTargetFormat(config.format); + + SurfaceParams params{ + .is_tiled = is_tiled, + .srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || + config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB, + .is_layered = false, + .block_width = is_tiled ? std::min(config.BlockWidth(), 5U) : 0U, + .block_height = is_tiled ? std::min(config.BlockHeight(), 5U) : 0U, + .block_depth = is_tiled ? 
std::min(config.BlockDepth(), 5U) : 0U, + .tile_width_spacing = 1, + .width = config.width, + .height = config.height, + .depth = 1, + .pitch = config.pitch, + .num_levels = 1, + .emulated_levels = 1, + .pixel_format = pixel_format, + .type = GetFormatType(pixel_format), + // TODO(Rodrigo): Try to guess texture arrays from parameters + .target = SurfaceTarget::Texture2D, + }; + params.is_layered = params.IsLayered(); return params; } VideoCore::Surface::SurfaceTarget SurfaceParams::ExpectedTarget( const VideoCommon::Shader::Sampler& entry) { - return TextureTypeToSurfaceTarget(entry.GetType(), entry.IsArray()); + return TextureTypeToSurfaceTarget(entry.type, entry.is_array); } VideoCore::Surface::SurfaceTarget SurfaceParams::ExpectedTarget( const VideoCommon::Shader::Image& entry) { - return ImageTypeToSurfaceTarget(entry.GetType()); + return ImageTypeToSurfaceTarget(entry.type); } bool SurfaceParams::IsLayered() const { @@ -335,8 +349,7 @@ std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) co size += GetInnerMipmapMemorySize(level, as_host_size, uncompressed); } if (is_tiled && is_layered) { - return Common::AlignBits(size, - Tegra::Texture::GetGOBSizeShift() + block_height + block_depth); + return Common::AlignBits(size, Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth); } return size; } @@ -410,7 +423,7 @@ std::tuple<u32, u32, u32> SurfaceParams::GetBlockOffsetXYZ(u32 offset) const { const u32 block_size = GetBlockSize(); const u32 block_index = offset / block_size; const u32 gob_offset = offset % block_size; - const u32 gob_index = gob_offset / static_cast<u32>(Tegra::Texture::GetGOBSize()); + const u32 gob_index = gob_offset / static_cast<u32>(Tegra::Texture::GOB_SIZE); const u32 x_gob_pixels = 64U / GetBytesPerPixel(); const u32 x_block_pixels = x_gob_pixels << block_width; const u32 y_block_pixels = 8U << block_height; diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h index 24957df8d..4466c3c34 100644 --- a/src/video_core/texture_cache/surface_params.h +++ b/src/video_core/texture_cache/surface_params.h @@ -33,10 +33,11 @@ public: const VideoCommon::Shader::Image& entry); /// Creates SurfaceCachedParams for a depth buffer configuration. - static SurfaceParams CreateForDepthBuffer(Core::System& system); + static SurfaceParams CreateForDepthBuffer(Tegra::Engines::Maxwell3D& maxwell3d); /// Creates SurfaceCachedParams from a framebuffer configuration. - static SurfaceParams CreateForFramebuffer(Core::System& system, std::size_t index); + static SurfaceParams CreateForFramebuffer(Tegra::Engines::Maxwell3D& maxwell3d, + std::size_t index); /// Creates SurfaceCachedParams from a Fermi2D surface configuration. static SurfaceParams CreateForFermiCopySurface( @@ -204,7 +205,7 @@ public: static std::size_t AlignLayered(const std::size_t out_size, const u32 block_height, const u32 block_depth) { return Common::AlignBits(out_size, - Tegra::Texture::GetGOBSizeShift() + block_height + block_depth); + Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth); } /// Converts a width from a type of surface into another. 
This helps represent the diff --git a/src/video_core/texture_cache/surface_view.cpp b/src/video_core/texture_cache/surface_view.cpp index 57a1f5803..6b5f5984b 100644 --- a/src/video_core/texture_cache/surface_view.cpp +++ b/src/video_core/texture_cache/surface_view.cpp @@ -20,4 +20,8 @@ bool ViewParams::operator==(const ViewParams& rhs) const { std::tie(rhs.base_layer, rhs.num_layers, rhs.base_level, rhs.num_levels, rhs.target); } +bool ViewParams::operator!=(const ViewParams& rhs) const { + return !operator==(rhs); +} + } // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_view.h b/src/video_core/texture_cache/surface_view.h index b17fd11a9..90a8bb0ae 100644 --- a/src/video_core/texture_cache/surface_view.h +++ b/src/video_core/texture_cache/surface_view.h @@ -21,6 +21,7 @@ struct ViewParams { std::size_t Hash() const; bool operator==(const ViewParams& rhs) const; + bool operator!=(const ViewParams& rhs) const; bool IsLayered() const { switch (target) { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 4edd4313b..ea835c59f 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -6,6 +6,7 @@ #include <algorithm> #include <array> +#include <list> #include <memory> #include <mutex> #include <set> @@ -13,6 +14,7 @@ #include <unordered_map> #include <vector> +#include <boost/container/small_vector.hpp> #include <boost/icl/interval_map.hpp> #include <boost/range/iterator_range.hpp> @@ -22,6 +24,7 @@ #include "core/core.h" #include "core/memory.h" #include "core/settings.h" +#include "video_core/compatible_formats.h" #include "video_core/dirty_flags.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/maxwell_3d.h" @@ -45,13 +48,14 @@ class RasterizerInterface; namespace VideoCommon { +using VideoCore::Surface::FormatCompatibility; using VideoCore::Surface::PixelFormat; - using VideoCore::Surface::SurfaceTarget; using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig; template <typename TSurface, typename TView> class TextureCache { + using VectorSurface = boost::container::small_vector<TSurface, 1>; public: void InvalidateRegion(VAddr addr, std::size_t size) { @@ -62,6 +66,30 @@ public: } } + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{mutex}; + + for (const auto& surface : GetSurfacesInRegion(addr, size)) { + if (surface->IsMemoryMarked()) { + UnmarkMemory(surface); + surface->SetSyncPending(true); + marked_for_unregister.emplace_back(surface); + } + } + } + + void SyncGuestHost() { + std::lock_guard lock{mutex}; + + for (const auto& surface : marked_for_unregister) { + if (surface->IsRegistered()) { + surface->SetSyncPending(false); + Unregister(surface); + } + } + marked_for_unregister.clear(); + } + /** * Guarantees that rendertargets don't unregister themselves if the * collide. Protection is currently only done on 3D slices. 
@@ -85,10 +113,20 @@ public: return a->GetModificationTick() < b->GetModificationTick(); }); for (const auto& surface : surfaces) { + mutex.unlock(); FlushSurface(surface); + mutex.lock(); } } + bool MustFlushRegion(VAddr addr, std::size_t size) { + std::lock_guard lock{mutex}; + + const auto surfaces = GetSurfacesInRegion(addr, size); + return std::any_of(surfaces.cbegin(), surfaces.cend(), + [](const TSurface& surface) { return surface->IsModified(); }); + } + TView GetTextureSurface(const Tegra::Texture::TICEntry& tic, const VideoCommon::Shader::Sampler& entry) { std::lock_guard lock{mutex}; @@ -97,8 +135,7 @@ public: return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } @@ -108,7 +145,7 @@ public: } const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; - const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false); + const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); if (guard_samplers) { sampled_textures.push_back(surface); } @@ -122,13 +159,12 @@ public: if (!gpu_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } const auto params{SurfaceParams::CreateForImage(format_lookup_table, tic, entry)}; - const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, false); + const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); if (guard_samplers) { sampled_textures.push_back(surface); } @@ -143,13 +179,13 @@ public: return any_rt; } - TView GetDepthBufferSurface() { + TView GetDepthBufferSurface(bool preserve_contents) { std::lock_guard lock{mutex}; - auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { + auto& dirty = maxwell3d.dirty; + if (!dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { return depth_buffer.view; } - maxwell3d.dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false; + dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false; const auto& regs{maxwell3d.regs}; const auto gpu_addr{regs.zeta.Address()}; @@ -157,14 +193,13 @@ public: SetEmptyDepthBuffer(); return {}; } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { SetEmptyDepthBuffer(); return {}; } - const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)}; - auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, true); + const auto depth_params{SurfaceParams::CreateForDepthBuffer(maxwell3d)}; + auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, preserve_contents, true); if (depth_buffer.target) depth_buffer.target->MarkAsRenderTarget(false, NO_RT); depth_buffer.target = surface_view.first; @@ -174,10 +209,9 @@ public: return surface_view.second; } - TView GetColorBufferSurface(std::size_t index) { + TView GetColorBufferSurface(std::size_t index, bool preserve_contents) { std::lock_guard lock{mutex}; ASSERT(index < 
Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); - auto& maxwell3d = system.GPU().Maxwell3D(); if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index]) { return render_targets[index].view; } @@ -197,17 +231,23 @@ public: return {}; } - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { SetEmptyColorBuffer(index); return {}; } - auto surface_view = GetSurface(gpu_addr, *cpu_addr, - SurfaceParams::CreateForFramebuffer(system, index), true); - if (render_targets[index].target) - render_targets[index].target->MarkAsRenderTarget(false, NO_RT); + auto surface_view = + GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(maxwell3d, index), + preserve_contents, true); + if (render_targets[index].target) { + auto& surface = render_targets[index].target; + surface->MarkAsRenderTarget(false, NO_RT); + const auto& cr_params = surface->GetSurfaceParams(); + if (!cr_params.is_tiled && Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + AsyncFlushSurface(surface); + } + } render_targets[index].target = surface_view.first; render_targets[index].view = surface_view.second; if (render_targets[index].target) @@ -254,40 +294,69 @@ public: const GPUVAddr src_gpu_addr = src_config.Address(); const GPUVAddr dst_gpu_addr = dst_config.Address(); DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr); - const std::optional<VAddr> dst_cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr); - const std::optional<VAddr> src_cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr); - std::pair<TSurface, TView> dst_surface = - GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, false); - std::pair<TSurface, TView> src_surface = - GetSurface(src_gpu_addr, *src_cpu_addr, src_params, false); - ImageBlit(src_surface.second, dst_surface.second, copy_config); + + const std::optional<VAddr> dst_cpu_addr = gpu_memory.GpuToCpuAddress(dst_gpu_addr); + const std::optional<VAddr> src_cpu_addr = gpu_memory.GpuToCpuAddress(src_gpu_addr); + std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); + TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second; + ImageBlit(src_surface, dst_surface.second, copy_config); dst_surface.first->MarkAsModified(true, Tick()); } - TSurface TryFindFramebufferSurface(VAddr addr) { + TSurface TryFindFramebufferSurface(VAddr addr) const { if (!addr) { return nullptr; } const VAddr page = addr >> registry_page_bits; - std::vector<TSurface>& list = registry[page]; - for (auto& surface : list) { - if (surface->GetCpuAddr() == addr) { - return surface; - } + const auto it = registry.find(page); + if (it == registry.end()) { + return nullptr; } - return nullptr; + const auto& list = it->second; + const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) { + return surface->GetCpuAddr() == addr; + }); + return found != list.end() ? 
*found : nullptr; } u64 Tick() { return ++ticks; } + void CommitAsyncFlushes() { + committed_flushes.push_back(uncommitted_flushes); + uncommitted_flushes.reset(); + } + + bool HasUncommittedFlushes() const { + return uncommitted_flushes != nullptr; + } + + bool ShouldWaitAsyncFlushes() const { + return !committed_flushes.empty() && committed_flushes.front() != nullptr; + } + + void PopAsyncFlushes() { + if (committed_flushes.empty()) { + return; + } + auto& flush_list = committed_flushes.front(); + if (!flush_list) { + committed_flushes.pop_front(); + return; + } + for (TSurface& surface : *flush_list) { + FlushSurface(surface); + } + committed_flushes.pop_front(); + } + protected: - explicit TextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - bool is_astc_supported) - : system{system}, is_astc_supported{is_astc_supported}, rasterizer{rasterizer} { + explicit TextureCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_, + bool is_astc_supported_) + : is_astc_supported{is_astc_supported_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, + gpu_memory{gpu_memory_} { for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { SetEmptyColorBuffer(i); } @@ -300,9 +369,9 @@ protected: siblings_table[static_cast<std::size_t>(b)] = a; }; std::fill(siblings_table.begin(), siblings_table.end(), PixelFormat::Invalid); - make_siblings(PixelFormat::Z16, PixelFormat::R16U); - make_siblings(PixelFormat::Z32F, PixelFormat::R32F); - make_siblings(PixelFormat::Z32FS8, PixelFormat::RG32F); + make_siblings(PixelFormat::D16_UNORM, PixelFormat::R16_UNORM); + make_siblings(PixelFormat::D32_FLOAT, PixelFormat::R32_FLOAT); + make_siblings(PixelFormat::D32_FLOAT_S8_UINT, PixelFormat::R32G32_FLOAT); sampled_textures.reserve(64); } @@ -322,7 +391,7 @@ protected: virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0; void ManageRenderTargetUnregister(TSurface& surface) { - auto& dirty = system.GPU().Maxwell3D().dirty; + auto& dirty = maxwell3d.dirty; const u32 index = surface->GetRenderTarget(); if (index == DEPTH_RT) { dirty.flags[VideoCommon::Dirty::ZetaBuffer] = true; @@ -335,8 +404,7 @@ protected: void Register(TSurface surface) { const GPUVAddr gpu_addr = surface->GetGpuAddr(); const std::size_t size = surface->GetSizeInBytes(); - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { LOG_CRITICAL(HW_GPU, "Failed to register surface with unmapped gpu_address 0x{:016x}", gpu_addr); @@ -345,9 +413,20 @@ protected: surface->SetCpuAddr(*cpu_addr); RegisterInnerCache(surface); surface->MarkAsRegistered(true); + surface->SetMemoryMarked(true); rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1); } + void UnmarkMemory(TSurface surface) { + if (!surface->IsMemoryMarked()) { + return; + } + const std::size_t size = surface->GetSizeInBytes(); + const VAddr cpu_addr = surface->GetCpuAddr(); + rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); + surface->SetMemoryMarked(false); + } + void Unregister(TSurface surface) { if (guard_render_targets && surface->IsProtected()) { return; @@ -355,9 +434,11 @@ protected: if (!guard_render_targets && surface->IsRenderTarget()) { ManageRenderTargetUnregister(surface); } - const std::size_t size = surface->GetSizeInBytes(); - const VAddr cpu_addr = surface->GetCpuAddr(); - 
rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); + UnmarkMemory(surface); + if (surface->IsSyncPending()) { + marked_for_unregister.remove(surface); + surface->SetSyncPending(false); + } UnregisterInnerCache(surface); surface->MarkAsRegistered(false); ReserveSurface(surface->GetSurfaceParams(), surface); @@ -373,7 +454,6 @@ protected: return new_surface; } - Core::System& system; const bool is_astc_supported; private: @@ -415,18 +495,18 @@ private: * @param untopological Indicates to the recycler that the texture has no way * to match the overlaps due to topological reasons. **/ - RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params, + RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params, const GPUVAddr gpu_addr, const MatchTopologyResult untopological) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { return RecycleStrategy::Flush; } // 3D Textures decision - if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) { + if (params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } for (const auto& s : overlaps) { const auto& s_params = s->GetSurfaceParams(); - if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) { + if (s_params.target == SurfaceTarget::Texture3D) { return RecycleStrategy::Flush; } } @@ -450,18 +530,21 @@ private: * @param overlaps The overlapping surfaces registered in the cache. * @param params The parameters for the new surface. * @param gpu_addr The starting address of the new surface. + * @param preserve_contents Indicates that the new surface should be loaded from memory or left + * blank. * @param untopological Indicates to the recycler that the texture has no way to match the * overlaps due to topological reasons. 
**/ - std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps, - const SurfaceParams& params, const GPUVAddr gpu_addr, + std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params, + const GPUVAddr gpu_addr, const bool preserve_contents, const MatchTopologyResult untopological) { + const bool do_load = preserve_contents && Settings::IsGPULevelExtreme(); for (auto& surface : overlaps) { Unregister(surface); } switch (PickStrategy(overlaps, params, gpu_addr, untopological)) { case RecycleStrategy::Ignore: { - return InitializeSurface(gpu_addr, params, Settings::values.use_accurate_gpu_emulation); + return InitializeSurface(gpu_addr, params, do_load); } case RecycleStrategy::Flush: { std::sort(overlaps.begin(), overlaps.end(), @@ -471,7 +554,7 @@ private: for (auto& surface : overlaps) { FlushSurface(surface); } - return InitializeSurface(gpu_addr, params); + return InitializeSurface(gpu_addr, params, preserve_contents); } case RecycleStrategy::BufferCopy: { auto new_surface = GetUncachedSurface(gpu_addr, params); @@ -480,7 +563,7 @@ private: } default: { UNIMPLEMENTED_MSG("Unimplemented Texture Cache Recycling Strategy!"); - return InitializeSurface(gpu_addr, params); + return InitializeSurface(gpu_addr, params, do_load); } } } @@ -507,15 +590,15 @@ private: } else { new_surface = GetUncachedSurface(gpu_addr, params); } - const auto& final_params = new_surface->GetSurfaceParams(); + const SurfaceParams& final_params = new_surface->GetSurfaceParams(); if (cr_params.type != final_params.type) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { BufferCopy(current_surface, new_surface); } } else { std::vector<CopyParams> bricks = current_surface->BreakDown(final_params); for (auto& brick : bricks) { - ImageCopy(current_surface, new_surface, brick); + TryCopyImage(current_surface, new_surface, brick); } } Unregister(current_surface); @@ -563,47 +646,65 @@ private: * @param params The parameters on the new surface. * @param gpu_addr The starting address of the new surface. 
**/ - std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps, + std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr) { + GPUVAddr gpu_addr) { if (params.target == SurfaceTarget::Texture3D) { - return {}; + return std::nullopt; } - bool modified = false; + const auto test_modified = [](TSurface& surface) { return surface->IsModified(); }; TSurface new_surface = GetUncachedSurface(gpu_addr, params); - u32 passed_tests = 0; + + if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) { + LoadSurface(new_surface); + for (const auto& surface : overlaps) { + Unregister(surface); + } + Register(new_surface); + return {{new_surface, new_surface->GetMainView()}}; + } + + std::size_t passed_tests = 0; for (auto& surface : overlaps) { const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.is_layered || src_params.num_levels > 1) { - // We send this cases to recycle as they are more complex to handle - return {}; - } - const std::size_t candidate_size = surface->GetSizeInBytes(); - auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; + const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; if (!mipmap_layer) { continue; } - const auto [layer, mipmap] = *mipmap_layer; - if (new_surface->GetMipmapSize(mipmap) != candidate_size) { + const auto [base_layer, base_mipmap] = *mipmap_layer; + if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) { continue; } - modified |= surface->IsModified(); - // Now we got all the data set up - const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); - const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); - const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1); - passed_tests++; - ImageCopy(surface, new_surface, copy_params); + ++passed_tests; + + // Copy all mipmaps and layers + const u32 block_width = params.GetDefaultBlockWidth(); + const u32 block_height = params.GetDefaultBlockHeight(); + for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) { + const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); + const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); + if (width < block_width || height < block_height) { + // Current APIs forbid copying small compressed textures, avoid errors + break; + } + const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height, + src_params.depth); + TryCopyImage(surface, new_surface, copy_params); + } } if (passed_tests == 0) { - return {}; + return std::nullopt; + } + if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { // In Accurate GPU all tests should pass, else we recycle - } else if (Settings::values.use_accurate_gpu_emulation && passed_tests != overlaps.size()) { - return {}; + return std::nullopt; } + + const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified); for (const auto& surface : overlaps) { Unregister(surface); } + new_surface->MarkAsModified(modified, Tick()); Register(new_surface); return {{new_surface, new_surface->GetMainView()}}; @@ -614,64 +715,26 @@ private: * textures within the GPU if possible. Falls back to LLE when it isn't possible to use any of * the HLE methods. * - * @param overlaps The overlapping surfaces registered in the cache. 
- * @param params The parameters on the new surface. - * @param gpu_addr The starting address of the new surface. - * @param cache_addr The starting address of the new surface on physical memory. + * @param overlaps The overlapping surfaces registered in the cache. + * @param params The parameters on the new surface. + * @param gpu_addr The starting address of the new surface. + * @param cpu_addr The starting address of the new surface on physical memory. + * @param preserve_contents Indicates that the new surface should be loaded from memory or + * left blank. */ - std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps, + std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, - const VAddr cpu_addr) { - if (params.target == SurfaceTarget::Texture3D) { - bool failed = false; - if (params.num_levels > 1) { - // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach - return std::nullopt; - } - TSurface new_surface = GetUncachedSurface(gpu_addr, params); - bool modified = false; - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.target != SurfaceTarget::Texture2D) { - failed = true; - break; - } - if (src_params.height != params.height) { - failed = true; - break; - } - if (src_params.block_depth != params.block_depth || - src_params.block_height != params.block_height) { - failed = true; - break; - } - const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); - const auto [x, y, z] = params.GetBlockOffsetXYZ(offset); - modified |= surface->IsModified(); - const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height, - 1); - ImageCopy(surface, new_surface, copy_params); - } - if (failed) { - return std::nullopt; - } - for (const auto& surface : overlaps) { - Unregister(surface); - } - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - auto view = new_surface->GetMainView(); - return {{std::move(new_surface), view}}; - } else { + GPUVAddr gpu_addr, VAddr cpu_addr, + bool preserve_contents) { + if (params.target != SurfaceTarget::Texture3D) { for (const auto& surface : overlaps) { if (!surface->MatchTarget(params.target)) { if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { - if (Settings::values.use_accurate_gpu_emulation) { + if (Settings::IsGPULevelExtreme()) { return std::nullopt; } Unregister(surface); - return InitializeSurface(gpu_addr, params); + return InitializeSurface(gpu_addr, params, preserve_contents); } return std::nullopt; } @@ -679,11 +742,60 @@ private: continue; } if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { - return {{surface, surface->GetMainView()}}; + return std::make_pair(surface, surface->GetMainView()); + } + } + return InitializeSurface(gpu_addr, params, preserve_contents); + } + + if (params.num_levels > 1) { + // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach + return std::nullopt; + } + + if (overlaps.size() == 1) { + const auto& surface = overlaps[0]; + const SurfaceParams& overlap_params = surface->GetSurfaceParams(); + // Don't attempt to render to textures with more than one level for now + // The texture has to be to the right or the sample address if we want to render to it + if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) { + const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr()); + 
const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + if (slice < overlap_params.depth) { + auto view = surface->Emplace3DView(slice, params.depth, 0, 1); + return std::make_pair(std::move(surface), std::move(view)); } } - return InitializeSurface(gpu_addr, params); } + + TSurface new_surface = GetUncachedSurface(gpu_addr, params); + bool modified = false; + + for (auto& surface : overlaps) { + const SurfaceParams& src_params = surface->GetSurfaceParams(); + if (src_params.target != SurfaceTarget::Texture2D || + src_params.height != params.height || + src_params.block_depth != params.block_depth || + src_params.block_height != params.block_height) { + return std::nullopt; + } + modified |= surface->IsModified(); + + const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); + const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); + const u32 width = params.width; + const u32 height = params.height; + const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); + TryCopyImage(surface, new_surface, copy_params); + } + for (const auto& surface : overlaps) { + Unregister(surface); + } + new_surface->MarkAsModified(modified, Tick()); + Register(new_surface); + + TView view = new_surface->GetMainView(); + return std::make_pair(std::move(new_surface), std::move(view)); } /** @@ -705,10 +817,13 @@ private: * * @param gpu_addr The starting address of the candidate surface. * @param params The parameters on the candidate surface. + * @param preserve_contents Indicates that the new surface should be loaded from memory or + * left blank. * @param is_render Whether or not the surface is a render target. **/ std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const VAddr cpu_addr, - const SurfaceParams& params, bool is_render) { + const SurfaceParams& params, bool preserve_contents, + bool is_render) { // Step 1 // Check Level 1 Cache for a fast structural match. If candidate surface // matches at certain level we are pretty much done. @@ -716,8 +831,9 @@ private: TSurface& current_surface = iter->second; const auto topological_result = current_surface->MatchesTopology(params); if (topological_result != MatchTopologyResult::FullMatch) { - std::vector<TSurface> overlaps{current_surface}; - return RecycleSurface(overlaps, params, gpu_addr, topological_result); + VectorSurface overlaps{current_surface}; + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, + topological_result); } const auto struct_result = current_surface->MatchesStructure(params); @@ -742,7 +858,7 @@ private: // If none are found, we are done. we just load the surface and create it. 
if (overlaps.empty()) { - return InitializeSurface(gpu_addr, params); + return InitializeSurface(gpu_addr, params, preserve_contents); } // Step 3 @@ -752,13 +868,15 @@ private: for (const auto& surface : overlaps) { const auto topological_result = surface->MatchesTopology(params); if (topological_result != MatchTopologyResult::FullMatch) { - return RecycleSurface(overlaps, params, gpu_addr, topological_result); + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, + topological_result); } } - // Check if it's a 3D texture + // Manage 3D textures if (params.block_depth > 0) { - auto surface = Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr); + auto surface = + Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); if (surface) { return *surface; } @@ -771,14 +889,12 @@ private: // two things either the candidate surface is a supertexture of the overlap // or they don't match in any known way. if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { - if (current_surface->GetGpuAddr() == gpu_addr) { - std::optional<std::pair<TSurface, TView>> view = - TryReconstructSurface(overlaps, params, gpu_addr); - if (view) { - return *view; - } + const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr); + if (view) { + return *view; } - return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch); + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, + MatchTopologyResult::FullMatch); } // Now we check if the candidate is a mipmap/layer of the overlap std::optional<TView> view = @@ -802,7 +918,7 @@ private: pair.first->EmplaceView(params, gpu_addr, candidate_size); if (mirage_view) return {pair.first, *mirage_view}; - return RecycleSurface(overlaps, params, gpu_addr, + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, MatchTopologyResult::FullMatch); } return {current_surface, *view}; @@ -818,7 +934,8 @@ private: } } // We failed all the tests, recycle the overlaps into a new texture. - return RecycleSurface(overlaps, params, gpu_addr, MatchTopologyResult::FullMatch); + return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, + MatchTopologyResult::FullMatch); } /** @@ -831,8 +948,7 @@ private: * @param params The parameters on the candidate surface. 
**/ Deduction DeduceSurface(const GPUVAddr gpu_addr, const SurfaceParams& params) { - const std::optional<VAddr> cpu_addr = - system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr); + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { Deduction result{}; @@ -892,7 +1008,9 @@ private: params.target = target; params.is_tiled = false; params.srgb_conversion = false; - params.is_layered = false; + params.is_layered = + target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray || + target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray; params.block_width = 0; params.block_height = 0; params.block_depth = 0; @@ -906,7 +1024,7 @@ private: params.pitch = 4; params.num_levels = 1; params.emulated_levels = 1; - params.pixel_format = VideoCore::Surface::PixelFormat::R8U; + params.pixel_format = VideoCore::Surface::PixelFormat::R8_UNORM; params.type = VideoCore::Surface::SurfaceType::ColorTexture; auto surface = CreateSurface(0ULL, params); invalid_memory.resize(surface->GetHostSizeInBytes(), 0U); @@ -929,7 +1047,7 @@ private: void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params, const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) { auto deduced_src = DeduceSurface(src_gpu_addr, src_params); - auto deduced_dst = DeduceSurface(src_gpu_addr, src_params); + auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params); if (deduced_src.Failed() || deduced_dst.Failed()) { return; } @@ -976,10 +1094,10 @@ private: } std::pair<TSurface, TView> InitializeSurface(GPUVAddr gpu_addr, const SurfaceParams& params, - bool do_load = true) { + bool preserve_contents) { auto new_surface{GetUncachedSurface(gpu_addr, params)}; Register(new_surface); - if (do_load) { + if (preserve_contents) { LoadSurface(new_surface); } return {new_surface, new_surface->GetMainView()}; @@ -987,7 +1105,7 @@ private: void LoadSurface(const TSurface& surface) { staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes()); - surface->LoadBuffer(system.GPU().MemoryManager(), staging_cache); + surface->LoadBuffer(gpu_memory, staging_cache); surface->UploadTexture(staging_cache.GetBuffer(0)); surface->MarkAsModified(false, Tick()); } @@ -998,7 +1116,7 @@ private: } staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes()); surface->DownloadTexture(staging_cache.GetBuffer(0)); - surface->FlushBuffer(system.GPU().MemoryManager(), staging_cache); + surface->FlushBuffer(gpu_memory, staging_cache); surface->MarkAsModified(false, Tick()); } @@ -1025,23 +1143,25 @@ private: } } - std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { + VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { if (size == 0) { return {}; } const VAddr cpu_addr_end = cpu_addr + size; - VAddr start = cpu_addr >> registry_page_bits; const VAddr end = (cpu_addr_end - 1) >> registry_page_bits; - std::vector<TSurface> surfaces; - while (start <= end) { - std::vector<TSurface>& list = registry[start]; - for (auto& surface : list) { - if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) { - surface->MarkAsPicked(true); - surfaces.push_back(surface); + VectorSurface surfaces; + for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) { + const auto it = registry.find(start); + if (it == registry.end()) { + continue; + } + for (auto& surface : it->second) { + if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) { + continue; } + 
surface->MarkAsPicked(true); + surfaces.push_back(surface); } - start++; } for (auto& surface : surfaces) { surface->MarkAsPicked(false); @@ -1066,6 +1186,19 @@ private: return {}; } + /// Try to do an image copy logging when formats are incompatible. + void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) { + const SurfaceParams& src_params = src->GetSurfaceParams(); + const SurfaceParams& dst_params = dst->GetSurfaceParams(); + if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) { + LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}", + static_cast<int>(dst_params.pixel_format), + static_cast<int>(src_params.pixel_format)); + return; + } + ImageCopy(src, dst, copy); + } + constexpr PixelFormat GetSiblingFormat(PixelFormat format) const { return siblings_table[static_cast<std::size_t>(format)]; } @@ -1073,7 +1206,7 @@ private: /// Returns true the shader sampler entry is compatible with the TIC texture type. static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type, const VideoCommon::Shader::Sampler& entry) { - const auto shader_type = entry.GetType(); + const auto shader_type = entry.type; switch (tic_type) { case Tegra::Texture::TextureType::Texture1D: case Tegra::Texture::TextureType::Texture1DArray: @@ -1094,7 +1227,7 @@ private: if (shader_type == Tegra::Shader::TextureType::TextureCube) { return true; } - return shader_type == Tegra::Shader::TextureType::Texture2D && entry.IsArray(); + return shader_type == Tegra::Shader::TextureType::Texture2D && entry.is_array; } UNREACHABLE(); return true; @@ -1105,9 +1238,19 @@ private: TView view; }; + void AsyncFlushSurface(TSurface& surface) { + if (!uncommitted_flushes) { + uncommitted_flushes = std::make_shared<std::list<TSurface>>(); + } + uncommitted_flushes->push_back(surface); + } + VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::MemoryManager& gpu_memory; FormatLookupTable format_lookup_table; + FormatCompatibility format_compatibility; u64 ticks{}; @@ -1149,6 +1292,11 @@ private: std::unordered_map<u32, TSurface> invalid_cache; std::vector<u8> invalid_memory; + std::list<TSurface> marked_for_unregister; + + std::shared_ptr<std::list<TSurface>> uncommitted_flushes{}; + std::list<std::shared_ptr<std::list<TSurface>>> committed_flushes; + StagingCache staging_cache; std::recursive_mutex mutex; }; diff --git a/src/video_core/textures/convert.cpp b/src/video_core/textures/convert.cpp index f3efa7eb0..962921483 100644 --- a/src/video_core/textures/convert.cpp +++ b/src/video_core/textures/convert.cpp @@ -35,7 +35,7 @@ void SwapS8Z24ToZ24S8(u8* data, u32 width, u32 height) { S8Z24 s8z24_pixel{}; Z24S8 z24s8_pixel{}; constexpr auto bpp{ - VideoCore::Surface::GetBytesPerPixel(VideoCore::Surface::PixelFormat::S8Z24)}; + VideoCore::Surface::GetBytesPerPixel(VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM)}; for (std::size_t y = 0; y < height; ++y) { for (std::size_t x = 0; x < width; ++x) { const std::size_t offset{bpp * (y * width + x)}; @@ -73,7 +73,7 @@ void ConvertFromGuestToHost(u8* in_data, u8* out_data, PixelFormat pixel_format, in_data, width, height, depth, block_width, block_height); std::copy(rgba8_data.begin(), rgba8_data.end(), out_data); - } else if (convert_s8z24 && pixel_format == PixelFormat::S8Z24) { + } else if (convert_s8z24 && pixel_format == PixelFormat::S8_UINT_D24_UNORM) { Tegra::Texture::ConvertS8Z24ToZ24S8(in_data, width, height); } } @@ -85,7 +85,7 @@ void ConvertFromHostToGuest(u8* data, PixelFormat 
pixel_format, u32 width, u32 h static_cast<u32>(pixel_format)); UNREACHABLE(); - } else if (convert_s8z24 && pixel_format == PixelFormat::S8Z24) { + } else if (convert_s8z24 && pixel_format == PixelFormat::S8_UINT_D24_UNORM) { Tegra::Texture::ConvertZ24S8ToS8Z24(data, width, height); } } diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 7df5f1452..16d46a018 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -6,11 +6,13 @@ #include <cstring> #include "common/alignment.h" #include "common/assert.h" +#include "common/bit_util.h" #include "video_core/gpu.h" #include "video_core/textures/decoders.h" #include "video_core/textures/texture.h" namespace Tegra::Texture { +namespace { /** * This table represents the internal swizzle of a gob, @@ -36,20 +38,10 @@ struct alignas(64) SwizzleTable { std::array<std::array<u16, M>, N> values{}; }; -constexpr u32 gob_size_x_shift = 6; -constexpr u32 gob_size_y_shift = 3; -constexpr u32 gob_size_z_shift = 0; -constexpr u32 gob_size_shift = gob_size_x_shift + gob_size_y_shift + gob_size_z_shift; +constexpr u32 FAST_SWIZZLE_ALIGN = 16; -constexpr u32 gob_size_x = 1U << gob_size_x_shift; -constexpr u32 gob_size_y = 1U << gob_size_y_shift; -constexpr u32 gob_size_z = 1U << gob_size_z_shift; -constexpr u32 gob_size = 1U << gob_size_shift; - -constexpr u32 fast_swizzle_align = 16; - -constexpr auto legacy_swizzle_table = SwizzleTable<gob_size_y, gob_size_x, gob_size_z>(); -constexpr auto fast_swizzle_table = SwizzleTable<gob_size_y, 4, fast_swizzle_align>(); +constexpr auto LEGACY_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_X, GOB_SIZE_X, GOB_SIZE_Z>(); +constexpr auto FAST_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_Y, 4, FAST_SWIZZLE_ALIGN>(); /** * This function manages ALL the GOBs(Group of Bytes) Inside a single block. @@ -68,17 +60,17 @@ void PreciseProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, con u32 y_address = z_address; u32 pixel_base = layer_z * z + y_start * stride_x; for (u32 y = y_start; y < y_end; y++) { - const auto& table = legacy_swizzle_table[y % gob_size_y]; + const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y]; for (u32 x = x_start; x < x_end; x++) { - const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % gob_size_x]}; + const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % GOB_SIZE_X]}; const u32 pixel_index{x * out_bytes_per_pixel + pixel_base}; data_ptrs[unswizzle] = swizzled_data + swizzle_offset; data_ptrs[!unswizzle] = unswizzled_data + pixel_index; std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); } pixel_base += stride_x; - if ((y + 1) % gob_size_y == 0) - y_address += gob_size; + if ((y + 1) % GOB_SIZE_Y == 0) + y_address += GOB_SIZE; } z_address += xy_block_size; } @@ -103,18 +95,18 @@ void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const u32 y_address = z_address; u32 pixel_base = layer_z * z + y_start * stride_x; for (u32 y = y_start; y < y_end; y++) { - const auto& table = fast_swizzle_table[y % gob_size_y]; - for (u32 xb = x_startb; xb < x_endb; xb += fast_swizzle_align) { - const u32 swizzle_offset{y_address + table[(xb / fast_swizzle_align) % 4]}; + const auto& table = FAST_SWIZZLE_TABLE[y % GOB_SIZE_Y]; + for (u32 xb = x_startb; xb < x_endb; xb += FAST_SWIZZLE_ALIGN) { + const u32 swizzle_offset{y_address + table[(xb / FAST_SWIZZLE_ALIGN) % 4]}; const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel; const u32 pixel_index{out_x + pixel_base}; data_ptrs[unswizzle ? 
1 : 0] = swizzled_data + swizzle_offset; data_ptrs[unswizzle ? 0 : 1] = unswizzled_data + pixel_index; - std::memcpy(data_ptrs[0], data_ptrs[1], fast_swizzle_align); + std::memcpy(data_ptrs[0], data_ptrs[1], FAST_SWIZZLE_ALIGN); } pixel_base += stride_x; - if ((y + 1) % gob_size_y == 0) - y_address += gob_size; + if ((y + 1) % GOB_SIZE_Y == 0) + y_address += GOB_SIZE; } z_address += xy_block_size; } @@ -137,9 +129,9 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; const u32 stride_x = width * out_bytes_per_pixel; const u32 layer_z = height * stride_x; - const u32 gob_elements_x = gob_size_x / bytes_per_pixel; - constexpr u32 gob_elements_y = gob_size_y; - constexpr u32 gob_elements_z = gob_size_z; + const u32 gob_elements_x = GOB_SIZE_X / bytes_per_pixel; + constexpr u32 gob_elements_y = GOB_SIZE_Y; + constexpr u32 gob_elements_z = GOB_SIZE_Z; const u32 block_x_elements = gob_elements_x; const u32 block_y_elements = gob_elements_y * block_height; const u32 block_z_elements = gob_elements_z * block_depth; @@ -147,7 +139,7 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool const u32 blocks_on_x = div_ceil(aligned_width, block_x_elements); const u32 blocks_on_y = div_ceil(height, block_y_elements); const u32 blocks_on_z = div_ceil(depth, block_z_elements); - const u32 xy_block_size = gob_size * block_height; + const u32 xy_block_size = GOB_SIZE * block_height; const u32 block_size = xy_block_size * block_depth; u32 tile_offset = 0; for (u32 zb = 0; zb < blocks_on_z; zb++) { @@ -174,12 +166,14 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool } } +} // Anonymous namespace + void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel, u8* const swizzled_data, u8* const unswizzled_data, bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing) { const u32 block_height_size{1U << block_height}; const u32 block_depth_size{1U << block_depth}; - if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % fast_swizzle_align == 0) { + if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % FAST_SWIZZLE_ALIGN == 0) { SwizzledData<true>(swizzled_data, unswizzled_data, unswizzle, width, height, depth, bytes_per_pixel, out_bytes_per_pixel, block_height_size, block_depth_size, width_spacing); @@ -190,53 +184,6 @@ void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, } } -u32 BytesPerPixel(TextureFormat format) { - switch (format) { - case TextureFormat::DXT1: - case TextureFormat::DXN1: - // In this case a 'pixel' actually refers to a 4x4 tile. - return 8; - case TextureFormat::DXT23: - case TextureFormat::DXT45: - case TextureFormat::DXN2: - case TextureFormat::BC7U: - case TextureFormat::BC6H_UF16: - case TextureFormat::BC6H_SF16: - // In this case a 'pixel' actually refers to a 4x4 tile. 
- return 16; - case TextureFormat::R32_G32_B32: - return 12; - case TextureFormat::ASTC_2D_4X4: - case TextureFormat::ASTC_2D_5X4: - case TextureFormat::ASTC_2D_8X8: - case TextureFormat::ASTC_2D_8X5: - case TextureFormat::ASTC_2D_10X8: - case TextureFormat::ASTC_2D_5X5: - case TextureFormat::A8R8G8B8: - case TextureFormat::A2B10G10R10: - case TextureFormat::BF10GF11RF11: - case TextureFormat::R32: - case TextureFormat::R16_G16: - return 4; - case TextureFormat::A1B5G5R5: - case TextureFormat::B5G6R5: - case TextureFormat::G8R8: - case TextureFormat::R16: - return 2; - case TextureFormat::R8: - return 1; - case TextureFormat::R16_G16_B16_A16: - return 8; - case TextureFormat::R32_G32_B32_A32: - return 16; - case TextureFormat::R32_G32: - return 8; - default: - UNIMPLEMENTED_MSG("Format not implemented"); - return 1; - } -} - void UnswizzleTexture(u8* const unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, u32 width_spacing) { @@ -256,47 +203,82 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, } void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, + u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit, u32 offset_x, u32 offset_y) { const u32 block_height = 1U << block_height_bit; - const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) / - gob_size_x}; + const u32 image_width_in_gobs = + (swizzled_width * bytes_per_pixel + (GOB_SIZE_X - 1)) / GOB_SIZE_X; for (u32 line = 0; line < subrect_height; ++line) { const u32 dst_y = line + offset_y; const u32 gob_address_y = - (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + - ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size; - const auto& table = legacy_swizzle_table[dst_y % gob_size_y]; + (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + + ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; + const auto& table = LEGACY_SWIZZLE_TABLE[dst_y % GOB_SIZE_Y]; for (u32 x = 0; x < subrect_width; ++x) { const u32 dst_x = x + offset_x; const u32 gob_address = - gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height; - const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x]; - u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel; - u8* dest_addr = swizzled_data + swizzled_offset; + gob_address_y + (dst_x * bytes_per_pixel / GOB_SIZE_X) * GOB_SIZE * block_height; + const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % GOB_SIZE_X]; + const u32 unswizzled_offset = line * source_pitch + x * bytes_per_pixel; + const u8* const source_line = unswizzled_data + unswizzled_offset; + u8* const dest_addr = swizzled_data + swizzled_offset; std::memcpy(dest_addr, source_line, bytes_per_pixel); } } } -void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, - u32 block_height_bit, u32 offset_x, u32 offset_y) { - const u32 block_height = 1U << block_height_bit; - for (u32 line = 0; line < subrect_height; ++line) { - const u32 y2 = line + offset_y; - const u32 gob_address_y = (y2 / (gob_size_y * block_height)) * gob_size * block_height + - ((y2 % (gob_size_y * 
block_height)) / gob_size_y) * gob_size; - const auto& table = legacy_swizzle_table[y2 % gob_size_y]; - for (u32 x = 0; x < subrect_width; ++x) { - const u32 x2 = (x + offset_x) * bytes_per_pixel; - const u32 gob_address = gob_address_y + (x2 / gob_size_x) * gob_size * block_height; - const u32 swizzled_offset = gob_address + table[x2 % gob_size_x]; - u8* dest_line = unswizzled_data + line * dest_pitch + x * bytes_per_pixel; - u8* source_addr = swizzled_data + swizzled_offset; +void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, + u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) { + const u32 stride = width * bytes_per_pixel; + const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; + const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height); + + const u32 block_height_mask = (1U << block_height) - 1; + const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height; + + for (u32 line = 0; line < line_count; ++line) { + const u32 src_y = line + origin_y; + const auto& table = LEGACY_SWIZZLE_TABLE[src_y % GOB_SIZE_Y]; + + const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT; + const u32 src_offset_y = (block_y >> block_height) * block_size + + ((block_y & block_height_mask) << GOB_SIZE_SHIFT); + for (u32 column = 0; column < line_length_in; ++column) { + const u32 src_x = (column + origin_x) * bytes_per_pixel; + const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift; + + const u32 swizzled_offset = src_offset_y + src_offset_x + table[src_x % GOB_SIZE_X]; + const u32 unswizzled_offset = line * pitch + column * bytes_per_pixel; + + std::memcpy(output + unswizzled_offset, input + swizzled_offset, bytes_per_pixel); + } + } +} + +void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, + u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, + u32 origin_y, u8* output, const u8* input) { + UNIMPLEMENTED_IF(origin_x > 0); + UNIMPLEMENTED_IF(origin_y > 0); - std::memcpy(dest_line, source_addr, bytes_per_pixel); + const u32 stride = width * bytes_per_pixel; + const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; + const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); + + const u32 block_height_mask = (1U << block_height) - 1; + const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth; + + for (u32 line = 0; line < line_count; ++line) { + const auto& table = LEGACY_SWIZZLE_TABLE[line % GOB_SIZE_Y]; + const u32 block_y = line / GOB_SIZE_Y; + const u32 dst_offset_y = + (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE; + for (u32 x = 0; x < line_length_in; ++x) { + const u32 dst_offset = + ((x / GOB_SIZE_X) << x_shift) + dst_offset_y + table[x % GOB_SIZE_X]; + const u32 src_offset = x * bytes_per_pixel + line * pitch; + std::memcpy(output + dst_offset, input + src_offset, bytes_per_pixel); } } } @@ -305,17 +287,17 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 const u32 block_height_bit, const std::size_t copy_size, const u8* source_data, u8* swizzle_data) { const u32 block_height = 1U << block_height_bit; - const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; + const u32 image_width_in_gobs{(width + GOB_SIZE_X - 1) / GOB_SIZE_X}; std::size_t count = 0; for (std::size_t y = dst_y; y < height && count < copy_size; ++y) { const std::size_t gob_address_y = - (y / (gob_size_y * block_height)) * gob_size * 
block_height * image_width_in_gobs + - ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size; - const auto& table = legacy_swizzle_table[y % gob_size_y]; + (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + + ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; + const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y]; for (std::size_t x = dst_x; x < width && count < copy_size; ++x) { const std::size_t gob_address = - gob_address_y + (x / gob_size_x) * gob_size * block_height; - const std::size_t swizzled_offset = gob_address + table[x % gob_size_x]; + gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height; + const std::size_t swizzled_offset = gob_address + table[x % GOB_SIZE_X]; const u8* source_line = source_data + count; u8* dest_addr = swizzle_data + swizzled_offset; count++; @@ -325,58 +307,30 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 } } -std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, - u32 height) { - std::vector<u8> rgba_data; - - // TODO(Subv): Implement. - switch (format) { - case TextureFormat::DXT1: - case TextureFormat::DXT23: - case TextureFormat::DXT45: - case TextureFormat::DXN1: - case TextureFormat::DXN2: - case TextureFormat::BC7U: - case TextureFormat::BC6H_UF16: - case TextureFormat::BC6H_SF16: - case TextureFormat::ASTC_2D_4X4: - case TextureFormat::ASTC_2D_8X8: - case TextureFormat::ASTC_2D_5X5: - case TextureFormat::ASTC_2D_10X8: - case TextureFormat::A8R8G8B8: - case TextureFormat::A2B10G10R10: - case TextureFormat::A1B5G5R5: - case TextureFormat::B5G6R5: - case TextureFormat::R8: - case TextureFormat::G8R8: - case TextureFormat::BF10GF11RF11: - case TextureFormat::R32_G32_B32_A32: - case TextureFormat::R32_G32: - case TextureFormat::R32: - case TextureFormat::R16: - case TextureFormat::R16_G16: - case TextureFormat::R32_G32_B32: - // TODO(Subv): For the time being just forward the same data without any decoding. 
- rgba_data = texture_data; - break; - default: - UNIMPLEMENTED_MSG("Format not implemented"); - break; - } - - return rgba_data; -} - std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth) { if (tiled) { - const u32 aligned_width = Common::AlignBits(width * bytes_per_pixel, gob_size_x_shift); - const u32 aligned_height = Common::AlignBits(height, gob_size_y_shift + block_height); - const u32 aligned_depth = Common::AlignBits(depth, gob_size_z_shift + block_depth); + const u32 aligned_width = Common::AlignBits(width * bytes_per_pixel, GOB_SIZE_X_SHIFT); + const u32 aligned_height = Common::AlignBits(height, GOB_SIZE_Y_SHIFT + block_height); + const u32 aligned_depth = Common::AlignBits(depth, GOB_SIZE_Z_SHIFT + block_depth); return aligned_width * aligned_height * aligned_depth; } else { return width * height * depth * bytes_per_pixel; } } +u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, + u32 bytes_per_pixel) { + auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; + const u32 gobs_in_block = 1 << block_height; + const u32 y_blocks = GOB_SIZE_Y << block_height; + const u32 x_per_gob = GOB_SIZE_X / bytes_per_pixel; + const u32 x_blocks = div_ceil(width, x_per_gob); + const u32 block_size = GOB_SIZE * gobs_in_block; + const u32 stride = block_size * x_blocks; + const u32 base = (dst_y / y_blocks) * stride + (dst_x / x_per_gob) * block_size; + const u32 relative_y = dst_y % y_blocks; + return base + (relative_y / GOB_SIZE_Y) * GOB_SIZE; +} + } // namespace Tegra::Texture diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index e5eac3f3b..01e156bc8 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -10,15 +10,15 @@ namespace Tegra::Texture { -// GOBSize constant. Calculated by 64 bytes in x multiplied by 8 y coords, represents -// an small rect of (64/bytes_per_pixel)X8. -inline std::size_t GetGOBSize() { - return 512; -} +constexpr u32 GOB_SIZE_X = 64; +constexpr u32 GOB_SIZE_Y = 8; +constexpr u32 GOB_SIZE_Z = 1; +constexpr u32 GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; -inline std::size_t GetGOBSizeShift() { - return 9; -} +constexpr std::size_t GOB_SIZE_X_SHIFT = 6; +constexpr std::size_t GOB_SIZE_Y_SHIFT = 3; +constexpr std::size_t GOB_SIZE_Z_SHIFT = 0; +constexpr std::size_t GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; /// Unswizzles a swizzled texture without changing its format. void UnswizzleTexture(u8* unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, @@ -38,26 +38,42 @@ void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, u32 out_bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing); -/// Decodes an unswizzled texture into a A8R8G8B8 texture. -std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, - u32 height); - /// This function calculates the correct size of a texture depending if it's tiled or not. std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth); /// Copies an untiled subrectangle into a tiled surface. 
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, - u32 offset_x, u32 offset_y); + u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, + u32 block_height_bit, u32 offset_x, u32 offset_y); /// Copies a tiled subrectangle into a linear surface. -void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, - u32 offset_x, u32 offset_y); +void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, + u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input); + +/// @brief Swizzles a 2D array of pixels into a 3D texture +/// @param line_length_in Number of pixels per line +/// @param line_count Number of lines +/// @param pitch Number of bytes per line +/// @param width Width of the swizzled texture +/// @param height Height of the swizzled texture +/// @param bytes_per_pixel Number of bytes used per pixel +/// @param block_height Block height shift +/// @param block_depth Block depth shift +/// @param origin_x Column offset in pixels of the swizzled texture +/// @param origin_y Row offset in pixels of the swizzled texture +/// @param output Pointer to the pixels of the swizzled texture +/// @param input Pointer to the 2D array of pixels used as input +/// @pre input and output points to an array large enough to hold the number of bytes used +void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, + u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, + u32 origin_y, u8* output, const u8* input); + +void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, + std::size_t copy_size, const u8* source_data, u8* swizzle_data); -void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, - const u32 block_height, const std::size_t copy_size, const u8* source_data, - u8* swizzle_data); +/// Obtains the offset of the gob for positions 'dst_x' & 'dst_y' +u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, + u32 bytes_per_pixel); } // namespace Tegra::Texture diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp index d1939d744..4171e3ef2 100644 --- a/src/video_core/textures/texture.cpp +++ b/src/video_core/textures/texture.cpp @@ -48,7 +48,7 @@ constexpr std::array<float, 256> SRGB_CONVERSION_LUT = { }; unsigned SettingsMinimumAnisotropy() noexcept { - switch (static_cast<Anisotropy>(Settings::values.max_anisotropy)) { + switch (static_cast<Anisotropy>(Settings::values.max_anisotropy.GetValue())) { default: case Anisotropy::Default: return 1U; diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index eba05aced..0574fef12 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -12,10 +12,10 @@ namespace Tegra::Texture { enum class TextureFormat : u32 { - R32_G32_B32_A32 = 0x01, - R32_G32_B32 = 0x02, - R16_G16_B16_A16 = 0x03, - R32_G32 = 0x04, + R32G32B32A32 = 0x01, + R32G32B32 = 0x02, + R16G16B16A16 = 0x03, + R32G32 = 0x04, R32_B24G8 = 0x05, ETC2_RGB = 0x06, X8B8G8R8 = 0x07, @@ -23,19 +23,19 @@ enum class TextureFormat : u32 { A2B10G10R10 = 0x09, ETC2_RGB_PTA = 0x0a, ETC2_RGBA = 0x0b, - R16_G16 = 0x0c, - G8R24 = 0x0d, - G24R8 = 0x0e, + R16G16 = 0x0c, + R24G8 = 
0x0d, + R8G24 = 0x0e, R32 = 0x0f, - BC6H_SF16 = 0x10, - BC6H_UF16 = 0x11, + BC6H_SFLOAT = 0x10, + BC6H_UFLOAT = 0x11, A4B4G4R4 = 0x12, A5B5G5R1 = 0x13, A1B5G5R5 = 0x14, B5G6R5 = 0x15, B6G5R5 = 0x16, - BC7U = 0x17, - G8R8 = 0x18, + BC7 = 0x17, + R8G8 = 0x18, EAC = 0x19, EACX2 = 0x1a, R16 = 0x1b, @@ -43,23 +43,23 @@ enum class TextureFormat : u32 { R8 = 0x1d, G4R4 = 0x1e, R1 = 0x1f, - E5B9G9R9_SHAREDEXP = 0x20, - BF10GF11RF11 = 0x21, + E5B9G9R9 = 0x20, + B10G11R11 = 0x21, G8B8G8R8 = 0x22, B8G8R8G8 = 0x23, - DXT1 = 0x24, - DXT23 = 0x25, - DXT45 = 0x26, - DXN1 = 0x27, - DXN2 = 0x28, - S8Z24 = 0x29, + BC1_RGBA = 0x24, + BC2 = 0x25, + BC3 = 0x26, + BC4 = 0x27, + BC5 = 0x28, + S8D24 = 0x29, X8Z24 = 0x2a, - Z24S8 = 0x2b, + D24S8 = 0x2b, X4V4Z24__COV4R4V = 0x2c, X4V4Z24__COV8R8V = 0x2d, V8Z24__COV4R12V = 0x2e, - ZF32 = 0x2f, - ZF32_X24S8 = 0x30, + D32 = 0x2f, + D32S8 = 0x30, X8Z24_X20V4S8__COV4R4V = 0x31, X8Z24_X20V4S8__COV8R8V = 0x32, ZF32_X20V4X8__COV4R4V = 0x33, @@ -69,7 +69,7 @@ enum class TextureFormat : u32 { X8Z24_X16V8S8__COV4R12V = 0x37, ZF32_X16V8X8__COV4R12V = 0x38, ZF32_X16V8S8__COV4R12V = 0x39, - Z16 = 0x3a, + D16 = 0x3a, V8Z24__COV8R24V = 0x3b, X8Z24_X16V8S8__COV8R24V = 0x3c, ZF32_X16V8X8__COV8R24V = 0x3d, @@ -375,7 +375,4 @@ struct FullTextureInfo { TSCEntry tsc; }; -/// Returns the number of bytes per pixel of the input texture format. -u32 BytesPerPixel(TextureFormat format); - } // namespace Tegra::Texture diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index f60bdc60a..dd5cee4a1 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <memory> + #include "common/logging/log.h" #include "core/core.h" #include "core/settings.h" @@ -16,43 +17,56 @@ #include "video_core/video_core.h" namespace { -std::unique_ptr<VideoCore::RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window, - Core::System& system, - Core::Frontend::GraphicsContext& context) { - switch (Settings::values.renderer_backend) { + +std::unique_ptr<VideoCore::RendererBase> CreateRenderer( + Core::System& system, Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, + std::unique_ptr<Core::Frontend::GraphicsContext> context) { + auto& telemetry_session = system.TelemetrySession(); + auto& cpu_memory = system.Memory(); + + switch (Settings::values.renderer_backend.GetValue()) { case Settings::RendererBackend::OpenGL: - return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system, context); + return std::make_unique<OpenGL::RendererOpenGL>(telemetry_session, emu_window, cpu_memory, + gpu, std::move(context)); #ifdef HAS_VULKAN case Settings::RendererBackend::Vulkan: - return std::make_unique<Vulkan::RendererVulkan>(emu_window, system); + return std::make_unique<Vulkan::RendererVulkan>(telemetry_session, emu_window, cpu_memory, + gpu, std::move(context)); #endif default: return nullptr; } } + } // Anonymous namespace namespace VideoCore { std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { + std::unique_ptr<Tegra::GPU> gpu; + const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue(); + if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { + gpu = std::make_unique<VideoCommon::GPUAsynch>(system, use_nvdec); + } else { + gpu = std::make_unique<VideoCommon::GPUSynch>(system, use_nvdec); + } + auto context = emu_window.CreateSharedContext(); const auto scope = context->Acquire(); - auto renderer = 
CreateRenderer(emu_window, system, *context); + + auto renderer = CreateRenderer(system, emu_window, *gpu, std::move(context)); if (!renderer->Init()) { return nullptr; } - if (Settings::values.use_asynchronous_gpu_emulation) { - return std::make_unique<VideoCommon::GPUAsynch>(system, std::move(renderer), - std::move(context)); - } - return std::make_unique<VideoCommon::GPUSynch>(system, std::move(renderer), std::move(context)); + gpu->BindRenderer(std::move(renderer)); + return gpu; } u16 GetResolutionScaleFactor(const RendererBase& renderer) { return static_cast<u16>( - Settings::values.resolution_factor != 0 - ? Settings::values.resolution_factor + Settings::values.resolution_factor.GetValue() != 0 + ? Settings::values.resolution_factor.GetValue() : renderer.GetRenderWindow().GetFramebufferLayout().GetScalingRatio()); } |
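Note on the swizzling changes above: decoders.h replaces the old GetGOBSize()/GetGOBSizeShift() helpers with GOB_SIZE_* constants, and decoders.cpp gains GetGOBOffset(), which locates the 512-byte GOB holding a given texel. The standalone C++ sketch below mirrors that arithmetic; the sample values in main() are illustrative only, and the patch's unused height parameter is omitted here.

#include <cstdint>
#include <cstdio>

using u32 = std::uint32_t;
using u64 = std::uint64_t;

constexpr u32 GOB_SIZE_X = 64;                     // bytes per GOB row
constexpr u32 GOB_SIZE_Y = 8;                      // rows per GOB
constexpr u32 GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y;  // 512 bytes per GOB

// Offset of the GOB holding texel (dst_x, dst_y); block_height is a shift,
// matching the formula added in decoders.cpp.
u64 GetGOBOffset(u32 width, u32 dst_x, u32 dst_y, u32 block_height, u32 bytes_per_pixel) {
    const auto div_ceil = [](u32 x, u32 y) { return (x + y - 1) / y; };
    const u32 gobs_in_block = 1U << block_height;       // GOBs stacked per block
    const u32 y_blocks = GOB_SIZE_Y << block_height;    // texel rows covered by one block
    const u32 x_per_gob = GOB_SIZE_X / bytes_per_pixel; // texels per GOB row
    const u32 x_blocks = div_ceil(width, x_per_gob);    // blocks per row of the image
    const u32 block_size = GOB_SIZE * gobs_in_block;    // bytes per block
    const u32 stride = block_size * x_blocks;           // bytes per row of blocks
    const u32 base = (dst_y / y_blocks) * stride + (dst_x / x_per_gob) * block_size;
    const u32 relative_y = dst_y % y_blocks;
    return base + (relative_y / GOB_SIZE_Y) * GOB_SIZE;
}

int main() {
    // Hypothetical 512-texel-wide RGBA8 surface with block_height shift 4 (16 GOBs per block).
    const u64 offset = GetGOBOffset(512, 100, 200, 4, 4);
    std::printf("GOB offset: %llu\n", static_cast<unsigned long long>(offset));
}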

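The texture cache also gains two-stage bookkeeping for asynchronous flushes: AsyncFlushSurface() collects surfaces into an uncommitted batch, CommitAsyncFlushes() seals that batch onto a queue, and PopAsyncFlushes() flushes the oldest sealed batch. Below is a minimal sketch of the same pattern, with a stand-in Surface type and printf in place of FlushSurface(); the names outside the patch (AsyncFlushQueue, QueueFlush, Commit, Pop) are hypothetical.

#include <cstdio>
#include <list>
#include <memory>

struct Surface {
    int id; // stand-in for TSurface
};

class AsyncFlushQueue {
public:
    // Mirrors AsyncFlushSurface(): lazily create the uncommitted batch and append.
    void QueueFlush(Surface surface) {
        if (!uncommitted) {
            uncommitted = std::make_shared<std::list<Surface>>();
        }
        uncommitted->push_back(surface);
    }

    // Mirrors CommitAsyncFlushes(): seal the current batch (possibly null).
    void Commit() {
        committed.push_back(uncommitted);
        uncommitted.reset();
    }

    // Mirrors ShouldWaitAsyncFlushes(): only non-null front batches need work.
    bool ShouldWait() const {
        return !committed.empty() && committed.front() != nullptr;
    }

    // Mirrors PopAsyncFlushes(): flush every surface in the oldest sealed batch.
    void Pop() {
        if (committed.empty()) {
            return;
        }
        const auto batch = committed.front();
        if (batch) {
            for (const Surface& surface : *batch) {
                std::printf("flush surface %d\n", surface.id); // stand-in for FlushSurface()
            }
        }
        committed.pop_front();
    }

private:
    std::shared_ptr<std::list<Surface>> uncommitted;
    std::list<std::shared_ptr<std::list<Surface>>> committed;
};

int main() {
    AsyncFlushQueue queue;
    queue.QueueFlush({1});
    queue.QueueFlush({2});
    queue.Commit();
    if (queue.ShouldWait()) {
        queue.Pop();
    }
}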