diff options
Diffstat (limited to 'src/video_core')
40 files changed, 601 insertions, 273 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index f9454bbaa..e31eb30c0 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -293,6 +293,7 @@ endif() if (MSVC) target_compile_options(video_core PRIVATE /we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data + /we4244 # 'var' : conversion from integer to 'type', possible loss of data /we4456 # Declaration of 'identifier' hides previous local declaration /we4457 # Declaration of 'identifier' hides function parameter /we4458 # Declaration of 'identifier' hides class member diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index a39505903..b121d36a3 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -256,6 +256,16 @@ public: stream_score += score; } + /// Sets the new frame tick + void SetFrameTick(u64 new_frame_tick) noexcept { + frame_tick = new_frame_tick; + } + + /// Returns the new frame tick + [[nodiscard]] u64 FrameTick() const noexcept { + return frame_tick; + } + /// Returns the likeliness of this being a stream buffer [[nodiscard]] int StreamScore() const noexcept { return stream_score; @@ -586,6 +596,7 @@ private: RasterizerInterface* rasterizer = nullptr; VAddr cpu_addr = 0; Words words; + u64 frame_tick = 0; BufferFlagBits flags{}; int stream_score = 0; }; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d371b842f..cad7f902d 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -18,6 +18,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" +#include "common/literals.h" #include "common/microprofile.h" #include "common/scope_exit.h" #include "common/settings.h" @@ -47,8 +48,11 @@ constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; constexpr u32 NUM_STORAGE_BUFFERS = 16; constexpr u32 NUM_STAGES = 5; +using namespace Common::Literals; + template <typename P> class BufferCache { + // Page size for caching purposes. // This is unrelated to the CPU page size and it can be changed as it seems optimal. static constexpr u32 PAGE_BITS = 16; @@ -65,6 +69,9 @@ class BufferCache { static constexpr BufferId NULL_BUFFER_ID{0}; + static constexpr u64 EXPECTED_MEMORY = 512_MiB; + static constexpr u64 CRITICAL_MEMORY = 1_GiB; + using Maxwell = Tegra::Engines::Maxwell3D::Regs; using Runtime = typename P::Runtime; @@ -92,7 +99,7 @@ class BufferCache { }; public: - static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = 4096; + static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB); explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_, @@ -188,6 +195,8 @@ private: ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); } + void RunGarbageCollector(); + void BindHostIndexBuffer(); void BindHostVertexBuffers(); @@ -243,6 +252,8 @@ private: template <bool insert> void ChangeRegister(BufferId buffer_id); + void TouchBuffer(Buffer& buffer) const noexcept; + bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); @@ -255,6 +266,10 @@ private: void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); + void DownloadBufferMemory(Buffer& buffer_id); + + void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); + void DeleteBuffer(BufferId buffer_id); void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); @@ -319,6 +334,10 @@ private: size_t immediate_buffer_capacity = 0; std::unique_ptr<u8[]> immediate_buffer_alloc; + typename SlotVector<Buffer>::Iterator deletion_iterator; + u64 frame_tick = 0; + u64 total_used_memory = 0; + std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; }; @@ -332,6 +351,28 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); + deletion_iterator = slot_buffers.end(); +} + +template <class P> +void BufferCache<P>::RunGarbageCollector() { + const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY; + const u64 ticks_to_destroy = aggressive_gc ? 60 : 120; + int num_iterations = aggressive_gc ? 64 : 32; + for (; num_iterations > 0; --num_iterations) { + if (deletion_iterator == slot_buffers.end()) { + deletion_iterator = slot_buffers.begin(); + } + ++deletion_iterator; + if (deletion_iterator == slot_buffers.end()) { + break; + } + const auto [buffer_id, buffer] = *deletion_iterator; + if (buffer->FrameTick() + ticks_to_destroy < frame_tick) { + DownloadBufferMemory(*buffer); + DeleteBuffer(buffer_id); + } + } } template <class P> @@ -349,6 +390,10 @@ void BufferCache<P>::TickFrame() { const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; + if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) { + RunGarbageCollector(); + } + ++frame_tick; delayed_destruction_ring.Tick(); } @@ -372,48 +417,7 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { template <class P> void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { - boost::container::small_vector<BufferCopy, 1> copies; - u64 total_size_bytes = 0; - u64 largest_copy = 0; - buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { - copies.push_back(BufferCopy{ - .src_offset = range_offset, - .dst_offset = total_size_bytes, - .size = range_size, - }); - total_size_bytes += range_size; - largest_copy = std::max(largest_copy, range_size); - }); - if (total_size_bytes == 0) { - return; - } - MICROPROFILE_SCOPE(GPU_DownloadMemory); - - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - const u8* const mapped_memory = download_staging.mapped_span.data(); - const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); - for (BufferCopy& copy : copies) { - // Modify copies to have the staging offset in mind - copy.dst_offset += download_staging.offset; - } - runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); - runtime.Finish(); - for (const BufferCopy& copy : copies) { - const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* copy_mapped_memory = mapped_memory + dst_offset; - cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); - } - } else { - const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); - for (const BufferCopy& copy : copies) { - buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); - const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); - } - } + DownloadBufferMemory(buffer, cpu_addr, size); }); } @@ -640,6 +644,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { template <class P> void BufferCache<P>::BindHostIndexBuffer() { Buffer& buffer = slot_buffers[index_buffer.buffer_id]; + TouchBuffer(buffer); const u32 offset = buffer.Offset(index_buffer.cpu_addr); const u32 size = index_buffer.size; SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); @@ -658,6 +663,7 @@ void BufferCache<P>::BindHostVertexBuffers() { for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { const Binding& binding = vertex_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); if (!flags[Dirty::VertexBuffer0 + index]) { continue; @@ -693,6 +699,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 const VAddr cpu_addr = binding.cpu_addr; const u32 size = binding.size; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && !buffer.IsRegionGpuModified(cpu_addr, size); @@ -744,6 +751,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { const Binding& binding = storage_buffers[stage][index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -766,6 +774,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() { for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { const Binding& binding = transform_feedback_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -784,6 +793,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() { ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { const Binding& binding = compute_uniform_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -803,6 +813,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() { ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { const Binding& binding = compute_storage_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -1101,6 +1112,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); const u32 size = static_cast<u32>(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); + TouchBuffer(slot_buffers[new_buffer_id]); for (const BufferId overlap_id : overlap.ids) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } @@ -1122,8 +1134,14 @@ template <class P> template <bool insert> void BufferCache<P>::ChangeRegister(BufferId buffer_id) { const Buffer& buffer = slot_buffers[buffer_id]; + const auto size = buffer.SizeBytes(); + if (insert) { + total_used_memory += Common::AlignUp(size, 1024); + } else { + total_used_memory -= Common::AlignUp(size, 1024); + } const VAddr cpu_addr_begin = buffer.CpuAddr(); - const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); + const VAddr cpu_addr_end = cpu_addr_begin + size; const u64 page_begin = cpu_addr_begin / PAGE_SIZE; const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); for (u64 page = page_begin; page != page_end; ++page) { @@ -1136,6 +1154,11 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { } template <class P> +void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept { + buffer.SetFrameTick(frame_tick); +} + +template <class P> bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { if (buffer.CpuAddr() == 0) { return true; @@ -1212,6 +1235,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, } template <class P> +void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) { + DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes()); +} + +template <class P> +void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) { + boost::container::small_vector<BufferCopy, 1> copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + return; + } + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + const u8* const mapped_memory = download_staging.mapped_span.data(); + const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); + for (BufferCopy& copy : copies) { + // Modify copies to have the staging offset in mind + copy.dst_offset += download_staging.offset; + } + runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); + runtime.Finish(); + for (const BufferCopy& copy : copies) { + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* copy_mapped_memory = mapped_memory + dst_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); + } + } else { + const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); + for (const BufferCopy& copy : copies) { + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); + } + } +} + +template <class P> void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { const auto scalar_replace = [buffer_id](Binding& binding) { if (binding.buffer_id == buffer_id) { @@ -1236,6 +1310,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { Unregister(buffer_id); delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); + slot_buffers.erase(buffer_id); NotifyBufferDeletion(); } diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h index 8a2a6c360..3e135a2a6 100644 --- a/src/video_core/command_classes/codecs/codec.h +++ b/src/video_core/command_classes/codecs/codec.h @@ -14,10 +14,18 @@ extern "C" { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #endif +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4242) // conversion from 'type' to 'type', possible loss of data +#pragma warning(disable : 4244) // conversion from 'type' to 'type', possible loss of data +#endif #include <libavcodec/avcodec.h> #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif } namespace Tegra { diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index 0a8b82f2b..5faf8c0f1 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp @@ -3,7 +3,28 @@ // Refer to the license.txt file included. #include <array> + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#ifdef _MSC_VER +#pragma warning(disable : 4244) // conversion from 'type' to 'type', possible loss of data +#pragma warning(push) +#endif +#include <libswscale/swscale.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} + #include "common/assert.h" +#include "common/logging/log.h" + #include "video_core/command_classes/nvdec.h" #include "video_core/command_classes/vic.h" #include "video_core/engines/maxwell_3d.h" @@ -11,10 +32,6 @@ #include "video_core/memory_manager.h" #include "video_core/textures/decoders.h" -extern "C" { -#include <libswscale/swscale.h> -} - namespace Tegra { Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index ffed42a29..335383955 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -242,6 +242,7 @@ public: return 4; default: UNREACHABLE(); + return 1; } } diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index eaba1b103..c37f15bfd 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -11,12 +11,8 @@ #define UNIFORM(n) #define BINDING_INPUT_BUFFER 0 #define BINDING_ENC_BUFFER 1 -#define BINDING_6_TO_8_BUFFER 2 -#define BINDING_7_TO_8_BUFFER 3 -#define BINDING_8_TO_8_BUFFER 4 -#define BINDING_BYTE_TO_16_BUFFER 5 -#define BINDING_SWIZZLE_BUFFER 6 -#define BINDING_OUTPUT_IMAGE 7 +#define BINDING_SWIZZLE_BUFFER 2 +#define BINDING_OUTPUT_IMAGE 3 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv @@ -26,10 +22,6 @@ #define BINDING_SWIZZLE_BUFFER 0 #define BINDING_INPUT_BUFFER 1 #define BINDING_ENC_BUFFER 2 -#define BINDING_6_TO_8_BUFFER 3 -#define BINDING_7_TO_8_BUFFER 4 -#define BINDING_8_TO_8_BUFFER 5 -#define BINDING_BYTE_TO_16_BUFFER 6 #define BINDING_OUTPUT_IMAGE 0 #endif @@ -76,19 +68,6 @@ layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { EncodingData encoding_values[]; }; -// ASTC Precompiled tables -layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 { - uint REPLICATE_6_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 { - uint REPLICATE_7_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 { - uint REPLICATE_8_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 { - uint REPLICATE_BYTE_TO_16_TABLE[]; -}; layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; @@ -139,6 +118,19 @@ const uint REPLICATE_4_BIT_TO_6_TABLE[16] = const uint REPLICATE_5_BIT_TO_6_TABLE[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63); +const uint REPLICATE_6_BIT_TO_8_TABLE[64] = + uint[](0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 65, 69, 73, 77, 81, 85, 89, + 93, 97, 101, 105, 109, 113, 117, 121, 125, 130, 134, 138, 142, 146, 150, 154, 158, 162, + 166, 170, 174, 178, 182, 186, 190, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, + 239, 243, 247, 251, 255); +const uint REPLICATE_7_BIT_TO_8_TABLE[128] = + uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, + 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, + 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, + 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163, + 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, + 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, + 237, 239, 241, 243, 245, 247, 249, 251, 253, 255); // Input ASTC texture globals uint current_index = 0; @@ -207,8 +199,7 @@ uint Replicate(uint val, uint num_bits, uint to_bit) { } uvec4 ReplicateByteTo16(uvec4 value) { - return uvec4(REPLICATE_BYTE_TO_16_TABLE[value.x], REPLICATE_BYTE_TO_16_TABLE[value.y], - REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]); + return value * 0x101; } uint ReplicateBitTo7(uint value) { @@ -236,7 +227,7 @@ uint FastReplicateTo8(uint value, uint num_bits) { case 7: return REPLICATE_7_BIT_TO_8_TABLE[value]; case 8: - return REPLICATE_8_BIT_TO_8_TABLE[value]; + return value; } return Replicate(value, num_bits, 8); } @@ -1327,6 +1318,9 @@ void main() { offset += swizzle; const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1)); + if (any(greaterThanEqual(coord, imageSize(dest_image)))) { + return; + } uint block_index = pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index f968b5b16..07939432f 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -4,10 +4,10 @@ #pragma once -#include <atomic> #include <functional> #include <optional> #include <span> +#include <stop_token> #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" @@ -123,7 +123,7 @@ public: virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {} /// Initialize disk cached resources for the game being emulated - virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + virtual void LoadDiskResources(u64 title_id, std::stop_token stop_loading, const DiskResourceLoadCallback& callback) {} /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index 3e4d88c30..e8d8d2aa5 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -454,7 +454,7 @@ private: template <typename... Args> void AddExpression(std::string_view text, Args&&... args) { - shader_source += fmt::format(text, std::forward<Args>(args)...); + shader_source += fmt::format(fmt::runtime(text), std::forward<Args>(args)...); } template <typename... Args> diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f87bb269b..eb8bdaa85 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -351,7 +351,7 @@ void RasterizerOpenGL::SetupShaders(bool is_indexed) { } } -void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, +void RasterizerOpenGL::LoadDiskResources(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { shader_cache.LoadDiskCache(title_id, stop_loading, callback); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 76298517f..9995a563b 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -94,7 +94,7 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + void LoadDiskResources(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; /// Returns true when there are commands queued to the OpenGL server. diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 5cf7cd151..5a01c59ec 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -331,7 +331,7 @@ ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer_, ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; -void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, +void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { disk_cache.BindTitleID(title_id); const std::optional transferable = disk_cache.LoadTransferable(); @@ -372,7 +372,7 @@ void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop const auto scope = context->Acquire(); for (std::size_t i = begin; i < end; ++i) { - if (stop_loading) { + if (stop_loading.stop_requested()) { return; } const auto& entry = (*transferable)[i]; @@ -435,7 +435,7 @@ void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop precompiled_cache_altered = true; return; } - if (stop_loading) { + if (stop_loading.stop_requested()) { return; } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 2aed0697e..b30308b6f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -127,7 +127,7 @@ public: ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game - void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, + void LoadDiskCache(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index ac78d344c..9c28498e8 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -96,7 +96,7 @@ public: // etc). template <typename... Args> void AddLine(std::string_view text, Args&&... args) { - AddExpression(fmt::format(text, std::forward<Args>(args)...)); + AddExpression(fmt::format(fmt::runtime(text), std::forward<Args>(args)...)); AddNewLine(); } diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 6dbb6bfba..2e67922a6 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -12,12 +12,15 @@ #include <glad/glad.h> #include "common/common_types.h" +#include "common/literals.h" #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { +using namespace Common::Literals; + class StreamBuffer { - static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024; + static constexpr size_t STREAM_BUFFER_SIZE = 64_MiB; static constexpr size_t NUM_SYNCS = 16; static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; static constexpr size_t MAX_ALIGNMENT = 256; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 9b4038615..23948feed 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -737,6 +737,8 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, } } +Image::~Image() = default; + void Image::UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies) { glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index df8be12ff..25fe61566 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -143,6 +143,14 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + ~Image(); + + Image(const Image&) = delete; + Image& operator=(const Image&) = delete; + + Image(Image&&) = default; + Image& operator=(Image&&) = default; + void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); @@ -235,6 +243,7 @@ struct TextureCacheParams { static constexpr bool ENABLE_VALIDATION = true; static constexpr bool FRAMEBUFFER_BLITS = true; static constexpr bool HAS_EMULATED_COPIES = true; + static constexpr bool HAS_DEVICE_MEMORY_INFO = false; using Runtime = OpenGL::TextureCacheRuntime; using Image = OpenGL::Image; diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 47fddcb6e..abaf1ee6a 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -69,7 +69,8 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) swizzle_table_buffer.Create(); astc_buffer.Create(); glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); - glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_BUFFER_DATA), &ASTC_BUFFER_DATA, 0); + glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_ENCODINGS_VALUES), &ASTC_ENCODINGS_VALUES, + 0); } UtilShaders::~UtilShaders() = default; @@ -79,12 +80,6 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; static constexpr GLuint BINDING_INPUT_BUFFER = 1; static constexpr GLuint BINDING_ENC_BUFFER = 2; - - static constexpr GLuint BINDING_6_TO_8_BUFFER = 3; - static constexpr GLuint BINDING_7_TO_8_BUFFER = 4; - static constexpr GLuint BINDING_8_TO_8_BUFFER = 5; - static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; - static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; const Extent2D tile_size{ @@ -93,21 +88,7 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, }; program_manager.BindHostCompute(astc_decoder_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, encoding_values), - sizeof(AstcBufferData::encoding_values)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_6_to_8), - sizeof(AstcBufferData::replicate_6_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_7_to_8), - sizeof(AstcBufferData::replicate_7_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_8_to_8), - sizeof(AstcBufferData::replicate_8_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_byte_to_16), - sizeof(AstcBufferData::replicate_byte_to_16)); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(1, tile_size.width, tile_size.height); @@ -137,6 +118,12 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); } + // Precautionary barrier to ensure the compute shader is done decoding prior to texture access. + // GL_TEXTURE_FETCH_BARRIER_BIT and GL_SHADER_IMAGE_ACCESS_BARRIER_BIT are used in a separate + // glMemoryBarrier call by the texture cache runtime + glMemoryBarrier(GL_UNIFORM_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | GL_PIXEL_BUFFER_BARRIER_BIT | + GL_TEXTURE_UPDATE_BARRIER_BIT | GL_BUFFER_UPDATE_BARRIER_BIT | + GL_SHADER_STORAGE_BARRIER_BIT | GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT); program_manager.RestoreGuestCompute(); } diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 8cb65e588..0df4e1a1c 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -55,8 +55,9 @@ size_t BytesPerIndex(VkIndexType index_type) { template <typename T> std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) { std::array<T, 6> indices{0, 1, 2, 0, 2, 3}; - std::ranges::transform(indices, indices.begin(), - [quad, first](u32 index) { return first + index + quad * 4; }); + for (T& index : indices) { + index = static_cast<T>(first + index + quad * 4); + } return indices; } } // Anonymous namespace diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index e11406e58..205cd3b05 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -30,19 +30,16 @@ namespace Vulkan { using Tegra::Texture::SWIZZLE_TABLE; -using Tegra::Texture::ASTC::EncodingsValues; +using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES; using namespace Tegra::Texture::ASTC; namespace { constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; constexpr u32 ASTC_BINDING_ENC_BUFFER = 1; -constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 2; -constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 3; -constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 4; -constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 5; -constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 6; -constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 7; +constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2; +constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3; +constexpr size_t ASTC_NUM_BINDINGS = 4; VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { return { @@ -71,7 +68,7 @@ std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBinding }}; } -std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() { +std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> BuildASTCDescriptorSetBindings() { return {{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -88,34 +85,6 @@ std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() { .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_6_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = ASTC_BINDING_7_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = ASTC_BINDING_8_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { .binding = ASTC_BINDING_SWIZZLE_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, @@ -143,7 +112,8 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { }; } -std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateTemplateEntry() { +std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS> +BuildASTCPassDescriptorUpdateTemplateEntry() { return {{ { .dstBinding = ASTC_BINDING_INPUT_BUFFER, @@ -162,38 +132,6 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT .stride = sizeof(DescriptorUpdateEntry), }, { - .dstBinding = ASTC_BINDING_6_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_6_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_7_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_7_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_8_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_8_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_BYTE_TO_16_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_BYTE_TO_16_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, .dstArrayElement = 0, .descriptorCount = 1, @@ -222,15 +160,6 @@ struct AstcPushConstants { u32 block_height_mask; }; -struct AstcBufferData { - decltype(SWIZZLE_TABLE) swizzle_table_buffer = SWIZZLE_TABLE; - decltype(EncodingsValues) encoding_values = EncodingsValues; - decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE; - decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE; - decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE; - decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; -} constexpr ASTC_BUFFER_DATA; - } // Anonymous namespace VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, @@ -423,7 +352,7 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, ASTCDecoderPass::~ASTCDecoderPass() = default; void ASTCDecoderPass::MakeDataBuffer() { - constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_BUFFER_DATA) + sizeof(SWIZZLE_TABLE); + constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE); data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, @@ -437,9 +366,10 @@ void ASTCDecoderPass::MakeDataBuffer() { data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); - std::memcpy(staging_ref.mapped_span.data(), &ASTC_BUFFER_DATA, sizeof(ASTC_BUFFER_DATA)); + std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES, + sizeof(ASTC_ENCODINGS_VALUES)); // Tack on the swizzle table at the end of the buffer - std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_BUFFER_DATA), &SWIZZLE_TABLE, + std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE)); scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, @@ -509,18 +439,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, encoding_values), - sizeof(AstcBufferData::encoding_values)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_6_to_8), - sizeof(AstcBufferData::replicate_6_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_7_to_8), - sizeof(AstcBufferData::replicate_7_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_8_to_8), - sizeof(AstcBufferData::replicate_8_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, replicate_byte_to_16), - sizeof(AstcBufferData::replicate_byte_to_16)); - update_descriptor_queue.AddBuffer(*data_buffer, sizeof(AstcBufferData), + update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES)); + update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES), sizeof(SWIZZLE_TABLE)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); @@ -569,6 +489,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier); }); + scheduler.Finish(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index db78ce3d9..6852c11b0 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -2,8 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <atomic> -#include <chrono> +#include <thread> #include "common/settings.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" @@ -12,8 +11,6 @@ namespace Vulkan { -using namespace std::chrono_literals; - MasterSemaphore::MasterSemaphore(const Device& device) { static constexpr VkSemaphoreTypeCreateInfoKHR semaphore_type_ci{ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR, @@ -34,9 +31,9 @@ MasterSemaphore::MasterSemaphore(const Device& device) { // Validation layers have a bug where they fail to track resource usage when using timeline // semaphores and synchronizing with GetSemaphoreCounterValueKHR. To workaround this issue, have // a separate thread waiting for each timeline semaphore value. - debug_thread = std::thread([this] { + debug_thread = std::jthread([this](std::stop_token stop_token) { u64 counter = 0; - while (!shutdown) { + while (!stop_token.stop_requested()) { if (semaphore.Wait(counter, 10'000'000)) { ++counter; } @@ -44,13 +41,6 @@ MasterSemaphore::MasterSemaphore(const Device& device) { }); } -MasterSemaphore::~MasterSemaphore() { - shutdown = true; - - // This thread might not be started - if (debug_thread.joinable()) { - debug_thread.join(); - } -} +MasterSemaphore::~MasterSemaphore() = default; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 4b6d64daa..ee3cd35d0 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -65,11 +65,10 @@ public: } private: - vk::Semaphore semaphore; ///< Timeline semaphore. - std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. - std::atomic<u64> current_tick{1}; ///< Current logical tick. - std::atomic<bool> shutdown{false}; ///< True when the object is being destroyed. - std::thread debug_thread; ///< Debug thread to workaround validation layer bugs. + vk::Semaphore semaphore; ///< Timeline semaphore. + std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. + std::atomic<u64> current_tick{1}; ///< Current logical tick. + std::jthread debug_thread; ///< Debug thread to workaround validation layer bugs. }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 7a1232497..0412b5234 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -12,6 +12,7 @@ #include "common/assert.h" #include "common/bit_util.h" #include "common/common_types.h" +#include "common/literals.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/vulkan_common/vulkan_device.h" @@ -19,12 +20,15 @@ namespace Vulkan { namespace { + +using namespace Common::Literals; + // Maximum potential alignment of a Vulkan buffer constexpr VkDeviceSize MAX_ALIGNMENT = 256; // Maximum size to put elements in the stream buffer -constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024; +constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB; // Stream buffer size in bytes -constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024; +constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB; constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; constexpr VkMemoryPropertyFlags HOST_FLAGS = diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index a09fe084e..7b4875d0e 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -10,6 +10,7 @@ #include "common/alignment.h" #include "common/assert.h" +#include "common/literals.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/vulkan_common/vulkan_device.h" @@ -19,6 +20,8 @@ namespace Vulkan { namespace { +using namespace Common::Literals; + constexpr VkBufferUsageFlags BUFFER_USAGE = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; @@ -26,7 +29,7 @@ constexpr VkBufferUsageFlags BUFFER_USAGE = constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; -constexpr u64 PREFERRED_STREAM_BUFFER_SIZE = 256 * 1024 * 1024; +constexpr u64 PREFERRED_STREAM_BUFFER_SIZE = 256_MiB; /// Find a memory type with the passed requirements std::optional<u32> FindMemoryType(const VkPhysicalDeviceMemoryProperties& properties, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 52860b4cf..a2ab4d1ee 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -818,6 +818,10 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src, }); } +u64 TextureCacheRuntime::GetDeviceLocalMemory() const { + return device.GetDeviceLocalMemory(); +} + Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, @@ -876,6 +880,8 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ } } +Image::~Image() = default; + void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { // TODO: Move this to another API scheduler->RequestOutsideRenderPassOperationContext(); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 4a57d378b..172bcdf98 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -97,6 +97,8 @@ struct TextureCacheRuntime { // All known Vulkan drivers can natively handle BGR textures return true; } + + u64 GetDeviceLocalMemory() const; }; class Image : public VideoCommon::ImageBase { @@ -104,6 +106,14 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + ~Image(); + + Image(const Image&) = delete; + Image& operator=(const Image&) = delete; + + Image(Image&&) = default; + Image& operator=(Image&&) = default; + void UploadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferImageCopy> copies); @@ -257,6 +267,7 @@ struct TextureCacheParams { static constexpr bool ENABLE_VALIDATION = true; static constexpr bool FRAMEBUFFER_BLITS = false; static constexpr bool HAS_EMULATED_COPIES = false; + static constexpr bool HAS_DEVICE_MEMORY_INFO = true; using Runtime = Vulkan::TextureCacheRuntime; using Image = Vulkan::Image; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 6308aef94..eb1746265 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -283,4 +283,11 @@ std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; } +u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format) { + constexpr u64 RGBA8_PIXEL_SIZE = 4; + const u64 base_block_size = static_cast<u64>(DefaultBlockWidth(format)) * + static_cast<u64>(DefaultBlockHeight(format)) * RGBA8_PIXEL_SIZE; + return (base_size * base_block_size) / BytesPerBlock(format); +} + } // namespace VideoCore::Surface diff --git a/src/video_core/surface.h b/src/video_core/surface.h index c40ab89d0..1503db81f 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -462,4 +462,6 @@ bool IsPixelFormatSRGB(PixelFormat format); std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); +u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format); + } // namespace VideoCore::Surface diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index 9914926b3..ad69d32d1 100644 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -113,6 +113,43 @@ void ImageBase::InsertView(const ImageViewInfo& view_info, ImageViewId image_vie image_view_ids.push_back(image_view_id); } +bool ImageBase::IsSafeDownload() const noexcept { + // Skip images that were not modified from the GPU + if (False(flags & ImageFlagBits::GpuModified)) { + return false; + } + // Skip images that .are. modified from the CPU + // We don't want to write sensitive data from the guest + if (True(flags & ImageFlagBits::CpuModified)) { + return false; + } + if (info.num_samples > 1) { + LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented"); + return false; + } + return true; +} + +void ImageBase::CheckBadOverlapState() { + if (False(flags & ImageFlagBits::BadOverlap)) { + return; + } + if (!overlapping_images.empty()) { + return; + } + flags &= ~ImageFlagBits::BadOverlap; +} + +void ImageBase::CheckAliasState() { + if (False(flags & ImageFlagBits::Alias)) { + return; + } + if (!aliased_images.empty()) { + return; + } + flags &= ~ImageFlagBits::Alias; +} + void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id) { static constexpr auto OPTIONS = RelaxedOptions::Size | RelaxedOptions::Format; ASSERT(lhs.info.type == rhs.info.type); diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index b7f3b7e43..e326cab71 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -25,6 +25,12 @@ enum class ImageFlagBits : u32 { Strong = 1 << 5, ///< Exists in the image table, the dimensions are can be trusted Registered = 1 << 6, ///< True when the image is registered Picked = 1 << 7, ///< Temporary flag to mark the image as picked + + // Garbage Collection Flags + BadOverlap = 1 << 8, ///< This image overlaps other but doesn't fit, has higher + ///< garbage collection priority + Alias = 1 << 9, ///< This image has aliases and has priority on garbage + ///< collection }; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) @@ -44,11 +50,16 @@ struct ImageBase { void InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id); + [[nodiscard]] bool IsSafeDownload() const noexcept; + [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept { const VAddr overlap_end = overlap_cpu_addr + overlap_size; return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; } + void CheckBadOverlapState(); + void CheckAliasState(); + ImageInfo info; u32 guest_size_bytes = 0; @@ -72,6 +83,7 @@ struct ImageBase { std::vector<SubresourceBase> slice_subresources; std::vector<AliasedImage> aliased_images; + std::vector<ImageId> overlapping_images; }; struct ImageAllocBase { diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h index eae3be6ea..6180b8c0e 100644 --- a/src/video_core/texture_cache/slot_vector.h +++ b/src/video_core/texture_cache/slot_vector.h @@ -5,6 +5,7 @@ #pragma once #include <array> +#include <bit> #include <concepts> #include <numeric> #include <type_traits> @@ -32,6 +33,60 @@ template <class T> requires std::is_nothrow_move_assignable_v<T>&& std::is_nothrow_move_constructible_v<T> class SlotVector { public: + class Iterator { + friend SlotVector<T>; + + public: + constexpr Iterator() = default; + + Iterator& operator++() noexcept { + const u64* const bitset = slot_vector->stored_bitset.data(); + const u32 size = static_cast<u32>(slot_vector->stored_bitset.size()) * 64; + if (id.index < size) { + do { + ++id.index; + } while (id.index < size && !IsValid(bitset)); + if (id.index == size) { + id.index = SlotId::INVALID_INDEX; + } + } + return *this; + } + + Iterator operator++(int) noexcept { + const Iterator copy{*this}; + ++*this; + return copy; + } + + bool operator==(const Iterator& other) const noexcept { + return id.index == other.id.index; + } + + bool operator!=(const Iterator& other) const noexcept { + return id.index != other.id.index; + } + + std::pair<SlotId, T*> operator*() const noexcept { + return {id, std::addressof((*slot_vector)[id])}; + } + + T* operator->() const noexcept { + return std::addressof((*slot_vector)[id]); + } + + private: + Iterator(SlotVector<T>* slot_vector_, SlotId id_) noexcept + : slot_vector{slot_vector_}, id{id_} {} + + bool IsValid(const u64* bitset) const noexcept { + return ((bitset[id.index / 64] >> (id.index % 64)) & 1) != 0; + } + + SlotVector<T>* slot_vector; + SlotId id; + }; + ~SlotVector() noexcept { size_t index = 0; for (u64 bits : stored_bitset) { @@ -70,6 +125,20 @@ public: ResetStorageBit(id.index); } + [[nodiscard]] Iterator begin() noexcept { + const auto it = std::ranges::find_if(stored_bitset, [](u64 value) { return value != 0; }); + if (it == stored_bitset.end()) { + return end(); + } + const u32 word_index = static_cast<u32>(std::distance(it, stored_bitset.begin())); + const SlotId first_id{word_index * 64 + static_cast<u32>(std::countr_zero(*it))}; + return Iterator(this, first_id); + } + + [[nodiscard]] Iterator end() noexcept { + return Iterator(this, SlotId{SlotId::INVALID_INDEX}); + } + private: struct NonTrivialDummy { NonTrivialDummy() noexcept {} @@ -140,7 +209,6 @@ private: Entry* values = nullptr; size_t values_capacity = 0; - size_t values_size = 0; std::vector<u64> stored_bitset; std::vector<u32> free_list; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 59b7c678b..c7cfd02b6 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -19,9 +19,10 @@ #include <boost/container/small_vector.hpp> #include "common/alignment.h" -#include "common/common_funcs.h" #include "common/common_types.h" +#include "common/literals.h" #include "common/logging/log.h" +#include "common/settings.h" #include "video_core/compatible_formats.h" #include "video_core/delayed_destruction_ring.h" #include "video_core/dirty_flags.h" @@ -57,6 +58,7 @@ using VideoCore::Surface::PixelFormat; using VideoCore::Surface::PixelFormatFromDepthFormat; using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using VideoCore::Surface::SurfaceType; +using namespace Common::Literals; template <class P> class TextureCache { @@ -69,12 +71,17 @@ class TextureCache { static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS; /// True when some copies have to be emulated static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES; + /// True when the API can provide info about the memory of the device. + static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO; /// Image view ID for null descriptors static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0}; /// Sampler ID for bugged sampler ids static constexpr SamplerId NULL_SAMPLER_ID{0}; + static constexpr u64 DEFAULT_EXPECTED_MEMORY = 1_GiB; + static constexpr u64 DEFAULT_CRITICAL_MEMORY = 2_GiB; + using Runtime = typename P::Runtime; using Image = typename P::Image; using ImageAlloc = typename P::ImageAlloc; @@ -197,6 +204,9 @@ private: } } + /// Runs the Garbage Collector. + void RunGarbageCollector(); + /// Fills image_view_ids in the image views in indices void FillImageViews(DescriptorTable<TICEntry>& table, std::span<ImageViewId> cached_image_view_ids, std::span<const u32> indices, @@ -333,6 +343,10 @@ private: std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> page_table; bool has_deleted_images = false; + u64 total_used_memory = 0; + u64 minimum_memory; + u64 expected_memory; + u64 critical_memory; SlotVector<Image> slot_images; SlotVector<ImageView> slot_image_views; @@ -353,6 +367,7 @@ private: u64 modification_tick = 0; u64 frame_tick = 0; + typename SlotVector<Image>::Iterator deletion_iterator; }; template <class P> @@ -373,11 +388,94 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& // This way the null resource becomes a compile time constant void(slot_image_views.insert(runtime, NullImageParams{})); void(slot_samplers.insert(runtime, sampler_descriptor)); + + deletion_iterator = slot_images.begin(); + + if constexpr (HAS_DEVICE_MEMORY_INFO) { + const auto device_memory = runtime.GetDeviceLocalMemory(); + const u64 possible_expected_memory = (device_memory * 3) / 10; + const u64 possible_critical_memory = (device_memory * 6) / 10; + expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY); + critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY); + minimum_memory = 0; + } else { + // on OGL we can be more conservatives as the driver takes care. + expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB; + critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB; + minimum_memory = expected_memory; + } +} + +template <class P> +void TextureCache<P>::RunGarbageCollector() { + const bool high_priority_mode = total_used_memory >= expected_memory; + const bool aggressive_mode = total_used_memory >= critical_memory; + const u64 ticks_to_destroy = high_priority_mode ? 60 : 100; + int num_iterations = aggressive_mode ? 256 : (high_priority_mode ? 128 : 64); + for (; num_iterations > 0; --num_iterations) { + if (deletion_iterator == slot_images.end()) { + deletion_iterator = slot_images.begin(); + if (deletion_iterator == slot_images.end()) { + break; + } + } + auto [image_id, image_tmp] = *deletion_iterator; + Image* image = image_tmp; // fix clang error. + const bool is_alias = True(image->flags & ImageFlagBits::Alias); + const bool is_bad_overlap = True(image->flags & ImageFlagBits::BadOverlap); + const bool must_download = image->IsSafeDownload(); + bool should_care = is_bad_overlap || is_alias || (high_priority_mode && !must_download); + const u64 ticks_needed = + is_bad_overlap + ? ticks_to_destroy >> 4 + : ((should_care && aggressive_mode) ? ticks_to_destroy >> 1 : ticks_to_destroy); + should_care |= aggressive_mode; + if (should_care && image->frame_tick + ticks_needed < frame_tick) { + if (is_bad_overlap) { + const bool overlap_check = std::ranges::all_of( + image->overlapping_images, [&, image](const ImageId& overlap_id) { + auto& overlap = slot_images[overlap_id]; + return overlap.frame_tick >= image->frame_tick; + }); + if (!overlap_check) { + ++deletion_iterator; + continue; + } + } + if (!is_bad_overlap && must_download) { + const bool alias_check = std::ranges::none_of( + image->aliased_images, [&, image](const AliasedImage& alias) { + auto& alias_image = slot_images[alias.id]; + return (alias_image.frame_tick < image->frame_tick) || + (alias_image.modification_tick < image->modification_tick); + }); + + if (alias_check) { + auto map = runtime.DownloadStagingBuffer(image->unswizzled_size_bytes); + const auto copies = FullDownloadCopies(image->info); + image->DownloadMemory(map, copies); + runtime.Finish(); + SwizzleImage(gpu_memory, image->gpu_addr, image->info, copies, map.mapped_span); + } + } + if (True(image->flags & ImageFlagBits::Tracked)) { + UntrackImage(*image); + } + UnregisterImage(image_id); + DeleteImage(image_id); + if (is_bad_overlap) { + ++num_iterations; + } + } + ++deletion_iterator; + } } template <class P> void TextureCache<P>::TickFrame() { - // Tick sentenced resources in this order to ensure they are destroyed in the right order + if (Settings::values.use_caches_gc.GetValue() && total_used_memory > minimum_memory) { + RunGarbageCollector(); + } sentenced_images.Tick(); sentenced_framebuffers.Tick(); sentenced_image_view.Tick(); @@ -568,17 +666,7 @@ template <class P> void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { std::vector<ImageId> images; ForEachImageInRegion(cpu_addr, size, [this, &images](ImageId image_id, ImageBase& image) { - // Skip images that were not modified from the GPU - if (False(image.flags & ImageFlagBits::GpuModified)) { - return; - } - // Skip images that .are. modified from the CPU - // We don't want to write sensitive data from the guest - if (True(image.flags & ImageFlagBits::CpuModified)) { - return; - } - if (image.info.num_samples > 1) { - LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented"); + if (!image.IsSafeDownload()) { return; } image.flags &= ~ImageFlagBits::GpuModified; @@ -967,6 +1055,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA std::vector<ImageId> overlap_ids; std::vector<ImageId> left_aliased_ids; std::vector<ImageId> right_aliased_ids; + std::vector<ImageId> bad_overlap_ids; ForEachImageInRegion(cpu_addr, size_bytes, [&](ImageId overlap_id, ImageBase& overlap) { if (info.type != overlap.info.type) { return; @@ -992,9 +1081,14 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA const ImageBase new_image_base(new_info, gpu_addr, cpu_addr); if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) { left_aliased_ids.push_back(overlap_id); + overlap.flags |= ImageFlagBits::Alias; } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options, broken_views, native_bgr)) { right_aliased_ids.push_back(overlap_id); + overlap.flags |= ImageFlagBits::Alias; + } else { + bad_overlap_ids.push_back(overlap_id); + overlap.flags |= ImageFlagBits::BadOverlap; } }); const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); @@ -1022,10 +1116,18 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA for (const ImageId aliased_id : right_aliased_ids) { ImageBase& aliased = slot_images[aliased_id]; AddImageAlias(new_image_base, aliased, new_image_id, aliased_id); + new_image.flags |= ImageFlagBits::Alias; } for (const ImageId aliased_id : left_aliased_ids) { ImageBase& aliased = slot_images[aliased_id]; AddImageAlias(aliased, new_image_base, aliased_id, new_image_id); + new_image.flags |= ImageFlagBits::Alias; + } + for (const ImageId aliased_id : bad_overlap_ids) { + ImageBase& aliased = slot_images[aliased_id]; + aliased.overlapping_images.push_back(new_image_id); + new_image.overlapping_images.push_back(aliased_id); + new_image.flags |= ImageFlagBits::BadOverlap; } RegisterImage(new_image_id); return new_image_id; @@ -1195,6 +1297,13 @@ void TextureCache<P>::RegisterImage(ImageId image_id) { image.flags |= ImageFlagBits::Registered; ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) { page_table[page].push_back(image_id); }); + u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); + if ((IsPixelFormatASTC(image.info.format) && + True(image.flags & ImageFlagBits::AcceleratedUpload)) || + True(image.flags & ImageFlagBits::Converted)) { + tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); + } + total_used_memory += Common::AlignUp(tentative_size, 1024); } template <class P> @@ -1203,6 +1312,14 @@ void TextureCache<P>::UnregisterImage(ImageId image_id) { ASSERT_MSG(True(image.flags & ImageFlagBits::Registered), "Trying to unregister an already registered image"); image.flags &= ~ImageFlagBits::Registered; + image.flags &= ~ImageFlagBits::BadOverlap; + u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); + if ((IsPixelFormatASTC(image.info.format) && + True(image.flags & ImageFlagBits::AcceleratedUpload)) || + True(image.flags & ImageFlagBits::Converted)) { + tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); + } + total_used_memory -= Common::AlignUp(tentative_size, 1024); ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) { const auto page_it = page_table.find(page); if (page_it == page_table.end()) { @@ -1276,9 +1393,19 @@ void TextureCache<P>::DeleteImage(ImageId image_id) { std::erase_if(other_image.aliased_images, [image_id](const AliasedImage& other_alias) { return other_alias.id == image_id; }); + other_image.CheckAliasState(); ASSERT_MSG(num_removed_aliases == 1, "Invalid number of removed aliases: {}", num_removed_aliases); } + for (const ImageId overlap_id : image.overlapping_images) { + ImageBase& other_image = slot_images[overlap_id]; + [[maybe_unused]] const size_t num_removed_overlaps = std::erase_if( + other_image.overlapping_images, + [image_id](const ImageId other_overlap_id) { return other_overlap_id == image_id; }); + other_image.CheckBadOverlapState(); + ASSERT_MSG(num_removed_overlaps == 1, "Invalid number of removed overlapps: {}", + num_removed_overlaps); + } for (const ImageViewId image_view_id : image_view_ids) { sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); slot_image_views.erase(image_view_id); diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 6835fd747..4efe042b6 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -581,6 +581,8 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr for (s32 layer = 0; layer < info.resources.layers; ++layer) { const std::span<const u8> src = input.subspan(host_offset); + gpu_memory.ReadBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes()); + SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height, num_tiles.depth, block.height, block.depth); diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 9b2177ebd..7b756ba41 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -269,7 +269,7 @@ static void DecodeQuintBlock(InputBitStream& bits, IntegerEncodedVector& result, static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange, u32 nValues) { // Determine encoding parameters - IntegerEncodedValue val = EncodingsValues[maxRange]; + IntegerEncodedValue val = ASTC_ENCODINGS_VALUES[maxRange]; // Start decoding u32 nValsDecoded = 0; @@ -310,7 +310,7 @@ struct TexelWeightParams { nIdxs *= 2; } - return EncodingsValues[m_MaxWeight].GetBitLength(nIdxs); + return ASTC_ENCODINGS_VALUES[m_MaxWeight].GetBitLength(nIdxs); } u32 GetNumWeightValues() const { @@ -551,6 +551,8 @@ static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { } } } + +static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); static constexpr u32 ReplicateByteTo16(std::size_t value) { return REPLICATE_BYTE_TO_16_TABLE[value]; } @@ -753,12 +755,12 @@ static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, co // figure out the max value for each of them... u32 range = 256; while (--range > 0) { - IntegerEncodedValue val = EncodingsValues[range]; + IntegerEncodedValue val = ASTC_ENCODINGS_VALUES[range]; u32 bitLength = val.GetBitLength(nValues); if (bitLength <= nBitsForColorData) { // Find the smallest possible range that matches the given encoding while (--range > 0) { - IntegerEncodedValue newval = EncodingsValues[range]; + IntegerEncodedValue newval = ASTC_ENCODINGS_VALUES[range]; if (!newval.MatchesEncoding(val)) { break; } diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index c1c37dfe7..0229ae122 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -77,7 +77,7 @@ constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { return encodings; } -constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues(); +constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues(); // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] // is the same as [(num_bits - 1):0] and repeats all the way down. @@ -116,19 +116,10 @@ constexpr auto MakeReplicateTable() { return table; } -constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); -struct AstcBufferData { - decltype(EncodingsValues) encoding_values = EncodingsValues; - decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE; - decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE; - decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE; - decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; -} constexpr ASTC_BUFFER_DATA; - void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); diff --git a/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp index f0ee76519..758c038ba 100644 --- a/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp +++ b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp @@ -50,7 +50,7 @@ NsightAftermathTracker::NsightAftermathTracker() { } dump_dir = Common::FS::GetYuzuPath(Common::FS::YuzuPath::LogDir) / "gpucrash"; - void(Common::FS::RemoveDirRecursively(dump_dir)); + Common::FS::RemoveDirRecursively(dump_dir); if (!Common::FS::CreateDir(dump_dir)) { LOG_ERROR(Render_Vulkan, "Failed to create Nsight Aftermath dump directory"); return; diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index c52cd246b..f214510da 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -408,6 +408,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR } logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld); + CollectPhysicalMemoryInfo(); CollectTelemetryParameters(); CollectToolingInfo(); @@ -839,6 +840,17 @@ void Device::CollectTelemetryParameters() { } } +void Device::CollectPhysicalMemoryInfo() { + const auto mem_properties = physical.GetMemoryProperties(); + const size_t num_properties = mem_properties.memoryHeapCount; + device_access_memory = 0; + for (size_t element = 0; element < num_properties; ++element) { + if ((mem_properties.memoryHeaps[element].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) { + device_access_memory += mem_properties.memoryHeaps[element].size; + } + } +} + void Device::CollectToolingInfo() { if (!ext_tooling_info) { return; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index c5504467d..96c0f8c60 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -228,6 +228,10 @@ public: return use_asynchronous_shaders; } + u64 GetDeviceLocalMemory() const { + return device_access_memory; + } + private: /// Checks if the physical device is suitable. void CheckSuitability(bool requires_swapchain) const; @@ -247,6 +251,9 @@ private: /// Collects information about attached tools. void CollectToolingInfo(); + /// Collects information about the device's local memory. + void CollectPhysicalMemoryInfo(); + /// Returns a list of queue initialization descriptors. std::vector<VkDeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const; @@ -260,21 +267,22 @@ private: bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, FormatType format_type) const; - VkInstance instance; ///< Vulkan instance. - vk::DeviceDispatch dld; ///< Device function pointers. - vk::PhysicalDevice physical; ///< Physical device. - VkPhysicalDeviceProperties properties; ///< Device properties. - vk::Device logical; ///< Logical device. - vk::Queue graphics_queue; ///< Main graphics queue. - vk::Queue present_queue; ///< Main present queue. - u32 instance_version{}; ///< Vulkan onstance version. - u32 graphics_family{}; ///< Main graphics queue family index. - u32 present_family{}; ///< Main present queue family index. - VkDriverIdKHR driver_id{}; ///< Driver ID. - VkShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced.ed - bool is_optimal_astc_supported{}; ///< Support for native ASTC. - bool is_float16_supported{}; ///< Support for float16 arithmetics. - bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. + VkInstance instance; ///< Vulkan instance. + vk::DeviceDispatch dld; ///< Device function pointers. + vk::PhysicalDevice physical; ///< Physical device. + VkPhysicalDeviceProperties properties; ///< Device properties. + vk::Device logical; ///< Logical device. + vk::Queue graphics_queue; ///< Main graphics queue. + vk::Queue present_queue; ///< Main present queue. + u32 instance_version{}; ///< Vulkan onstance version. + u32 graphics_family{}; ///< Main graphics queue family index. + u32 present_family{}; ///< Main present queue family index. + VkDriverIdKHR driver_id{}; ///< Driver ID. + VkShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced. + u64 device_access_memory{}; ///< Total size of device local memory in bytes. + bool is_optimal_astc_supported{}; ///< Support for native ASTC. + bool is_float16_supported{}; ///< Support for float16 arithmetics. + bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. bool is_shader_storage_image_multisample{}; ///< Support for image operations on MSAA images. bool is_blit_depth_stencil_supported{}; ///< Support for blitting from and to depth stencil. diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index 5edd06ebc..aa173d19e 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -69,10 +69,10 @@ constexpr VkExportMemoryAllocateInfo EXPORT_ALLOCATE_INFO{ class MemoryAllocation { public: - explicit MemoryAllocation(vk::DeviceMemory memory_, VkMemoryPropertyFlags properties, - u64 allocation_size_, u32 type) - : memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties}, - shifted_memory_type{1U << type} {} + explicit MemoryAllocation(MemoryAllocator* const allocator_, vk::DeviceMemory memory_, + VkMemoryPropertyFlags properties, u64 allocation_size_, u32 type) + : allocator{allocator_}, memory{std::move(memory_)}, allocation_size{allocation_size_}, + property_flags{properties}, shifted_memory_type{1U << type} {} #if defined(_WIN32) || defined(__unix__) ~MemoryAllocation() { @@ -106,6 +106,10 @@ public: const auto it = std::ranges::find(commits, begin, &Range::begin); ASSERT_MSG(it != commits.end(), "Invalid commit"); commits.erase(it); + if (commits.empty()) { + // Do not call any code involving 'this' after this call, the object will be destroyed + allocator->ReleaseMemory(this); + } } [[nodiscard]] std::span<u8> Map() { @@ -171,6 +175,7 @@ private: return candidate; } + MemoryAllocator* const allocator; ///< Parent memory allocation. const vk::DeviceMemory memory; ///< Vulkan memory allocation handler. const u64 allocation_size; ///< Size of this allocation. const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags. @@ -275,10 +280,17 @@ bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, return false; } } - allocations.push_back(std::make_unique<MemoryAllocation>(std::move(memory), flags, size, type)); + allocations.push_back( + std::make_unique<MemoryAllocation>(this, std::move(memory), flags, size, type)); return true; } +void MemoryAllocator::ReleaseMemory(MemoryAllocation* alloc) { + const auto it = std::ranges::find(allocations, alloc, &std::unique_ptr<MemoryAllocation>::get); + ASSERT(it != allocations.end()); + allocations.erase(it); +} + std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements, VkMemoryPropertyFlags flags) { for (auto& allocation : allocations) { diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h index db12d02f4..b61e931e0 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.h +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h @@ -69,6 +69,8 @@ private: /// Memory allocator container. /// Allocates and releases memory allocations on demand. class MemoryAllocator { + friend MemoryAllocation; + public: /** * Construct memory allocator @@ -104,6 +106,9 @@ private: /// Tries to allocate a chunk of memory. bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size); + /// Releases a chunk of memory. + void ReleaseMemory(MemoryAllocation* alloc); + /// Tries to allocate a memory commit. std::optional<MemoryCommit> TryCommit(const VkMemoryRequirements& requirements, VkMemoryPropertyFlags flags); |
