Diffstat (limited to 'src/video_core')
46 files changed, 2308 insertions, 278 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 47190c464..e31eb30c0 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -237,6 +237,7 @@ add_library(video_core STATIC texture_cache/util.cpp texture_cache/util.h textures/astc.h + textures/astc.cpp textures/decoders.cpp textures/decoders.h textures/texture.cpp @@ -292,6 +293,7 @@ endif() if (MSVC) target_compile_options(video_core PRIVATE /we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data + /we4244 # 'var' : conversion from integer to 'type', possible loss of data /we4456 # Declaration of 'identifier' hides previous local declaration /we4457 # Declaration of 'identifier' hides function parameter /we4458 # Declaration of 'identifier' hides class member diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index a39505903..b121d36a3 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -256,6 +256,16 @@ public: stream_score += score; } + /// Sets the new frame tick + void SetFrameTick(u64 new_frame_tick) noexcept { + frame_tick = new_frame_tick; + } + + /// Returns the new frame tick + [[nodiscard]] u64 FrameTick() const noexcept { + return frame_tick; + } + /// Returns the likeliness of this being a stream buffer [[nodiscard]] int StreamScore() const noexcept { return stream_score; @@ -586,6 +596,7 @@ private: RasterizerInterface* rasterizer = nullptr; VAddr cpu_addr = 0; Words words; + u64 frame_tick = 0; BufferFlagBits flags{}; int stream_score = 0; }; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d371b842f..cad7f902d 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -18,6 +18,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" +#include "common/literals.h" #include "common/microprofile.h" #include "common/scope_exit.h" #include "common/settings.h" @@ -47,8 +48,11 @@ constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; constexpr u32 NUM_STORAGE_BUFFERS = 16; constexpr u32 NUM_STAGES = 5; +using namespace Common::Literals; + template <typename P> class BufferCache { + // Page size for caching purposes. // This is unrelated to the CPU page size and it can be changed as it seems optimal. 
static constexpr u32 PAGE_BITS = 16; @@ -65,6 +69,9 @@ class BufferCache { static constexpr BufferId NULL_BUFFER_ID{0}; + static constexpr u64 EXPECTED_MEMORY = 512_MiB; + static constexpr u64 CRITICAL_MEMORY = 1_GiB; + using Maxwell = Tegra::Engines::Maxwell3D::Regs; using Runtime = typename P::Runtime; @@ -92,7 +99,7 @@ class BufferCache { }; public: - static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = 4096; + static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB); explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_, @@ -188,6 +195,8 @@ private: ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); } + void RunGarbageCollector(); + void BindHostIndexBuffer(); void BindHostVertexBuffers(); @@ -243,6 +252,8 @@ private: template <bool insert> void ChangeRegister(BufferId buffer_id); + void TouchBuffer(Buffer& buffer) const noexcept; + bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); @@ -255,6 +266,10 @@ private: void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); + void DownloadBufferMemory(Buffer& buffer_id); + + void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); + void DeleteBuffer(BufferId buffer_id); void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); @@ -319,6 +334,10 @@ private: size_t immediate_buffer_capacity = 0; std::unique_ptr<u8[]> immediate_buffer_alloc; + typename SlotVector<Buffer>::Iterator deletion_iterator; + u64 frame_tick = 0; + u64 total_used_memory = 0; + std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; }; @@ -332,6 +351,28 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); + deletion_iterator = slot_buffers.end(); +} + +template <class P> +void BufferCache<P>::RunGarbageCollector() { + const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY; + const u64 ticks_to_destroy = aggressive_gc ? 60 : 120; + int num_iterations = aggressive_gc ? 64 : 32; + for (; num_iterations > 0; --num_iterations) { + if (deletion_iterator == slot_buffers.end()) { + deletion_iterator = slot_buffers.begin(); + } + ++deletion_iterator; + if (deletion_iterator == slot_buffers.end()) { + break; + } + const auto [buffer_id, buffer] = *deletion_iterator; + if (buffer->FrameTick() + ticks_to_destroy < frame_tick) { + DownloadBufferMemory(*buffer); + DeleteBuffer(buffer_id); + } + } } template <class P> @@ -349,6 +390,10 @@ void BufferCache<P>::TickFrame() { const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? 
DEFAULT_SKIP_CACHE_SIZE : 0; + if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) { + RunGarbageCollector(); + } + ++frame_tick; delayed_destruction_ring.Tick(); } @@ -372,48 +417,7 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { template <class P> void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { - boost::container::small_vector<BufferCopy, 1> copies; - u64 total_size_bytes = 0; - u64 largest_copy = 0; - buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { - copies.push_back(BufferCopy{ - .src_offset = range_offset, - .dst_offset = total_size_bytes, - .size = range_size, - }); - total_size_bytes += range_size; - largest_copy = std::max(largest_copy, range_size); - }); - if (total_size_bytes == 0) { - return; - } - MICROPROFILE_SCOPE(GPU_DownloadMemory); - - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - const u8* const mapped_memory = download_staging.mapped_span.data(); - const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); - for (BufferCopy& copy : copies) { - // Modify copies to have the staging offset in mind - copy.dst_offset += download_staging.offset; - } - runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); - runtime.Finish(); - for (const BufferCopy& copy : copies) { - const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* copy_mapped_memory = mapped_memory + dst_offset; - cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); - } - } else { - const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); - for (const BufferCopy& copy : copies) { - buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); - const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); - } - } + DownloadBufferMemory(buffer, cpu_addr, size); }); } @@ -640,6 +644,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { template <class P> void BufferCache<P>::BindHostIndexBuffer() { Buffer& buffer = slot_buffers[index_buffer.buffer_id]; + TouchBuffer(buffer); const u32 offset = buffer.Offset(index_buffer.cpu_addr); const u32 size = index_buffer.size; SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); @@ -658,6 +663,7 @@ void BufferCache<P>::BindHostVertexBuffers() { for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { const Binding& binding = vertex_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); if (!flags[Dirty::VertexBuffer0 + index]) { continue; @@ -693,6 +699,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 const VAddr cpu_addr = binding.cpu_addr; const u32 size = binding.size; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && !buffer.IsRegionGpuModified(cpu_addr, size); @@ -744,6 +751,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { const Binding& binding = storage_buffers[stage][index]; 
Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -766,6 +774,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() { for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { const Binding& binding = transform_feedback_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -784,6 +793,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() { ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { const Binding& binding = compute_uniform_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -803,6 +813,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() { ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { const Binding& binding = compute_storage_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -1101,6 +1112,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); const u32 size = static_cast<u32>(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); + TouchBuffer(slot_buffers[new_buffer_id]); for (const BufferId overlap_id : overlap.ids) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } @@ -1122,8 +1134,14 @@ template <class P> template <bool insert> void BufferCache<P>::ChangeRegister(BufferId buffer_id) { const Buffer& buffer = slot_buffers[buffer_id]; + const auto size = buffer.SizeBytes(); + if (insert) { + total_used_memory += Common::AlignUp(size, 1024); + } else { + total_used_memory -= Common::AlignUp(size, 1024); + } const VAddr cpu_addr_begin = buffer.CpuAddr(); - const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); + const VAddr cpu_addr_end = cpu_addr_begin + size; const u64 page_begin = cpu_addr_begin / PAGE_SIZE; const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); for (u64 page = page_begin; page != page_end; ++page) { @@ -1136,6 +1154,11 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { } template <class P> +void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept { + buffer.SetFrameTick(frame_tick); +} + +template <class P> bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { if (buffer.CpuAddr() == 0) { return true; @@ -1212,6 +1235,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, } template <class P> +void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) { + DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes()); +} + +template <class P> +void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) { + boost::container::small_vector<BufferCopy, 1> copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + 
return; + } + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + const u8* const mapped_memory = download_staging.mapped_span.data(); + const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); + for (BufferCopy& copy : copies) { + // Modify copies to have the staging offset in mind + copy.dst_offset += download_staging.offset; + } + runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); + runtime.Finish(); + for (const BufferCopy& copy : copies) { + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* copy_mapped_memory = mapped_memory + dst_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); + } + } else { + const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); + for (const BufferCopy& copy : copies) { + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); + } + } +} + +template <class P> void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { const auto scalar_replace = [buffer_id](Binding& binding) { if (binding.buffer_id == buffer_id) { @@ -1236,6 +1310,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { Unregister(buffer_id); delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); + slot_buffers.erase(buffer_id); NotifyBufferDeletion(); } diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h index 8a2a6c360..3e135a2a6 100644 --- a/src/video_core/command_classes/codecs/codec.h +++ b/src/video_core/command_classes/codecs/codec.h @@ -14,10 +14,18 @@ extern "C" { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #endif +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4242) // conversion from 'type' to 'type', possible loss of data +#pragma warning(disable : 4244) // conversion from 'type' to 'type', possible loss of data +#endif #include <libavcodec/avcodec.h> #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif } namespace Tegra { diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index 0a8b82f2b..5faf8c0f1 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp @@ -3,7 +3,28 @@ // Refer to the license.txt file included. 
#include <array> + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#ifdef _MSC_VER +#pragma warning(disable : 4244) // conversion from 'type' to 'type', possible loss of data +#pragma warning(push) +#endif +#include <libswscale/swscale.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} + #include "common/assert.h" +#include "common/logging/log.h" + #include "video_core/command_classes/nvdec.h" #include "video_core/command_classes/vic.h" #include "video_core/engines/maxwell_3d.h" @@ -11,10 +32,6 @@ #include "video_core/memory_manager.h" #include "video_core/textures/decoders.h" -extern "C" { -#include <libswscale/swscale.h> -} - namespace Tegra { Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index ffed42a29..335383955 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -242,6 +242,7 @@ public: return 4; default: UNREACHABLE(); + return 1; } } diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 703e34587..c37f15bfd 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -11,12 +11,8 @@ #define UNIFORM(n) #define BINDING_INPUT_BUFFER 0 #define BINDING_ENC_BUFFER 1 -#define BINDING_6_TO_8_BUFFER 2 -#define BINDING_7_TO_8_BUFFER 3 -#define BINDING_8_TO_8_BUFFER 4 -#define BINDING_BYTE_TO_16_BUFFER 5 -#define BINDING_SWIZZLE_BUFFER 6 -#define BINDING_OUTPUT_IMAGE 7 +#define BINDING_SWIZZLE_BUFFER 2 +#define BINDING_OUTPUT_IMAGE 3 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv @@ -26,10 +22,6 @@ #define BINDING_SWIZZLE_BUFFER 0 #define BINDING_INPUT_BUFFER 1 #define BINDING_ENC_BUFFER 2 -#define BINDING_6_TO_8_BUFFER 3 -#define BINDING_7_TO_8_BUFFER 4 -#define BINDING_8_TO_8_BUFFER 5 -#define BINDING_BYTE_TO_16_BUFFER 6 #define BINDING_OUTPUT_IMAGE 0 #endif @@ -76,19 +68,6 @@ layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { EncodingData encoding_values[]; }; -// ASTC Precompiled tables -layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 { - uint REPLICATE_6_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 { - uint REPLICATE_7_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 { - uint REPLICATE_8_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 { - uint REPLICATE_BYTE_TO_16_TABLE[]; -}; layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; @@ -139,6 +118,19 @@ const uint REPLICATE_4_BIT_TO_6_TABLE[16] = const uint REPLICATE_5_BIT_TO_6_TABLE[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63); +const uint REPLICATE_6_BIT_TO_8_TABLE[64] = + uint[](0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 65, 69, 73, 77, 81, 85, 89, + 93, 97, 101, 105, 109, 113, 117, 121, 125, 130, 134, 138, 142, 146, 150, 154, 158, 162, + 166, 170, 174, 178, 182, 186, 190, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, + 239, 243, 247, 251, 
255); +const uint REPLICATE_7_BIT_TO_8_TABLE[128] = + uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, + 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, + 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, + 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163, + 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, + 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, + 237, 239, 241, 243, 245, 247, 249, 251, 253, 255); // Input ASTC texture globals uint current_index = 0; @@ -207,8 +199,7 @@ uint Replicate(uint val, uint num_bits, uint to_bit) { } uvec4 ReplicateByteTo16(uvec4 value) { - return uvec4(REPLICATE_BYTE_TO_16_TABLE[value.x], REPLICATE_BYTE_TO_16_TABLE[value.y], - REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]); + return value * 0x101; } uint ReplicateBitTo7(uint value) { @@ -236,7 +227,7 @@ uint FastReplicateTo8(uint value, uint num_bits) { case 7: return REPLICATE_7_BIT_TO_8_TABLE[value]; case 8: - return REPLICATE_8_BIT_TO_8_TABLE[value]; + return value; } return Replicate(value, num_bits, 8); } @@ -763,7 +754,7 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { case 1: { READ_UINT_VALUES(2) uint L0 = (v[0] >> 2) | (v[1] & 0xC0); - uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU); + uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); ep1 = uvec4(0xFF, L0, L0, L0); ep2 = uvec4(0xFF, L1, L1, L1); break; @@ -1327,6 +1318,9 @@ void main() { offset += swizzle; const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1)); + if (any(greaterThanEqual(coord, imageSize(dest_image)))) { + return; + } uint block_index = pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index f968b5b16..07939432f 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -4,10 +4,10 @@ #pragma once -#include <atomic> #include <functional> #include <optional> #include <span> +#include <stop_token> #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" @@ -123,7 +123,7 @@ public: virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {} /// Initialize disk cached resources for the game being emulated - virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + virtual void LoadDiskResources(u64 title_id, std::stop_token stop_loading, const DiskResourceLoadCallback& callback) {} /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. 
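The astc_decoder.comp changes above drop the uploaded REPLICATE_*_TO_8 lookup buffers: the 6- and 7-bit tables become shader constants, the 8-bit case degenerates to the identity, and ReplicateByteTo16 becomes a multiply by 0x101. For reference, a minimal standalone C++ sketch of the bit-replication identity those tables encode; the function name and layout are illustrative, not from the codebase:

#include <cassert>
#include <cstdint>

// Replicate the low `bits` bits of `v` across 8 bits by repeating the bit
// pattern and truncating -- the identity behind the REPLICATE_*_TO_8 tables
// removed from astc_decoder.comp (and its Replicate() fallback).
constexpr std::uint32_t ReplicateTo8(std::uint32_t v, unsigned bits) {
    std::uint32_t result = 0;
    unsigned filled = 0;
    while (filled < 8) {            // repeat the pattern until >= 8 bits exist
        result = (result << bits) | v;
        filled += bits;
    }
    return result >> (filled - 8);  // drop the excess low bits
}

int main() {
    assert(ReplicateTo8(0x3F, 6) == 255); // REPLICATE_6_BIT_TO_8_TABLE[63]
    assert(ReplicateTo8(0x20, 6) == 130); // REPLICATE_6_BIT_TO_8_TABLE[32]
    assert(ReplicateTo8(0xFF, 8) == 255); // 8-bit case is the identity
    // Replicating a byte to 16 bits is a multiply, as the shader now does:
    assert(0xAB * 0x101 == 0xABAB);
}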
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 320ee8d30..63d8ad42a 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -42,6 +42,8 @@ public: [[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0; + [[nodiscard]] virtual std::string GetDeviceVendor() const = 0; + // Getter/setter functions: // ------------------------ diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index 3e4d88c30..e8d8d2aa5 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -454,7 +454,7 @@ private: template <typename... Args> void AddExpression(std::string_view text, Args&&... args) { - shader_source += fmt::format(text, std::forward<Args>(args)...); + shader_source += fmt::format(fmt::runtime(text), std::forward<Args>(args)...); } template <typename... Args> diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 3f4532ca7..3b00614e7 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -202,13 +202,13 @@ Device::Device() { LOG_ERROR(Render_OpenGL, "OpenGL 4.6 is not available"); throw std::runtime_error{"Insufficient version"}; } - const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); + vendor_name = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); - const bool is_nvidia = vendor == "NVIDIA Corporation"; - const bool is_amd = vendor == "ATI Technologies Inc."; - const bool is_intel = vendor == "Intel"; + const bool is_nvidia = vendor_name == "NVIDIA Corporation"; + const bool is_amd = vendor_name == "ATI Technologies Inc."; + const bool is_intel = vendor_name == "Intel"; #ifdef __unix__ const bool is_linux = true; @@ -275,6 +275,56 @@ Device::Device() { } } +std::string Device::GetVendorName() const { + if (vendor_name == "NVIDIA Corporation") { + return "NVIDIA"; + } + if (vendor_name == "ATI Technologies Inc.") { + return "AMD"; + } + if (vendor_name == "Intel") { + // For Mesa, `Intel` is an overloaded vendor string that could mean crocus or iris. + // Simply return `INTEL` for those as well as the Windows driver. + return "INTEL"; + } + if (vendor_name == "Intel Open Source Technology Center") { + return "I965"; + } + if (vendor_name == "Mesa Project") { + return "I915"; + } + if (vendor_name == "Mesa/X.org") { + // This vendor string is overloaded between llvmpipe, softpipe, and virgl, so just return + // MESA instead of one of those driver names. + return "MESA"; + } + if (vendor_name == "AMD") { + return "RADEONSI"; + } + if (vendor_name == "nouveau") { + return "NOUVEAU"; + } + if (vendor_name == "X.Org") { + return "R600"; + } + if (vendor_name == "Collabora Ltd") { + return "ZINK"; + } + if (vendor_name == "Intel Corporation") { + return "OPENSWR"; + } + if (vendor_name == "Microsoft Corporation") { + return "D3D12"; + } + if (vendor_name == "NVIDIA") { + // Mesa's tegra driver reports `NVIDIA`. Only present in this list because the default + // strategy would have returned `NVIDIA` here for this driver, the same result as the + // proprietary driver. 
+ return "TEGRA"; + } + return vendor_name; +} + Device::Device(std::nullptr_t) { max_uniform_buffers.fill(std::numeric_limits<u32>::max()); uniform_buffer_alignment = 4; diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index f24bd0c7b..2c2b13767 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -22,6 +22,8 @@ public: explicit Device(); explicit Device(std::nullptr_t); + [[nodiscard]] std::string GetVendorName() const; + u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { return max_uniform_buffers[static_cast<std::size_t>(shader_type)]; } @@ -130,6 +132,7 @@ private: static bool TestVariableAoffi(); static bool TestPreciseBug(); + std::string vendor_name; std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; size_t uniform_buffer_alignment{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f87bb269b..eb8bdaa85 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -351,7 +351,7 @@ void RasterizerOpenGL::SetupShaders(bool is_indexed) { } } -void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, +void RasterizerOpenGL::LoadDiskResources(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { shader_cache.LoadDiskCache(title_id, stop_loading, callback); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 76298517f..9995a563b 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -94,7 +94,7 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + void LoadDiskResources(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; /// Returns true when there are commands queued to the OpenGL server. 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 5cf7cd151..5a01c59ec 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -331,7 +331,7 @@ ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer_, ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; -void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, +void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { disk_cache.BindTitleID(title_id); const std::optional transferable = disk_cache.LoadTransferable(); @@ -372,7 +372,7 @@ void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop const auto scope = context->Acquire(); for (std::size_t i = begin; i < end; ++i) { - if (stop_loading) { + if (stop_loading.stop_requested()) { return; } const auto& entry = (*transferable)[i]; @@ -435,7 +435,7 @@ void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop precompiled_cache_altered = true; return; } - if (stop_loading) { + if (stop_loading.stop_requested()) { return; } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 2aed0697e..b30308b6f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -127,7 +127,7 @@ public: ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game - void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, + void LoadDiskCache(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index ac78d344c..9c28498e8 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -96,7 +96,7 @@ public: // etc). template <typename... Args> void AddLine(std::string_view text, Args&&... 
args) { - AddExpression(fmt::format(text, std::forward<Args>(args)...)); + AddExpression(fmt::format(fmt::runtime(text), std::forward<Args>(args)...)); AddNewLine(); } diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 6dbb6bfba..2e67922a6 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -12,12 +12,15 @@ #include <glad/glad.h> #include "common/common_types.h" +#include "common/literals.h" #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { +using namespace Common::Literals; + class StreamBuffer { - static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024; + static constexpr size_t STREAM_BUFFER_SIZE = 64_MiB; static constexpr size_t NUM_SYNCS = 16; static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; static constexpr size_t MAX_ALIGNMENT = 256; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index ffe9edc1b..23948feed 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -9,6 +9,8 @@ #include <glad/glad.h> +#include "common/settings.h" + #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" @@ -307,7 +309,9 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info) { - return !runtime.HasNativeASTC() && IsPixelFormatASTC(info.format); + if (IsPixelFormatASTC(info.format)) { + return !runtime.HasNativeASTC() && Settings::values.accelerate_astc.GetValue(); + } // Disable other accelerated uploads for now as they don't implement swizzled uploads return false; switch (info.type) { @@ -733,6 +737,8 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, } } +Image::~Image() = default; + void Image::UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies) { glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index df8be12ff..25fe61566 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -143,6 +143,14 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + ~Image(); + + Image(const Image&) = delete; + Image& operator=(const Image&) = delete; + + Image(Image&&) = default; + Image& operator=(Image&&) = default; + void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); @@ -235,6 +243,7 @@ struct TextureCacheParams { static constexpr bool ENABLE_VALIDATION = true; static constexpr bool FRAMEBUFFER_BLITS = true; static constexpr bool HAS_EMULATED_COPIES = true; + static constexpr bool HAS_DEVICE_MEMORY_INFO = false; using Runtime = OpenGL::TextureCacheRuntime; using Image = OpenGL::Image; diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index cc19a110f..0b66f8332 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -70,6 +70,10 @@ public: return 
&rasterizer; } + [[nodiscard]] std::string GetDeviceVendor() const override { + return device.GetVendorName(); + } + private: /// Initializes the OpenGL state and creates persistent objects. void InitOpenGLObjects(); diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 47fddcb6e..abaf1ee6a 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -69,7 +69,8 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) swizzle_table_buffer.Create(); astc_buffer.Create(); glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); - glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_BUFFER_DATA), &ASTC_BUFFER_DATA, 0); + glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_ENCODINGS_VALUES), &ASTC_ENCODINGS_VALUES, + 0); } UtilShaders::~UtilShaders() = default; @@ -79,12 +80,6 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; static constexpr GLuint BINDING_INPUT_BUFFER = 1; static constexpr GLuint BINDING_ENC_BUFFER = 2; - - static constexpr GLuint BINDING_6_TO_8_BUFFER = 3; - static constexpr GLuint BINDING_7_TO_8_BUFFER = 4; - static constexpr GLuint BINDING_8_TO_8_BUFFER = 5; - static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; - static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; const Extent2D tile_size{ @@ -93,21 +88,7 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, }; program_manager.BindHostCompute(astc_decoder_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, encoding_values), - sizeof(AstcBufferData::encoding_values)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_6_to_8), - sizeof(AstcBufferData::replicate_6_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_7_to_8), - sizeof(AstcBufferData::replicate_7_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_8_to_8), - sizeof(AstcBufferData::replicate_8_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_byte_to_16), - sizeof(AstcBufferData::replicate_byte_to_16)); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(1, tile_size.width, tile_size.height); @@ -137,6 +118,12 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); } + // Precautionary barrier to ensure the compute shader is done decoding prior to texture access. 
+ // GL_TEXTURE_FETCH_BARRIER_BIT and GL_SHADER_IMAGE_ACCESS_BARRIER_BIT are used in a separate + // glMemoryBarrier call by the texture cache runtime + glMemoryBarrier(GL_UNIFORM_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | GL_PIXEL_BUFFER_BARRIER_BIT | + GL_TEXTURE_UPDATE_BARRIER_BIT | GL_BUFFER_UPDATE_BARRIER_BIT | + GL_SHADER_STORAGE_BARRIER_BIT | GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT); program_manager.RestoreGuestCompute(); } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 72071316c..d7d17e110 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -47,6 +47,10 @@ public: return &rasterizer; } + [[nodiscard]] std::string GetDeviceVendor() const override { + return device.GetDriverName(); + } + private: void Report() const; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 8cb65e588..0df4e1a1c 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -55,8 +55,9 @@ size_t BytesPerIndex(VkIndexType index_type) { template <typename T> std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) { std::array<T, 6> indices{0, 1, 2, 0, 2, 3}; - std::ranges::transform(indices, indices.begin(), - [quad, first](u32 index) { return first + index + quad * 4; }); + for (T& index : indices) { + index = static_cast<T>(first + index + quad * 4); + } return indices; } } // Anonymous namespace diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index e11406e58..205cd3b05 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -30,19 +30,16 @@ namespace Vulkan { using Tegra::Texture::SWIZZLE_TABLE; -using Tegra::Texture::ASTC::EncodingsValues; +using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES; using namespace Tegra::Texture::ASTC; namespace { constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; constexpr u32 ASTC_BINDING_ENC_BUFFER = 1; -constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 2; -constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 3; -constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 4; -constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 5; -constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 6; -constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 7; +constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2; +constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3; +constexpr size_t ASTC_NUM_BINDINGS = 4; VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { return { @@ -71,7 +68,7 @@ std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBinding }}; } -std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() { +std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> BuildASTCDescriptorSetBindings() { return {{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -88,34 +85,6 @@ std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() { .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_6_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = ASTC_BINDING_7_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = 
ASTC_BINDING_8_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { .binding = ASTC_BINDING_SWIZZLE_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, @@ -143,7 +112,8 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { }; } -std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateTemplateEntry() { +std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS> +BuildASTCPassDescriptorUpdateTemplateEntry() { return {{ { .dstBinding = ASTC_BINDING_INPUT_BUFFER, @@ -162,38 +132,6 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT .stride = sizeof(DescriptorUpdateEntry), }, { - .dstBinding = ASTC_BINDING_6_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_6_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_7_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_7_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_8_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_8_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_BYTE_TO_16_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_BYTE_TO_16_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, .dstArrayElement = 0, .descriptorCount = 1, @@ -222,15 +160,6 @@ struct AstcPushConstants { u32 block_height_mask; }; -struct AstcBufferData { - decltype(SWIZZLE_TABLE) swizzle_table_buffer = SWIZZLE_TABLE; - decltype(EncodingsValues) encoding_values = EncodingsValues; - decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE; - decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE; - decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE; - decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; -} constexpr ASTC_BUFFER_DATA; - } // Anonymous namespace VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, @@ -423,7 +352,7 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, ASTCDecoderPass::~ASTCDecoderPass() = default; void ASTCDecoderPass::MakeDataBuffer() { - constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_BUFFER_DATA) + sizeof(SWIZZLE_TABLE); + constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE); data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, @@ -437,9 +366,10 @@ void ASTCDecoderPass::MakeDataBuffer() { data_buffer_commit = 
memory_allocator.Commit(data_buffer, MemoryUsage::Upload); const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); - std::memcpy(staging_ref.mapped_span.data(), &ASTC_BUFFER_DATA, sizeof(ASTC_BUFFER_DATA)); + std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES, + sizeof(ASTC_ENCODINGS_VALUES)); // Tack on the swizzle table at the end of the buffer - std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_BUFFER_DATA), &SWIZZLE_TABLE, + std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE)); scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, @@ -509,18 +439,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, encoding_values), - sizeof(AstcBufferData::encoding_values)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_6_to_8), - sizeof(AstcBufferData::replicate_6_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_7_to_8), - sizeof(AstcBufferData::replicate_7_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_8_to_8), - sizeof(AstcBufferData::replicate_8_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, replicate_byte_to_16), - sizeof(AstcBufferData::replicate_byte_to_16)); - update_descriptor_queue.AddBuffer(*data_buffer, sizeof(AstcBufferData), + update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES)); + update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES), sizeof(SWIZZLE_TABLE)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); @@ -569,6 +489,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier); }); + scheduler.Finish(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index db78ce3d9..6852c11b0 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -2,8 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <atomic> -#include <chrono> +#include <thread> #include "common/settings.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" @@ -12,8 +11,6 @@ namespace Vulkan { -using namespace std::chrono_literals; - MasterSemaphore::MasterSemaphore(const Device& device) { static constexpr VkSemaphoreTypeCreateInfoKHR semaphore_type_ci{ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR, @@ -34,9 +31,9 @@ MasterSemaphore::MasterSemaphore(const Device& device) { // Validation layers have a bug where they fail to track resource usage when using timeline // semaphores and synchronizing with GetSemaphoreCounterValueKHR. To workaround this issue, have // a separate thread waiting for each timeline semaphore value. 
- debug_thread = std::thread([this] { + debug_thread = std::jthread([this](std::stop_token stop_token) { u64 counter = 0; - while (!shutdown) { + while (!stop_token.stop_requested()) { if (semaphore.Wait(counter, 10'000'000)) { ++counter; } @@ -44,13 +41,6 @@ MasterSemaphore::MasterSemaphore(const Device& device) { }); } -MasterSemaphore::~MasterSemaphore() { - shutdown = true; - - // This thread might not be started - if (debug_thread.joinable()) { - debug_thread.join(); - } -} +MasterSemaphore::~MasterSemaphore() = default; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 4b6d64daa..ee3cd35d0 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -65,11 +65,10 @@ public: } private: - vk::Semaphore semaphore; ///< Timeline semaphore. - std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. - std::atomic<u64> current_tick{1}; ///< Current logical tick. - std::atomic<bool> shutdown{false}; ///< True when the object is being destroyed. - std::thread debug_thread; ///< Debug thread to workaround validation layer bugs. + vk::Semaphore semaphore; ///< Timeline semaphore. + std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. + std::atomic<u64> current_tick{1}; ///< Current logical tick. + std::jthread debug_thread; ///< Debug thread to workaround validation layer bugs. }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 7a1232497..0412b5234 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -12,6 +12,7 @@ #include "common/assert.h" #include "common/bit_util.h" #include "common/common_types.h" +#include "common/literals.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/vulkan_common/vulkan_device.h" @@ -19,12 +20,15 @@ namespace Vulkan { namespace { + +using namespace Common::Literals; + // Maximum potential alignment of a Vulkan buffer constexpr VkDeviceSize MAX_ALIGNMENT = 256; // Maximum size to put elements in the stream buffer -constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024; +constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB; // Stream buffer size in bytes -constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024; +constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB; constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; constexpr VkMemoryPropertyFlags HOST_FLAGS = diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index a09fe084e..7b4875d0e 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -10,6 +10,7 @@ #include "common/alignment.h" #include "common/assert.h" +#include "common/literals.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/vulkan_common/vulkan_device.h" @@ -19,6 +20,8 @@ namespace Vulkan { namespace { +using namespace Common::Literals; + constexpr VkBufferUsageFlags BUFFER_USAGE = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | 
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; @@ -26,7 +29,7 @@ constexpr VkBufferUsageFlags BUFFER_USAGE = constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; -constexpr u64 PREFERRED_STREAM_BUFFER_SIZE = 256 * 1024 * 1024; +constexpr u64 PREFERRED_STREAM_BUFFER_SIZE = 256_MiB; /// Find a memory type with the passed requirements std::optional<u32> FindMemoryType(const VkPhysicalDeviceMemoryProperties& properties, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index bdd0ce8bc..a2ab4d1ee 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -8,6 +8,7 @@ #include <vector> #include "common/bit_cast.h" +#include "common/settings.h" #include "video_core/engines/fermi_2d.h" #include "video_core/renderer_vulkan/blit_image.h" @@ -817,6 +818,10 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src, }); } +u64 TextureCacheRuntime::GetDeviceLocalMemory() const { + return device.GetDeviceLocalMemory(); +} + Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, @@ -828,7 +833,11 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); } if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { - flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; + if (Settings::values.accelerate_astc.GetValue()) { + flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; + } else { + flags |= VideoCommon::ImageFlagBits::Converted; + } } if (runtime.device.HasDebuggingToolAttached()) { if (image) { @@ -871,6 +880,8 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ } } +Image::~Image() = default; + void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { // TODO: Move this to another API scheduler->RequestOutsideRenderPassOperationContext(); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 4a57d378b..172bcdf98 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -97,6 +97,8 @@ struct TextureCacheRuntime { // All known Vulkan drivers can natively handle BGR textures return true; } + + u64 GetDeviceLocalMemory() const; }; class Image : public VideoCommon::ImageBase { @@ -104,6 +106,14 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + ~Image(); + + Image(const Image&) = delete; + Image& operator=(const Image&) = delete; + + Image(Image&&) = default; + Image& operator=(Image&&) = default; + void UploadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferImageCopy> copies); @@ -257,6 +267,7 @@ struct TextureCacheParams { static constexpr bool ENABLE_VALIDATION = true; static constexpr bool FRAMEBUFFER_BLITS = false; static constexpr bool HAS_EMULATED_COPIES = false; + static constexpr bool HAS_DEVICE_MEMORY_INFO = true; using Runtime = Vulkan::TextureCacheRuntime; using Image = Vulkan::Image; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 6308aef94..eb1746265 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -283,4 +283,11 @@ 
std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; } +u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format) { + constexpr u64 RGBA8_PIXEL_SIZE = 4; + const u64 base_block_size = static_cast<u64>(DefaultBlockWidth(format)) * + static_cast<u64>(DefaultBlockHeight(format)) * RGBA8_PIXEL_SIZE; + return (base_size * base_block_size) / BytesPerBlock(format); +} + } // namespace VideoCore::Surface diff --git a/src/video_core/surface.h b/src/video_core/surface.h index c40ab89d0..1503db81f 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -462,4 +462,6 @@ bool IsPixelFormatSRGB(PixelFormat format); std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); +u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format); + } // namespace VideoCore::Surface diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index 9914926b3..ad69d32d1 100644 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -113,6 +113,43 @@ void ImageBase::InsertView(const ImageViewInfo& view_info, ImageViewId image_vie image_view_ids.push_back(image_view_id); } +bool ImageBase::IsSafeDownload() const noexcept { + // Skip images that were not modified from the GPU + if (False(flags & ImageFlagBits::GpuModified)) { + return false; + } + // Skip images that .are. modified from the CPU + // We don't want to write sensitive data from the guest + if (True(flags & ImageFlagBits::CpuModified)) { + return false; + } + if (info.num_samples > 1) { + LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented"); + return false; + } + return true; +} + +void ImageBase::CheckBadOverlapState() { + if (False(flags & ImageFlagBits::BadOverlap)) { + return; + } + if (!overlapping_images.empty()) { + return; + } + flags &= ~ImageFlagBits::BadOverlap; +} + +void ImageBase::CheckAliasState() { + if (False(flags & ImageFlagBits::Alias)) { + return; + } + if (!aliased_images.empty()) { + return; + } + flags &= ~ImageFlagBits::Alias; +} + void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id) { static constexpr auto OPTIONS = RelaxedOptions::Size | RelaxedOptions::Format; ASSERT(lhs.info.type == rhs.info.type); diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index b7f3b7e43..e326cab71 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -25,6 +25,12 @@ enum class ImageFlagBits : u32 { Strong = 1 << 5, ///< Exists in the image table, the dimensions are can be trusted Registered = 1 << 6, ///< True when the image is registered Picked = 1 << 7, ///< Temporary flag to mark the image as picked + + // Garbage Collection Flags + BadOverlap = 1 << 8, ///< This image overlaps other but doesn't fit, has higher + ///< garbage collection priority + Alias = 1 << 9, ///< This image has aliases and has priority on garbage + ///< collection }; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) @@ -44,11 +50,16 @@ struct ImageBase { void InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id); + [[nodiscard]] bool IsSafeDownload() const noexcept; + [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept { const VAddr overlap_end = overlap_cpu_addr + overlap_size; return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; } + void CheckBadOverlapState(); + void CheckAliasState(); + ImageInfo 
    u32 guest_size_bytes = 0;
@@ -72,6 +83,7 @@ struct ImageBase {
    std::vector<SubresourceBase> slice_subresources;
    std::vector<AliasedImage> aliased_images;
+    std::vector<ImageId> overlapping_images;
};

struct ImageAllocBase {
diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h
index eae3be6ea..6180b8c0e 100644
--- a/src/video_core/texture_cache/slot_vector.h
+++ b/src/video_core/texture_cache/slot_vector.h
@@ -5,6 +5,7 @@
#pragma once

#include <array>
+#include <bit>
#include <concepts>
#include <numeric>
#include <type_traits>
@@ -32,6 +33,60 @@ template <class T>
requires std::is_nothrow_move_assignable_v<T>&& std::is_nothrow_move_constructible_v<T>
class SlotVector {
public:
+    class Iterator {
+        friend SlotVector<T>;
+
+    public:
+        constexpr Iterator() = default;
+
+        Iterator& operator++() noexcept {
+            const u64* const bitset = slot_vector->stored_bitset.data();
+            const u32 size = static_cast<u32>(slot_vector->stored_bitset.size()) * 64;
+            if (id.index < size) {
+                do {
+                    ++id.index;
+                } while (id.index < size && !IsValid(bitset));
+                if (id.index == size) {
+                    id.index = SlotId::INVALID_INDEX;
+                }
+            }
+            return *this;
+        }
+
+        Iterator operator++(int) noexcept {
+            const Iterator copy{*this};
+            ++*this;
+            return copy;
+        }
+
+        bool operator==(const Iterator& other) const noexcept {
+            return id.index == other.id.index;
+        }
+
+        bool operator!=(const Iterator& other) const noexcept {
+            return id.index != other.id.index;
+        }
+
+        std::pair<SlotId, T*> operator*() const noexcept {
+            return {id, std::addressof((*slot_vector)[id])};
+        }
+
+        T* operator->() const noexcept {
+            return std::addressof((*slot_vector)[id]);
+        }
+
+    private:
+        Iterator(SlotVector<T>* slot_vector_, SlotId id_) noexcept
+            : slot_vector{slot_vector_}, id{id_} {}
+
+        bool IsValid(const u64* bitset) const noexcept {
+            return ((bitset[id.index / 64] >> (id.index % 64)) & 1) != 0;
+        }
+
+        SlotVector<T>* slot_vector;
+        SlotId id;
+    };
+
    ~SlotVector() noexcept {
        size_t index = 0;
        for (u64 bits : stored_bitset) {
@@ -70,6 +125,20 @@ public:
        ResetStorageBit(id.index);
    }

+    [[nodiscard]] Iterator begin() noexcept {
+        const auto it = std::ranges::find_if(stored_bitset, [](u64 value) { return value != 0; });
+        if (it == stored_bitset.end()) {
+            return end();
+        }
+        const u32 word_index = static_cast<u32>(std::distance(stored_bitset.begin(), it));
+        const SlotId first_id{word_index * 64 + static_cast<u32>(std::countr_zero(*it))};
+        return Iterator(this, first_id);
+    }
+
+    [[nodiscard]] Iterator end() noexcept {
+        return Iterator(this, SlotId{SlotId::INVALID_INDEX});
+    }
+
private:
    struct NonTrivialDummy {
        NonTrivialDummy() noexcept {}
@@ -140,7 +209,6 @@ private:
    Entry* values = nullptr;
    size_t values_capacity = 0;
-    size_t values_size = 0;

    std::vector<u64> stored_bitset;
    std::vector<u32> free_list;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 59b7c678b..c7cfd02b6 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -19,9 +19,10 @@
#include <boost/container/small_vector.hpp>

#include "common/alignment.h"
-#include "common/common_funcs.h"
#include "common/common_types.h"
+#include "common/literals.h"
#include "common/logging/log.h"
+#include "common/settings.h"
#include "video_core/compatible_formats.h"
#include "video_core/delayed_destruction_ring.h"
#include "video_core/dirty_flags.h"
@@ -57,6 +58,7 @@ using VideoCore::Surface::PixelFormat;
using VideoCore::Surface::PixelFormatFromDepthFormat;
using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
using VideoCore::Surface::SurfaceType;
+using namespace Common::Literals;

template <class P>
class TextureCache {
@@ -69,12 +71,17 @@ class TextureCache {
    static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS;
    /// True when some copies have to be emulated
    static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES;
+    /// True when the API can provide info about the memory of the device.
+    static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO;

    /// Image view ID for null descriptors
    static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0};
    /// Sampler ID for bugged sampler ids
    static constexpr SamplerId NULL_SAMPLER_ID{0};

+    static constexpr u64 DEFAULT_EXPECTED_MEMORY = 1_GiB;
+    static constexpr u64 DEFAULT_CRITICAL_MEMORY = 2_GiB;
+
    using Runtime = typename P::Runtime;
    using Image = typename P::Image;
    using ImageAlloc = typename P::ImageAlloc;
@@ -197,6 +204,9 @@ private:
        }
    }

+    /// Runs the Garbage Collector.
+    void RunGarbageCollector();
+
    /// Fills image_view_ids in the image views in indices
    void FillImageViews(DescriptorTable<TICEntry>& table,
                        std::span<ImageViewId> cached_image_view_ids, std::span<const u32> indices,
@@ -333,6 +343,10 @@ private:
    std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> page_table;

    bool has_deleted_images = false;
+    u64 total_used_memory = 0;
+    u64 minimum_memory;
+    u64 expected_memory;
+    u64 critical_memory;

    SlotVector<Image> slot_images;
    SlotVector<ImageView> slot_image_views;
@@ -353,6 +367,7 @@ private:

    u64 modification_tick = 0;
    u64 frame_tick = 0;
+    typename SlotVector<Image>::Iterator deletion_iterator;
};

template <class P>
@@ -373,11 +388,94 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
    // This way the null resource becomes a compile time constant
    void(slot_image_views.insert(runtime, NullImageParams{}));
    void(slot_samplers.insert(runtime, sampler_descriptor));
+
+    deletion_iterator = slot_images.begin();
+
+    if constexpr (HAS_DEVICE_MEMORY_INFO) {
+        const auto device_memory = runtime.GetDeviceLocalMemory();
+        const u64 possible_expected_memory = (device_memory * 3) / 10;
+        const u64 possible_critical_memory = (device_memory * 6) / 10;
+        expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY);
+        critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY);
+        minimum_memory = 0;
+    } else {
+        // On OpenGL we can be more conservative, as the driver takes care of memory management.
+        expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
+        critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
+        minimum_memory = expected_memory;
+    }
+}
+
+template <class P>
+void TextureCache<P>::RunGarbageCollector() {
+    const bool high_priority_mode = total_used_memory >= expected_memory;
+    const bool aggressive_mode = total_used_memory >= critical_memory;
+    const u64 ticks_to_destroy = high_priority_mode ? 60 : 100;
+    int num_iterations = aggressive_mode ? 256 : (high_priority_mode ? 128 : 64);
+    for (; num_iterations > 0; --num_iterations) {
+        if (deletion_iterator == slot_images.end()) {
+            deletion_iterator = slot_images.begin();
+            if (deletion_iterator == slot_images.end()) {
+                break;
+            }
+        }
+        auto [image_id, image_tmp] = *deletion_iterator;
+        Image* image = image_tmp; // Workaround: structured bindings cannot be captured by the
+                                  // lambdas below (Clang rejects the capture)
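Taken together with the constructor above, the collector is tiered: as total_used_memory crosses the expected and critical watermarks, each pass scans more slots and accepts younger images for eviction. A minimal standalone sketch of that tuning (names here are illustrative only, not part of the yuzu codebase):

    #include <cstdint>
    #include <utility>

    // Returns {slots scanned per pass, minimum image age in frame ticks}.
    std::pair<int, std::uint64_t> TuneGcPass(std::uint64_t used, std::uint64_t expected,
                                             std::uint64_t critical) {
        const bool high_priority = used >= expected; // high priority tier
        const bool aggressive = used >= critical;    // aggressive tier
        const std::uint64_t min_age = high_priority ? 60 : 100;
        const int num_scans = aggressive ? 256 : (high_priority ? 128 : 64);
        return {num_scans, min_age};
    }

The loop body that follows refines the minimum age further, dividing it by 16 for badly overlapping images and by 2 for other priority images when in aggressive mode.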
+ const bool is_alias = True(image->flags & ImageFlagBits::Alias); + const bool is_bad_overlap = True(image->flags & ImageFlagBits::BadOverlap); + const bool must_download = image->IsSafeDownload(); + bool should_care = is_bad_overlap || is_alias || (high_priority_mode && !must_download); + const u64 ticks_needed = + is_bad_overlap + ? ticks_to_destroy >> 4 + : ((should_care && aggressive_mode) ? ticks_to_destroy >> 1 : ticks_to_destroy); + should_care |= aggressive_mode; + if (should_care && image->frame_tick + ticks_needed < frame_tick) { + if (is_bad_overlap) { + const bool overlap_check = std::ranges::all_of( + image->overlapping_images, [&, image](const ImageId& overlap_id) { + auto& overlap = slot_images[overlap_id]; + return overlap.frame_tick >= image->frame_tick; + }); + if (!overlap_check) { + ++deletion_iterator; + continue; + } + } + if (!is_bad_overlap && must_download) { + const bool alias_check = std::ranges::none_of( + image->aliased_images, [&, image](const AliasedImage& alias) { + auto& alias_image = slot_images[alias.id]; + return (alias_image.frame_tick < image->frame_tick) || + (alias_image.modification_tick < image->modification_tick); + }); + + if (alias_check) { + auto map = runtime.DownloadStagingBuffer(image->unswizzled_size_bytes); + const auto copies = FullDownloadCopies(image->info); + image->DownloadMemory(map, copies); + runtime.Finish(); + SwizzleImage(gpu_memory, image->gpu_addr, image->info, copies, map.mapped_span); + } + } + if (True(image->flags & ImageFlagBits::Tracked)) { + UntrackImage(*image); + } + UnregisterImage(image_id); + DeleteImage(image_id); + if (is_bad_overlap) { + ++num_iterations; + } + } + ++deletion_iterator; + } } template <class P> void TextureCache<P>::TickFrame() { - // Tick sentenced resources in this order to ensure they are destroyed in the right order + if (Settings::values.use_caches_gc.GetValue() && total_used_memory > minimum_memory) { + RunGarbageCollector(); + } sentenced_images.Tick(); sentenced_framebuffers.Tick(); sentenced_image_view.Tick(); @@ -568,17 +666,7 @@ template <class P> void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { std::vector<ImageId> images; ForEachImageInRegion(cpu_addr, size, [this, &images](ImageId image_id, ImageBase& image) { - // Skip images that were not modified from the GPU - if (False(image.flags & ImageFlagBits::GpuModified)) { - return; - } - // Skip images that .are. 
modified from the CPU
-        // We don't want to write sensitive data from the guest
-        if (True(image.flags & ImageFlagBits::CpuModified)) {
-            return;
-        }
-        if (image.info.num_samples > 1) {
-            LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented");
+        if (!image.IsSafeDownload()) {
            return;
        }
        image.flags &= ~ImageFlagBits::GpuModified;
@@ -967,6 +1055,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
    std::vector<ImageId> overlap_ids;
    std::vector<ImageId> left_aliased_ids;
    std::vector<ImageId> right_aliased_ids;
+    std::vector<ImageId> bad_overlap_ids;
    ForEachImageInRegion(cpu_addr, size_bytes, [&](ImageId overlap_id, ImageBase& overlap) {
        if (info.type != overlap.info.type) {
            return;
@@ -992,9 +1081,14 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
        const ImageBase new_image_base(new_info, gpu_addr, cpu_addr);
        if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) {
            left_aliased_ids.push_back(overlap_id);
+            overlap.flags |= ImageFlagBits::Alias;
        } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options,
                                 broken_views, native_bgr)) {
            right_aliased_ids.push_back(overlap_id);
+            overlap.flags |= ImageFlagBits::Alias;
+        } else {
+            bad_overlap_ids.push_back(overlap_id);
+            overlap.flags |= ImageFlagBits::BadOverlap;
        }
    });
    const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
@@ -1022,10 +1116,18 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
    for (const ImageId aliased_id : right_aliased_ids) {
        ImageBase& aliased = slot_images[aliased_id];
        AddImageAlias(new_image_base, aliased, new_image_id, aliased_id);
+        new_image.flags |= ImageFlagBits::Alias;
    }
    for (const ImageId aliased_id : left_aliased_ids) {
        ImageBase& aliased = slot_images[aliased_id];
        AddImageAlias(aliased, new_image_base, aliased_id, new_image_id);
+        new_image.flags |= ImageFlagBits::Alias;
+    }
+    for (const ImageId aliased_id : bad_overlap_ids) {
+        ImageBase& aliased = slot_images[aliased_id];
+        aliased.overlapping_images.push_back(new_image_id);
+        new_image.overlapping_images.push_back(aliased_id);
+        new_image.flags |= ImageFlagBits::BadOverlap;
    }
    RegisterImage(new_image_id);
    return new_image_id;
@@ -1195,6 +1297,13 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
    image.flags |= ImageFlagBits::Registered;
    ForEachPage(image.cpu_addr, image.guest_size_bytes,
                [this, image_id](u64 page) { page_table[page].push_back(image_id); });
+    u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes);
+    if ((IsPixelFormatASTC(image.info.format) &&
+         True(image.flags & ImageFlagBits::AcceleratedUpload)) ||
+        True(image.flags & ImageFlagBits::Converted)) {
+        tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
+    }
+    total_used_memory += Common::AlignUp(tentative_size, 1024);
}

template <class P>
@@ -1203,6 +1312,14 @@ void TextureCache<P>::UnregisterImage(ImageId image_id) {
    ASSERT_MSG(True(image.flags & ImageFlagBits::Registered),
               "Trying to unregister an image that is not registered");
    image.flags &= ~ImageFlagBits::Registered;
+    image.flags &= ~ImageFlagBits::BadOverlap;
+    u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes);
+    if ((IsPixelFormatASTC(image.info.format) &&
+         True(image.flags & ImageFlagBits::AcceleratedUpload)) ||
+        True(image.flags & ImageFlagBits::Converted)) {
+        tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
+    }
+    total_used_memory -= Common::AlignUp(tentative_size, 1024);
    ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) {
        const auto page_it = page_table.find(page);
        if (page_it == page_table.end()) {
@@ -1276,9 +1393,19 @@ void TextureCache<P>::DeleteImage(ImageId image_id) {
        std::erase_if(other_image.aliased_images, [image_id](const AliasedImage& other_alias) {
            return other_alias.id == image_id;
        });
+        other_image.CheckAliasState();
        ASSERT_MSG(num_removed_aliases == 1, "Invalid number of removed aliases: {}",
                   num_removed_aliases);
    }
+    for (const ImageId overlap_id : image.overlapping_images) {
+        ImageBase& other_image = slot_images[overlap_id];
+        [[maybe_unused]] const size_t num_removed_overlaps = std::erase_if(
+            other_image.overlapping_images,
+            [image_id](const ImageId other_overlap_id) { return other_overlap_id == image_id; });
+        other_image.CheckBadOverlapState();
+        ASSERT_MSG(num_removed_overlaps == 1, "Invalid number of removed overlaps: {}",
+                   num_removed_overlaps);
+    }
    for (const ImageViewId image_view_id : image_view_ids) {
        sentenced_image_view.Push(std::move(slot_image_views[image_view_id]));
        slot_image_views.erase(image_view_id);
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 906604a39..4efe042b6 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -47,6 +47,7 @@
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/util.h"
+#include "video_core/textures/astc.h"
#include "video_core/textures/decoders.h"

namespace VideoCommon {
@@ -580,6 +581,8 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
    for (s32 layer = 0; layer < info.resources.layers; ++layer) {
        const std::span<const u8> src = input.subspan(host_offset);
+        gpu_memory.ReadBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes());
+
        SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height,
                       num_tiles.depth, block.height, block.depth);
@@ -884,8 +887,16 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
        ASSERT(copy.image_extent == mip_size);
        ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width));
        ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height));
-        DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent,
-                      output.subspan(output_offset));
+        if (IsPixelFormatASTC(info.format)) {
+            ASSERT(copy.image_extent.depth == 1);
+            Tegra::Texture::ASTC::Decompress(input.subspan(copy.buffer_offset),
+                                             copy.image_extent.width, copy.image_extent.height,
+                                             copy.image_subresource.num_layers, tile_size.width,
+                                             tile_size.height, output.subspan(output_offset));
+        } else {
+            DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent,
+                          output.subspan(output_offset));
+        }
        copy.buffer_offset = output_offset;
        copy.buffer_row_length = mip_size.width;
        copy.buffer_image_height = mip_size.height;
@@ -1087,7 +1098,15 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
        return std::nullopt;
    }
    const ImageInfo& existing = image.info;
-    if (False(options & RelaxedOptions::Format)) {
+    if (True(options & RelaxedOptions::Format)) {
+        // Format checking is relaxed, but we still have to check for matching bytes per block.
+        // This avoids creating a view for blits on UE4 titles where formats with different bytes
+        // per block are aliased.
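For instance, viewing an 8-bytes-per-texel image such as R16G16B16A16_FLOAT through a 4-bytes-per-texel RGBA8 format would make every texel fetch read the wrong address, so the relaxed path still rejects the pair. The guard that follows reduces to a simple footprint comparison; a standalone restatement (not code from this diff, and assuming BytesPerBlock from video_core/surface.h, which for uncompressed formats is just the byte size of one texel):

    bool SameTexelFootprint(VideoCore::Surface::PixelFormat existing,
                            VideoCore::Surface::PixelFormat candidate) {
        using VideoCore::Surface::BytesPerBlock;
        return BytesPerBlock(existing) == BytesPerBlock(candidate);
    }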
+        if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) {
+            return std::nullopt;
+        }
+    } else {
+        // Format compatibility is not relaxed; ensure we are creating a view on a compatible format
        if (!IsViewCompatible(existing.format, candidate.format, broken_views, native_bgr)) {
            return std::nullopt;
        }
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
new file mode 100644
index 000000000..7b756ba41
--- /dev/null
+++ b/src/video_core/textures/astc.cpp
@@ -0,0 +1,1579 @@
+// Copyright 2016 The University of North Carolina at Chapel Hill
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Please send all BUG REPORTS to <pavel@cs.unc.edu>.
+// <http://gamma.cs.unc.edu/FasTC/>
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <span>
+#include <vector>
+
+#include <boost/container/static_vector.hpp>
+
+#include "common/common_types.h"
+#include "video_core/textures/astc.h"
+
+class InputBitStream {
+public:
+    constexpr explicit InputBitStream(std::span<const u8> data, size_t start_offset = 0)
+        : cur_byte{data.data()}, total_bits{data.size()}, next_bit{start_offset % 8} {}
+
+    constexpr size_t GetBitsRead() const {
+        return bits_read;
+    }
+
+    constexpr bool ReadBit() {
+        if (bits_read >= total_bits * 8) {
+            return false;
+        }
+        const bool bit = ((*cur_byte >> next_bit) & 1) != 0;
+        ++next_bit;
+        while (next_bit >= 8) {
+            next_bit -= 8;
+            ++cur_byte;
+        }
+        ++bits_read;
+        return bit;
+    }
+
+    constexpr u32 ReadBits(std::size_t nBits) {
+        u32 ret = 0;
+        for (std::size_t i = 0; i < nBits; ++i) {
+            ret |= (ReadBit() & 1) << i;
+        }
+        return ret;
+    }
+
+    template <std::size_t nBits>
+    constexpr u32 ReadBits() {
+        u32 ret = 0;
+        for (std::size_t i = 0; i < nBits; ++i) {
+            ret |= (ReadBit() & 1) << i;
+        }
+        return ret;
+    }
+
+private:
+    const u8* cur_byte;
+    size_t total_bits = 0;
+    size_t next_bit = 0;
+    size_t bits_read = 0;
+};
+
+class OutputBitStream {
+public:
+    constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
+        : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
+
+    constexpr std::size_t GetBitsWritten() const {
+        return bits_written;
+    }
+
+    constexpr void WriteBitsR(u32 val, u32 nBits) {
+        for (u32 i = 0; i < nBits; i++) {
+            WriteBit((val >> (nBits - i - 1)) & 1);
+        }
+    }
+
+    constexpr void WriteBits(u32 val, u32 nBits) {
+        for (u32 i = 0; i < nBits; i++) {
+            WriteBit((val >> i) & 1);
+        }
+    }
+
+private:
+    constexpr void WriteBit(bool b) {
+        if (bits_written >= num_bits) {
+            return;
+        }
+
+        const u32 mask = 1 << next_bit++;
+
+        // clear the bit
+        *cur_byte &= static_cast<u8>(~mask);
+
+        // Write the bit, if necessary
+        if (b)
+            *cur_byte |= static_cast<u8>(mask);
+
+        // Next byte?
+        if (next_bit >= 8) {
+            cur_byte += 1;
+            next_bit = 0;
+        }
+    }
+
+    u8* cur_byte;
+    std::size_t num_bits;
+    std::size_t bits_written = 0;
+    std::size_t next_bit = 0;
+};
+
+template <typename IntType>
+class Bits {
+public:
+    explicit Bits(const IntType& v) : m_Bits(v) {}
+
+    Bits(const Bits&) = delete;
+    Bits& operator=(const Bits&) = delete;
+
+    u8 operator[](u32 bitPos) const {
+        return static_cast<u8>((m_Bits >> bitPos) & 1);
+    }
+
+    IntType operator()(u32 start, u32 end) const {
+        if (start == end) {
+            return (*this)[start];
+        } else if (start > end) {
+            u32 t = start;
+            start = end;
+            end = t;
+        }
+
+        u64 mask = (1 << (end - start + 1)) - 1;
+        return (m_Bits >> start) & static_cast<IntType>(mask);
+    }
+
+private:
+    const IntType& m_Bits;
+};
+
+namespace Tegra::Texture::ASTC {
+using IntegerEncodedVector = boost::container::static_vector<
+    IntegerEncodedValue, 256,
+    boost::container::static_vector_options<
+        boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
+        boost::container::throw_on_overflow<false>>::type>;
+
+static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
+    // Implement the algorithm in section C.2.12
+    std::array<u32, 5> m;
+    std::array<u32, 5> t;
+    u32 T;
+
+    // Read the trit encoded block according to
+    // table C.2.14
+    m[0] = bits.ReadBits(nBitsPerValue);
+    T = bits.ReadBits<2>();
+    m[1] = bits.ReadBits(nBitsPerValue);
+    T |= bits.ReadBits<2>() << 2;
+    m[2] = bits.ReadBits(nBitsPerValue);
+    T |= bits.ReadBit() << 4;
+    m[3] = bits.ReadBits(nBitsPerValue);
+    T |= bits.ReadBits<2>() << 5;
+    m[4] = bits.ReadBits(nBitsPerValue);
+    T |= bits.ReadBit() << 7;
+
+    u32 C = 0;
+
+    Bits<u32> Tb(T);
+    if (Tb(2, 4) == 7) {
+        C = (Tb(5, 7) << 2) | Tb(0, 1);
+        t[4] = t[3] = 2;
+    } else {
+        C = Tb(0, 4);
+        if (Tb(5, 6) == 3) {
+            t[4] = 2;
+            t[3] = Tb[7];
+        } else {
+            t[4] = Tb[7];
+            t[3] = Tb(5, 6);
+        }
+    }
+
+    Bits<u32> Cb(C);
+    if (Cb(0, 1) == 3) {
+        t[2] = 2;
+        t[1] = Cb[4];
+        t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
+    } else if (Cb(2, 3) == 3) {
+        t[2] = 2;
+        t[1] = 2;
+        t[0] = Cb(0, 1);
+    } else {
+        t[2] = Cb[4];
+        t[1] = Cb(2, 3);
+        t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
+    }
+
+    for (std::size_t i = 0; i < 5; ++i) {
+        IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue);
+        val.bit_value = m[i];
+        val.trit_value = t[i];
+    }
+}
+
+static void DecodeQuintBlock(InputBitStream& bits, IntegerEncodedVector& result,
+                             u32 nBitsPerValue) {
+    // Implement the algorithm in section C.2.12
+    u32 m[3];
+    u32 q[3];
+    u32 Q;
+
+    // Read the quint encoded block according to
+    // table C.2.15
+    m[0] = bits.ReadBits(nBitsPerValue);
+    Q = bits.ReadBits<3>();
+    m[1] = bits.ReadBits(nBitsPerValue);
+    Q |= bits.ReadBits<2>() << 3;
+    m[2] = bits.ReadBits(nBitsPerValue);
+    Q |= bits.ReadBits<2>() << 5;
+
+    Bits<u32> Qb(Q);
+    if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
+        q[0] = q[1] = 4;
+        q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
+    } else {
+        u32 C = 0;
+        if (Qb(1, 2) == 3) {
+            q[2] = 4;
+            C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
+        } else {
+            q[2] = Qb(5, 6);
+            C = Qb(0, 4);
+        }
+
+        Bits<u32> Cb(C);
+        if (Cb(0, 2) == 5) {
+            q[1] = 4;
+            q[0] = Cb(3, 4);
+        } else {
+            q[1] = Cb(3, 4);
+            q[0] = Cb(0, 2);
+        }
+    }
+
+    for (std::size_t i = 0; i < 3; ++i) {
+        IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Quint, nBitsPerValue);
+        val.bit_value = m[i];
+        val.quint_value = q[i];
+    }
+}
+
+// Fills result with the values that are encoded in the given
+// bitstream.
We must know beforehand what the maximum possible +// value is, and how many values we're decoding. +static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange, + u32 nValues) { + // Determine encoding parameters + IntegerEncodedValue val = ASTC_ENCODINGS_VALUES[maxRange]; + + // Start decoding + u32 nValsDecoded = 0; + while (nValsDecoded < nValues) { + switch (val.encoding) { + case IntegerEncoding::Quint: + DecodeQuintBlock(bits, result, val.num_bits); + nValsDecoded += 3; + break; + + case IntegerEncoding::Trit: + DecodeTritBlock(bits, result, val.num_bits); + nValsDecoded += 5; + break; + + case IntegerEncoding::JustBits: + val.bit_value = bits.ReadBits(val.num_bits); + result.push_back(val); + nValsDecoded++; + break; + } + } +} + +struct TexelWeightParams { + u32 m_Width = 0; + u32 m_Height = 0; + bool m_bDualPlane = false; + u32 m_MaxWeight = 0; + bool m_bError = false; + bool m_bVoidExtentLDR = false; + bool m_bVoidExtentHDR = false; + + u32 GetPackedBitSize() const { + // How many indices do we have? + u32 nIdxs = m_Height * m_Width; + if (m_bDualPlane) { + nIdxs *= 2; + } + + return ASTC_ENCODINGS_VALUES[m_MaxWeight].GetBitLength(nIdxs); + } + + u32 GetNumWeightValues() const { + u32 ret = m_Width * m_Height; + if (m_bDualPlane) { + ret *= 2; + } + return ret; + } +}; + +static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { + TexelWeightParams params; + + // Read the entire block mode all at once + u16 modeBits = static_cast<u16>(strm.ReadBits<11>()); + + // Does this match the void extent block mode? + if ((modeBits & 0x01FF) == 0x1FC) { + if (modeBits & 0x200) { + params.m_bVoidExtentHDR = true; + } else { + params.m_bVoidExtentLDR = true; + } + + // Next two bits must be one. + if (!(modeBits & 0x400) || !strm.ReadBit()) { + params.m_bError = true; + } + + return params; + } + + // First check if the last four bits are zero + if ((modeBits & 0xF) == 0) { + params.m_bError = true; + return params; + } + + // If the last two bits are zero, then if bits + // [6-8] are all ones, this is also reserved. + if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) { + params.m_bError = true; + return params; + } + + // Otherwise, there is no error... Figure out the layout + // of the block mode. Layout is determined by a number + // between 0 and 9 corresponding to table C.2.8 of the + // ASTC spec. 
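As an aside, GetPackedBitSize above leans on IntegerEncodedValue::GetBitLength, which lives in astc.h and is not shown in this diff. What it computes is the integer sequence encoding cost from the ASTC spec (C.2.22): trits amortize to 8 extra bits per group of 5 values and quints to 7 extra bits per group of 3, on top of the plain bits. A self-contained restatement of that formula, assuming only the spec:

    #include <cstdint>

    enum class Ise { JustBits, Trit, Quint };

    // Bit cost of n ISE-encoded values with the given extra bits per value.
    constexpr std::uint32_t IseBitLength(Ise encoding, std::uint32_t bits, std::uint32_t n) {
        std::uint32_t total = n * bits;
        if (encoding == Ise::Trit) {
            total += (n * 8 + 4) / 5; // 5 trits pack into one 8-bit group
        } else if (encoding == Ise::Quint) {
            total += (n * 7 + 2) / 3; // 3 quints pack into one 7-bit group
        }
        return total;
    }

With that cost model in mind, the layout decode below picks the weight grid dimensions from table C.2.8.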
+ u32 layout = 0; + + if ((modeBits & 0x1) || (modeBits & 0x2)) { + // layout is in [0-4] + if (modeBits & 0x8) { + // layout is in [2-4] + if (modeBits & 0x4) { + // layout is in [3-4] + if (modeBits & 0x100) { + layout = 4; + } else { + layout = 3; + } + } else { + layout = 2; + } + } else { + // layout is in [0-1] + if (modeBits & 0x4) { + layout = 1; + } else { + layout = 0; + } + } + } else { + // layout is in [5-9] + if (modeBits & 0x100) { + // layout is in [7-9] + if (modeBits & 0x80) { + // layout is in [7-8] + assert((modeBits & 0x40) == 0U); + if (modeBits & 0x20) { + layout = 8; + } else { + layout = 7; + } + } else { + layout = 9; + } + } else { + // layout is in [5-6] + if (modeBits & 0x80) { + layout = 6; + } else { + layout = 5; + } + } + } + + assert(layout < 10); + + // Determine R + u32 R = !!(modeBits & 0x10); + if (layout < 5) { + R |= (modeBits & 0x3) << 1; + } else { + R |= (modeBits & 0xC) >> 1; + } + assert(2 <= R && R <= 7); + + // Determine width & height + switch (layout) { + case 0: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; + params.m_Width = B + 4; + params.m_Height = A + 2; + break; + } + + case 1: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; + params.m_Width = B + 8; + params.m_Height = A + 2; + break; + } + + case 2: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; + params.m_Width = A + 2; + params.m_Height = B + 8; + break; + } + + case 3: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x1; + params.m_Width = A + 2; + params.m_Height = B + 6; + break; + } + + case 4: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x1; + params.m_Width = B + 2; + params.m_Height = A + 2; + break; + } + + case 5: { + u32 A = (modeBits >> 5) & 0x3; + params.m_Width = 12; + params.m_Height = A + 2; + break; + } + + case 6: { + u32 A = (modeBits >> 5) & 0x3; + params.m_Width = A + 2; + params.m_Height = 12; + break; + } + + case 7: { + params.m_Width = 6; + params.m_Height = 10; + break; + } + + case 8: { + params.m_Width = 10; + params.m_Height = 6; + break; + } + + case 9: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 9) & 0x3; + params.m_Width = A + 6; + params.m_Height = B + 6; + break; + } + + default: + assert(false && "Don't know this layout..."); + params.m_bError = true; + break; + } + + // Determine whether or not we're using dual planes + // and/or high precision layouts. + bool D = (layout != 9) && (modeBits & 0x400); + bool H = (layout != 9) && (modeBits & 0x200); + + if (H) { + const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31}; + params.m_MaxWeight = maxWeights[R - 2]; + } else { + const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7}; + params.m_MaxWeight = maxWeights[R - 2]; + } + + params.m_bDualPlane = D; + + return params; +} + +static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, + u32 blockHeight) { + // Don't actually care about the void extent, just read the bits... 
+ for (s32 i = 0; i < 4; ++i) { + strm.ReadBits<13>(); + } + + // Decode the RGBA components and renormalize them to the range [0, 255] + u16 r = static_cast<u16>(strm.ReadBits<16>()); + u16 g = static_cast<u16>(strm.ReadBits<16>()); + u16 b = static_cast<u16>(strm.ReadBits<16>()); + u16 a = static_cast<u16>(strm.ReadBits<16>()); + + u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | + (static_cast<u32>(a) & 0xFF00) << 16; + + for (u32 j = 0; j < blockHeight; j++) { + for (u32 i = 0; i < blockWidth; i++) { + outBuf[j * blockWidth + i] = rgba; + } + } +} + +static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { + for (u32 j = 0; j < blockHeight; j++) { + for (u32 i = 0; i < blockWidth; i++) { + outBuf[j * blockWidth + i] = 0xFFFF00FF; + } + } +} + +static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); +static constexpr u32 ReplicateByteTo16(std::size_t value) { + return REPLICATE_BYTE_TO_16_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>(); +static constexpr u32 ReplicateBitTo7(std::size_t value) { + return REPLICATE_BIT_TO_7_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>(); +static constexpr u32 ReplicateBitTo9(std::size_t value) { + return REPLICATE_BIT_TO_9_TABLE[value]; +} + +static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>(); +static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>(); +static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); +static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); +static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); +/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback +/// to the runtime implementation +static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_8_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_8_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_8_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_8_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_8_TABLE[value]; + case 6: + return REPLICATE_6_BIT_TO_8_TABLE[value]; + case 7: + return REPLICATE_7_BIT_TO_8_TABLE[value]; + case 8: + return REPLICATE_8_BIT_TO_8_TABLE[value]; + default: + return Replicate(value, num_bits, 8); + } +} + +static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>(); +static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>(); +static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>(); +static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>(); +static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>(); +static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_6_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_6_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_6_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_6_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_6_TABLE[value]; + default: + return Replicate(value, num_bits, 6); + } +} + +class Pixel { +protected: + using ChannelType = s16; + u8 m_BitDepth[4] = {8, 8, 8, 8}; + s16 color[4] = {}; + +public: + Pixel() = default; + Pixel(u32 a, u32 r, u32 g, u32 b, u32 
bitDepth = 8) + : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)}, + color{static_cast<ChannelType>(a), static_cast<ChannelType>(r), + static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {} + + // Changes the depth of each pixel. This scales the values to + // the appropriate bit depth by either truncating the least + // significant bits when going from larger to smaller bit depth + // or by repeating the most significant bits when going from + // smaller to larger bit depths. + void ChangeBitDepth() { + for (u32 i = 0; i < 4; i++) { + Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]); + m_BitDepth[i] = 8; + } + } + + template <typename IntType> + static float ConvertChannelToFloat(IntType channel, u8 bitDepth) { + float denominator = static_cast<float>((1 << bitDepth) - 1); + return static_cast<float>(channel) / denominator; + } + + // Changes the bit depth of a single component. See the comment + // above for how we do this. + static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) { + assert(oldDepth <= 8); + + if (oldDepth == 8) { + // Do nothing + return val; + } else if (oldDepth == 0) { + return static_cast<ChannelType>((1 << 8) - 1); + } else if (8 > oldDepth) { + return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth)); + } else { + // oldDepth > newDepth + const u8 bitsWasted = static_cast<u8>(oldDepth - 8); + u16 v = static_cast<u16>(val); + v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted); + v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1)); + return static_cast<u8>(v); + } + + assert(false && "We shouldn't get here."); + return 0; + } + + const ChannelType& A() const { + return color[0]; + } + ChannelType& A() { + return color[0]; + } + const ChannelType& R() const { + return color[1]; + } + ChannelType& R() { + return color[1]; + } + const ChannelType& G() const { + return color[2]; + } + ChannelType& G() { + return color[2]; + } + const ChannelType& B() const { + return color[3]; + } + ChannelType& B() { + return color[3]; + } + const ChannelType& Component(u32 idx) const { + return color[idx]; + } + ChannelType& Component(u32 idx) { + return color[idx]; + } + + void GetBitDepth(u8 (&outDepth)[4]) const { + for (s32 i = 0; i < 4; i++) { + outDepth[i] = m_BitDepth[i]; + } + } + + // Take all of the components, transform them to their 8-bit variants, + // and then pack each channel into an R8G8B8A8 32-bit integer. We assume + // that the architecture is little-endian, so the alpha channel will end + // up in the most-significant byte. + u32 Pack() const { + Pixel eightBit(*this); + eightBit.ChangeBitDepth(); + + u32 r = 0; + r |= eightBit.A(); + r <<= 8; + r |= eightBit.B(); + r <<= 8; + r |= eightBit.G(); + r <<= 8; + r |= eightBit.R(); + return r; + } + + // Clamps the pixel to the range [0,255] + void ClampByte() { + for (u32 i = 0; i < 4; i++) { + color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); + } + } + + void MakeOpaque() { + A() = 255; + } +}; + +static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, const u32 nPartitions, + const u32 nBitsForColorData) { + // First figure out how many color values we have + u32 nValues = 0; + for (u32 i = 0; i < nPartitions; i++) { + nValues += ((modes[i] >> 2) + 1) << 1; + } + + // Then based on the number of values and the remaining number of bits, + // figure out the max value for each of them... 
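The search implemented by the following lines can be restated on its own: walk down from the largest quantization range until the encoded size of nValues fits in the available color bits, then snap to the smallest range that shares the same encoding parameters, since several consecutive ranges map to one trit/quint/bit layout. An equivalent standalone sketch, using the same ASTC_ENCODINGS_VALUES table and the MatchesEncoding helper from astc.h (illustrative, not a drop-in replacement):

    u32 FindColorValueRange(u32 num_values, u32 available_bits) {
        u32 range = 255;
        while (range > 0 &&
               ASTC_ENCODINGS_VALUES[range].GetBitLength(num_values) > available_bits) {
            --range; // shrink until the encoding fits
        }
        const IntegerEncodedValue val = ASTC_ENCODINGS_VALUES[range];
        while (range > 0 && ASTC_ENCODINGS_VALUES[range - 1].MatchesEncoding(val)) {
            --range; // snap to the smallest range with identical encoding
        }
        return range;
    }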
+ u32 range = 256; + while (--range > 0) { + IntegerEncodedValue val = ASTC_ENCODINGS_VALUES[range]; + u32 bitLength = val.GetBitLength(nValues); + if (bitLength <= nBitsForColorData) { + // Find the smallest possible range that matches the given encoding + while (--range > 0) { + IntegerEncodedValue newval = ASTC_ENCODINGS_VALUES[range]; + if (!newval.MatchesEncoding(val)) { + break; + } + } + + // Return to last matching range. + range++; + break; + } + } + + // We now have enough to decode our integer sequence. + IntegerEncodedVector decodedColorValues; + + InputBitStream colorStream(data, 0); + DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); + + // Once we have the decoded values, we need to dequantize them to the 0-255 range + // This procedure is outlined in ASTC spec C.2.13 + u32 outIdx = 0; + for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) { + // Have we already decoded all that we need? + if (outIdx >= nValues) { + break; + } + + const IntegerEncodedValue& val = *itr; + u32 bitlen = val.num_bits; + u32 bitval = val.bit_value; + + assert(bitlen >= 1); + + u32 A = 0, B = 0, C = 0, D = 0; + // A is just the lsb replicated 9 times. + A = ReplicateBitTo9(bitval & 1); + + switch (val.encoding) { + // Replicate bits + case IntegerEncoding::JustBits: + out[outIdx++] = FastReplicateTo8(bitval, bitlen); + break; + + // Use algorithm in C.2.13 + case IntegerEncoding::Trit: { + + D = val.trit_value; + + switch (bitlen) { + case 1: { + C = 204; + } break; + + case 2: { + C = 93; + // B = b000b0bb0 + u32 b = (bitval >> 1) & 1; + B = (b << 8) | (b << 4) | (b << 2) | (b << 1); + } break; + + case 3: { + C = 44; + // B = cb000cbcb + u32 cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 2) | cb; + } break; + + case 4: { + C = 22; + // B = dcb000dcb + u32 dcb = (bitval >> 1) & 7; + B = (dcb << 6) | dcb; + } break; + + case 5: { + C = 11; + // B = edcb000ed + u32 edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 2); + } break; + + case 6: { + C = 5; + // B = fedcb000f + u32 fedcb = (bitval >> 1) & 0x1F; + B = (fedcb << 4) | (fedcb >> 4); + } break; + + default: + assert(false && "Unsupported trit encoding for color values!"); + break; + } // switch(bitlen) + } // case IntegerEncoding::Trit + break; + + case IntegerEncoding::Quint: { + + D = val.quint_value; + + switch (bitlen) { + case 1: { + C = 113; + } break; + + case 2: { + C = 54; + // B = b0000bb00 + u32 b = (bitval >> 1) & 1; + B = (b << 8) | (b << 3) | (b << 2); + } break; + + case 3: { + C = 26; + // B = cb0000cbc + u32 cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 1) | (cb >> 1); + } break; + + case 4: { + C = 13; + // B = dcb0000dc + u32 dcb = (bitval >> 1) & 7; + B = (dcb << 6) | (dcb >> 1); + } break; + + case 5: { + C = 6; + // B = edcb0000e + u32 edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 3); + } break; + + default: + assert(false && "Unsupported quint encoding for color values!"); + break; + } // switch(bitlen) + } // case IntegerEncoding::Quint + break; + } // switch(val.encoding) + + if (val.encoding != IntegerEncoding::JustBits) { + u32 T = D * C + B; + T ^= A; + T = (A & 0x80) | (T >> 2); + out[outIdx++] = T; + } + } + + // Make sure that each of our values is in the proper range... 
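To make the A/B/C/D machinery concrete, here is one fully worked dequantization (the trit encoding with one extra bit, i.e. the bitlen == 1 case above), using bit_value = 1 and trit_value = 2:

    u32 A = 0x1FF;             // the lsb (1) replicated 9 times
    u32 B = 0, C = 204, D = 2; // constants for the bitlen == 1 trit case
    u32 T = D * C + B;         // 408
    T ^= A;                    // 103
    T = (A & 0x80) | (T >> 2); // 128 | 25 = 153

153 is one of the six evenly spaced levels {0, 51, 102, 153, 204, 255} that a one-bit-plus-trit encoding can produce, which is exactly the range the assertions that follow check for.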
+ for (u32 i = 0; i < nValues; i++) { + assert(out[i] <= 255); + } +} + +static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { + u32 bitval = val.bit_value; + u32 bitlen = val.num_bits; + + u32 A = ReplicateBitTo7(bitval & 1); + u32 B = 0, C = 0, D = 0; + + u32 result = 0; + switch (val.encoding) { + case IntegerEncoding::JustBits: + result = FastReplicateTo6(bitval, bitlen); + break; + + case IntegerEncoding::Trit: { + D = val.trit_value; + assert(D < 3); + + switch (bitlen) { + case 0: { + u32 results[3] = {0, 32, 63}; + result = results[D]; + } break; + + case 1: { + C = 50; + } break; + + case 2: { + C = 23; + u32 b = (bitval >> 1) & 1; + B = (b << 6) | (b << 2) | b; + } break; + + case 3: { + C = 11; + u32 cb = (bitval >> 1) & 3; + B = (cb << 5) | cb; + } break; + + default: + assert(false && "Invalid trit encoding for texel weight"); + break; + } + } break; + + case IntegerEncoding::Quint: { + D = val.quint_value; + assert(D < 5); + + switch (bitlen) { + case 0: { + u32 results[5] = {0, 16, 32, 47, 63}; + result = results[D]; + } break; + + case 1: { + C = 28; + } break; + + case 2: { + C = 13; + u32 b = (bitval >> 1) & 1; + B = (b << 6) | (b << 1); + } break; + + default: + assert(false && "Invalid quint encoding for texel weight"); + break; + } + } break; + } + + if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) { + // Decode the value... + result = D * C + B; + result ^= A; + result = (A & 0x20) | (result >> 2); + } + + assert(result < 64); + + // Change from [0,63] to [0,64] + if (result > 32) { + result += 1; + } + + return result; +} + +static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights, + const TexelWeightParams& params, const u32 blockWidth, + const u32 blockHeight) { + u32 weightIdx = 0; + u32 unquantized[2][144]; + + for (auto itr = weights.begin(); itr != weights.end(); ++itr) { + unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr); + + if (params.m_bDualPlane) { + ++itr; + unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr); + if (itr == weights.end()) { + break; + } + } + + if (++weightIdx >= (params.m_Width * params.m_Height)) + break; + } + + // Do infill if necessary (Section C.2.18) ... + u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); + u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); + + const u32 kPlaneScale = params.m_bDualPlane ? 
2U : 1U; + for (u32 plane = 0; plane < kPlaneScale; plane++) + for (u32 t = 0; t < blockHeight; t++) + for (u32 s = 0; s < blockWidth; s++) { + u32 cs = Ds * s; + u32 ct = Dt * t; + + u32 gs = (cs * (params.m_Width - 1) + 32) >> 6; + u32 gt = (ct * (params.m_Height - 1) + 32) >> 6; + + u32 js = gs >> 4; + u32 fs = gs & 0xF; + + u32 jt = gt >> 4; + u32 ft = gt & 0x0F; + + u32 w11 = (fs * ft + 8) >> 4; + u32 w10 = ft - w11; + u32 w01 = fs - w11; + u32 w00 = 16 - fs - ft + w11; + + u32 v0 = js + jt * params.m_Width; + +#define FIND_TEXEL(tidx, bidx) \ + u32 p##bidx = 0; \ + do { \ + if ((tidx) < (params.m_Width * params.m_Height)) { \ + p##bidx = unquantized[plane][(tidx)]; \ + } \ + } while (0) + + FIND_TEXEL(v0, 00); + FIND_TEXEL(v0 + 1, 01); + FIND_TEXEL(v0 + params.m_Width, 10); + FIND_TEXEL(v0 + params.m_Width + 1, 11); + +#undef FIND_TEXEL + + out[plane][t * blockWidth + s] = + (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4; + } +} + +// Transfers a bit as described in C.2.14 +static inline void BitTransferSigned(int& a, int& b) { + b >>= 1; + b |= a & 0x80; + a >>= 1; + a &= 0x3F; + if (a & 0x20) + a -= 0x40; +} + +// Adds more precision to the blue channel as described +// in C.2.14 +static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) { + return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1), + static_cast<s16>((g + b) >> 1), static_cast<s16>(b)); +} + +// Partition selection functions as specified in +// C.2.21 +static inline u32 hash52(u32 p) { + p ^= p >> 15; + p -= p << 17; + p += p << 7; + p += p << 4; + p ^= p >> 5; + p += p << 16; + p ^= p >> 7; + p ^= p >> 3; + p ^= p << 6; + p ^= p >> 17; + return p; +} + +static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) { + if (1 == partitionCount) + return 0; + + if (smallBlock) { + x <<= 1; + y <<= 1; + z <<= 1; + } + + seed += (partitionCount - 1) * 1024; + + u32 rnum = hash52(static_cast<u32>(seed)); + u8 seed1 = static_cast<u8>(rnum & 0xF); + u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF); + u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF); + u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF); + u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF); + u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF); + u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF); + u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF); + u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF); + u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF); + u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF); + u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF); + + seed1 = static_cast<u8>(seed1 * seed1); + seed2 = static_cast<u8>(seed2 * seed2); + seed3 = static_cast<u8>(seed3 * seed3); + seed4 = static_cast<u8>(seed4 * seed4); + seed5 = static_cast<u8>(seed5 * seed5); + seed6 = static_cast<u8>(seed6 * seed6); + seed7 = static_cast<u8>(seed7 * seed7); + seed8 = static_cast<u8>(seed8 * seed8); + seed9 = static_cast<u8>(seed9 * seed9); + seed10 = static_cast<u8>(seed10 * seed10); + seed11 = static_cast<u8>(seed11 * seed11); + seed12 = static_cast<u8>(seed12 * seed12); + + s32 sh1, sh2, sh3; + if (seed & 1) { + sh1 = (seed & 2) ? 4 : 5; + sh2 = (partitionCount == 3) ? 6 : 5; + } else { + sh1 = (partitionCount == 3) ? 6 : 5; + sh2 = (seed & 2) ? 4 : 5; + } + sh3 = (seed & 0x10) ? 
sh1 : sh2; + + seed1 = static_cast<u8>(seed1 >> sh1); + seed2 = static_cast<u8>(seed2 >> sh2); + seed3 = static_cast<u8>(seed3 >> sh1); + seed4 = static_cast<u8>(seed4 >> sh2); + seed5 = static_cast<u8>(seed5 >> sh1); + seed6 = static_cast<u8>(seed6 >> sh2); + seed7 = static_cast<u8>(seed7 >> sh1); + seed8 = static_cast<u8>(seed8 >> sh2); + seed9 = static_cast<u8>(seed9 >> sh3); + seed10 = static_cast<u8>(seed10 >> sh3); + seed11 = static_cast<u8>(seed11 >> sh3); + seed12 = static_cast<u8>(seed12 >> sh3); + + s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); + s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); + s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); + s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + + a &= 0x3F; + b &= 0x3F; + c &= 0x3F; + d &= 0x3F; + + if (partitionCount < 4) + d = 0; + if (partitionCount < 3) + c = 0; + + if (a >= b && a >= c && a >= d) + return 0; + else if (b >= c && b >= d) + return 1; + else if (c >= d) + return 2; + return 3; +} + +static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) { + return SelectPartition(seed, x, y, 0, partitionCount, smallBlock); +} + +// Section C.2.14 +static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues, + u32 colorEndpointMode) { +#define READ_UINT_VALUES(N) \ + u32 v[N]; \ + for (u32 i = 0; i < N; i++) { \ + v[i] = *(colorValues++); \ + } + +#define READ_INT_VALUES(N) \ + s32 v[N]; \ + for (u32 i = 0; i < N; i++) { \ + v[i] = static_cast<int>(*(colorValues++)); \ + } + + switch (colorEndpointMode) { + case 0: { + READ_UINT_VALUES(2) + ep1 = Pixel(0xFF, v[0], v[0], v[0]); + ep2 = Pixel(0xFF, v[1], v[1], v[1]); + } break; + + case 1: { + READ_UINT_VALUES(2) + u32 L0 = (v[0] >> 2) | (v[1] & 0xC0); + u32 L1 = std::min(L0 + (v[1] & 0x3F), 0xFFU); + ep1 = Pixel(0xFF, L0, L0, L0); + ep2 = Pixel(0xFF, L1, L1, L1); + } break; + + case 4: { + READ_UINT_VALUES(4) + ep1 = Pixel(v[2], v[0], v[0], v[0]); + ep2 = Pixel(v[3], v[1], v[1], v[1]); + } break; + + case 5: { + READ_INT_VALUES(4) + BitTransferSigned(v[1], v[0]); + BitTransferSigned(v[3], v[2]); + ep1 = Pixel(v[2], v[0], v[0], v[0]); + ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]); + ep1.ClampByte(); + ep2.ClampByte(); + } break; + + case 6: { + READ_UINT_VALUES(4) + ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); + ep2 = Pixel(0xFF, v[0], v[1], v[2]); + } break; + + case 8: { + READ_UINT_VALUES(6) + if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { + ep1 = Pixel(0xFF, v[0], v[2], v[4]); + ep2 = Pixel(0xFF, v[1], v[3], v[5]); + } else { + ep1 = BlueContract(0xFF, v[1], v[3], v[5]); + ep2 = BlueContract(0xFF, v[0], v[2], v[4]); + } + } break; + + case 9: { + READ_INT_VALUES(6) + BitTransferSigned(v[1], v[0]); + BitTransferSigned(v[3], v[2]); + BitTransferSigned(v[5], v[4]); + if (v[1] + v[3] + v[5] >= 0) { + ep1 = Pixel(0xFF, v[0], v[2], v[4]); + ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]); + } else { + ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]); + ep2 = BlueContract(0xFF, v[0], v[2], v[4]); + } + ep1.ClampByte(); + ep2.ClampByte(); + } break; + + case 10: { + READ_UINT_VALUES(6) + ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); + ep2 = Pixel(v[5], v[0], v[1], v[2]); + } break; + + case 12: { + READ_UINT_VALUES(8) + if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { + ep1 = Pixel(v[6], v[0], v[2], v[4]); + ep2 = Pixel(v[7], v[1], v[3], v[5]); + } else { + ep1 = BlueContract(v[7], v[1], 
v[3], v[5]); + ep2 = BlueContract(v[6], v[0], v[2], v[4]); + } + } break; + + case 13: { + READ_INT_VALUES(8) + BitTransferSigned(v[1], v[0]); + BitTransferSigned(v[3], v[2]); + BitTransferSigned(v[5], v[4]); + BitTransferSigned(v[7], v[6]); + if (v[1] + v[3] + v[5] >= 0) { + ep1 = Pixel(v[6], v[0], v[2], v[4]); + ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]); + } else { + ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]); + ep2 = BlueContract(v[6], v[0], v[2], v[4]); + } + ep1.ClampByte(); + ep2.ClampByte(); + } break; + + default: + assert(false && "Unsupported color endpoint mode (is it HDR?)"); + break; + } + +#undef READ_UINT_VALUES +#undef READ_INT_VALUES +} + +static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, + const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { + InputBitStream strm(inBuf); + TexelWeightParams weightParams = DecodeBlockInfo(strm); + + // Was there an error? + if (weightParams.m_bError) { + assert(false && "Invalid block mode"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + if (weightParams.m_bVoidExtentLDR) { + FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight); + return; + } + + if (weightParams.m_bVoidExtentHDR) { + assert(false && "HDR void extent blocks are unsupported!"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + if (weightParams.m_Width > blockWidth) { + assert(false && "Texel weight grid width should be smaller than block width"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + if (weightParams.m_Height > blockHeight) { + assert(false && "Texel weight grid height should be smaller than block height"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + // Read num partitions + u32 nPartitions = strm.ReadBits<2>() + 1; + assert(nPartitions <= 4); + + if (nPartitions == 4 && weightParams.m_bDualPlane) { + assert(false && "Dual plane mode is incompatible with four partition blocks"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + // Based on the number of partitions, read the color endpoint mode for + // each partition. + + // Determine partitions, partition index, and color endpoint modes + s32 planeIdx = -1; + u32 partitionIndex; + u32 colorEndpointMode[4] = {0, 0, 0, 0}; + + // Define color data. + u8 colorEndpointData[16]; + memset(colorEndpointData, 0, sizeof(colorEndpointData)); + OutputBitStream colorEndpointStream(colorEndpointData, 16 * 8, 0); + + // Read extra config data... + u32 baseCEM = 0; + if (nPartitions == 1) { + colorEndpointMode[0] = strm.ReadBits<4>(); + partitionIndex = 0; + } else { + partitionIndex = strm.ReadBits<10>(); + baseCEM = strm.ReadBits<6>(); + } + u32 baseMode = (baseCEM & 3); + + // Remaining bits are color endpoint data... + u32 nWeightBits = weightParams.GetPackedBitSize(); + s32 remainingBits = 128 - nWeightBits - static_cast<int>(strm.GetBitsRead()); + + // Consider extra bits prior to texel data... + u32 extraCEMbits = 0; + if (baseMode) { + switch (nPartitions) { + case 2: + extraCEMbits += 2; + break; + case 3: + extraCEMbits += 5; + break; + case 4: + extraCEMbits += 8; + break; + default: + assert(false); + break; + } + } + remainingBits -= extraCEMbits; + + // Do we have a dual plane situation? + u32 planeSelectorBits = 0; + if (weightParams.m_bDualPlane) { + planeSelectorBits = 2; + } + remainingBits -= planeSelectorBits; + + // Read color data... 
+    u32 colorDataBits = remainingBits;
+    while (remainingBits > 0) {
+        u32 nb = std::min(remainingBits, 8);
+        u32 b = strm.ReadBits(nb);
+        colorEndpointStream.WriteBits(b, nb);
+        remainingBits -= 8;
+    }
+
+    // Read the plane selection bits
+    planeIdx = strm.ReadBits(planeSelectorBits);
+
+    // Read the rest of the CEM
+    if (baseMode) {
+        u32 extraCEM = strm.ReadBits(extraCEMbits);
+        u32 CEM = (extraCEM << 6) | baseCEM;
+        CEM >>= 2;
+
+        bool C[4] = {0};
+        for (u32 i = 0; i < nPartitions; i++) {
+            C[i] = CEM & 1;
+            CEM >>= 1;
+        }
+
+        u8 M[4] = {0};
+        for (u32 i = 0; i < nPartitions; i++) {
+            M[i] = CEM & 3;
+            CEM >>= 2;
+            assert(M[i] <= 3);
+        }
+
+        for (u32 i = 0; i < nPartitions; i++) {
+            colorEndpointMode[i] = baseMode;
+            if (!(C[i]))
+                colorEndpointMode[i] -= 1;
+            colorEndpointMode[i] <<= 2;
+            colorEndpointMode[i] |= M[i];
+        }
+    } else if (nPartitions > 1) {
+        u32 CEM = baseCEM >> 2;
+        for (u32 i = 0; i < nPartitions; i++) {
+            colorEndpointMode[i] = CEM;
+        }
+    }
+
+    // Make sure everything up till here is sane.
+    for (u32 i = 0; i < nPartitions; i++) {
+        assert(colorEndpointMode[i] < 16);
+    }
+    assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
+
+    // Decode both color data and texel weight data
+    u32 colorValues[32]; // Four values, two endpoints, four maximum partitions
+    DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
+                      colorDataBits);
+
+    Pixel endpoints[4][2];
+    const u32* colorValuesPtr = colorValues;
+    for (u32 i = 0; i < nPartitions; i++) {
+        ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]);
+    }
+
+    // Read the texel weight data...
+    std::array<u8, 16> texelWeightData;
+    std::ranges::copy(inBuf, texelWeightData.begin());
+
+    // Reverse everything
+    for (u32 i = 0; i < 8; i++) {
+// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
+        u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i]));
+        u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i]));
+#undef REVERSE_BYTE
+
+        texelWeightData[i] = b;
+        texelWeightData[15 - i] = a;
+    }
+
+    // Make sure that higher non-texel bits are set to zero
+    const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
+    if (clearByteStart > 0 && clearByteStart <= texelWeightData.size()) {
+        texelWeightData[clearByteStart - 1] &=
+            static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
+        std::memset(texelWeightData.data() + clearByteStart, 0,
+                    std::min(16U - clearByteStart, 16U));
+    }
+
+    IntegerEncodedVector texelWeightValues;
+
+    InputBitStream weightStream(texelWeightData);
+
+    DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight,
+                          weightParams.GetNumWeightValues());
+
+    // Blocks can be at most 12x12, so we can have as many as 144 weights
+    u32 weights[2][144];
+    UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
+
+    // Now that we have endpoints and weights, we can interpolate and generate
+    // the proper decoding...
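One worked example of the interpolation performed by the loop below: take endpoints 0 and 255 for a channel and the midpoint weight 32 of the [0, 64] weight range:

    u32 C0 = 0;     // ReplicateByteTo16(0)
    u32 C1 = 65535; // ReplicateByteTo16(255)
    u32 w = 32;
    u32 C = (C0 * (64 - w) + C1 * w + 32) / 64; // 32768
    // C != 65535, so the stored value is 255 * (32768 / 65536.0) + 0.5 = 128

Expanding to 16 bits before interpolating and then narrowing back keeps full-weight texels exact (weight 64 reproduces the endpoint itself) while rounding intermediate weights to the nearest 8-bit value.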
+ for (u32 j = 0; j < blockHeight; j++) + for (u32 i = 0; i < blockWidth; i++) { + u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions, + (blockHeight * blockWidth) < 32); + assert(partition < nPartitions); + + Pixel p; + for (u32 c = 0; c < 4; c++) { + u32 C0 = endpoints[partition][0].Component(c); + C0 = ReplicateByteTo16(C0); + u32 C1 = endpoints[partition][1].Component(c); + C1 = ReplicateByteTo16(C1); + + u32 plane = 0; + if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { + plane = 1; + } + + u32 weight = weights[plane][j * blockWidth + i]; + u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64; + if (C == 65535) { + p.Component(c) = 255; + } else { + double Cf = static_cast<double>(C); + p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5); + } + } + + outBuf[j * blockWidth + i] = p.Pack(); + } +} + +void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, + uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) { + u32 block_index = 0; + std::size_t depth_offset = 0; + for (u32 z = 0; z < depth; z++) { + for (u32 y = 0; y < height; y += block_height) { + for (u32 x = 0; x < width; x += block_width) { + const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)}; + + // Blocks can be at most 12x12 + std::array<u32, 12 * 12> uncompData; + DecompressBlock(blockPtr, block_width, block_height, uncompData); + + u32 decompWidth = std::min(block_width, width - x); + u32 decompHeight = std::min(block_height, height - y); + + const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4); + for (u32 jj = 0; jj < decompHeight; jj++) { + std::memcpy(outRow.data() + jj * width * 4, + uncompData.data() + jj * block_width, decompWidth * 4); + } + ++block_index; + } + } + depth_offset += height * width * 4; + } +} + +} // namespace Tegra::Texture::ASTC diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index c1c73fda5..0229ae122 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -77,7 +77,7 @@ constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { return encodings; } -constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues(); +constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues(); // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] // is the same as [(num_bits - 1):0] and repeats all the way down. 
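The replication described in the comment above is easiest to see with a concrete case: replicating the 3-bit value 0b101 to 8 bits tiles the pattern from the most significant end, giving 0b10110110 = 182. The same result falls out of plain shifts:

    u32 v = 0b101;                          // 3-bit input
    u32 r = (v << 5) | (v << 2) | (v >> 1); // 0b10110110 == 182

Because every (value, source width) pair maps to a fixed replicated constant, the REPLICATE_*_TABLE arrays kept in this header reduce hot paths such as FastReplicateTo8 in astc.cpp to a single table lookup.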
@@ -116,17 +116,11 @@ constexpr auto MakeReplicateTable() {
     return table;
 }
 
-constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
 constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
 constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
 constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
 
-struct AstcBufferData {
-    decltype(EncodingsValues) encoding_values = EncodingsValues;
-    decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE;
-    decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE;
-    decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE;
-    decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE;
-} constexpr ASTC_BUFFER_DATA;
+void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
+                uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
 
 } // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp
index f0ee76519..758c038ba 100644
--- a/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp
+++ b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp
@@ -50,7 +50,7 @@ NsightAftermathTracker::NsightAftermathTracker() {
     }
 
     dump_dir = Common::FS::GetYuzuPath(Common::FS::YuzuPath::LogDir) / "gpucrash";
-    void(Common::FS::RemoveDirRecursively(dump_dir));
+    Common::FS::RemoveDirRecursively(dump_dir);
     if (!Common::FS::CreateDir(dump_dir)) {
         LOG_ERROR(Render_Vulkan, "Failed to create Nsight Aftermath dump directory");
         return;
diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp
index 5c64c9bf7..0f60765bb 100644
--- a/src/video_core/vulkan_common/vulkan_debug_callback.cpp
+++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp
@@ -12,6 +12,14 @@ VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity,
                   VkDebugUtilsMessageTypeFlagsEXT type,
                   const VkDebugUtilsMessengerCallbackDataEXT* data,
                   [[maybe_unused]] void* user_data) {
+    // Skip logging known false-positive validation errors
+    switch (static_cast<u32>(data->messageIdNumber)) {
+    case 0x682a878au: // VUID-vkCmdBindVertexBuffers2EXT-pBuffers-parameter
+    case 0x99fb7dfdu: // UNASSIGNED-RequiredParameter (vkCmdBindVertexBuffers2EXT pBuffers[0])
+        return VK_FALSE;
+    default:
+        break;
+    }
     const std::string_view message{data->pMessage};
     if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
         LOG_CRITICAL(Render_Vulkan, "{}", message);
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 64206b3d2..f214510da 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -408,6 +408,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
     }
     logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld);
 
+    CollectPhysicalMemoryInfo();
     CollectTelemetryParameters();
     CollectToolingInfo();
 
@@ -531,6 +532,27 @@ bool Device::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags want
     return (supported_usage & wanted_usage) == wanted_usage;
 }
 
+std::string Device::GetDriverName() const {
+    switch (driver_id) {
+    case VK_DRIVER_ID_AMD_PROPRIETARY:
+        return "AMD";
+    case VK_DRIVER_ID_AMD_OPEN_SOURCE:
+        return "AMDVLK";
+    case VK_DRIVER_ID_MESA_RADV:
+        return "RADV";
+    case VK_DRIVER_ID_NVIDIA_PROPRIETARY:
+        return "NVIDIA";
+    case VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS:
+        return "INTEL";
+    case VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA:
+        return "ANV";
+    case VK_DRIVER_ID_MESA_LLVMPIPE:
+        return "LAVAPIPE";
+    default:
+        return vendor_name;
+    }
+}
+
 void Device::CheckSuitability(bool requires_swapchain) const {
     std::bitset<REQUIRED_EXTENSIONS.size()> available_extensions;
     bool has_swapchain = false;
@@ -818,6 +840,17 @@ void Device::CollectTelemetryParameters() {
     }
 }
 
+void Device::CollectPhysicalMemoryInfo() {
+    const auto mem_properties = physical.GetMemoryProperties();
+    const size_t num_properties = mem_properties.memoryHeapCount;
+    device_access_memory = 0;
+    for (size_t element = 0; element < num_properties; ++element) {
+        if ((mem_properties.memoryHeaps[element].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) {
+            device_access_memory += mem_properties.memoryHeaps[element].size;
+        }
+    }
+}
+
 void Device::CollectToolingInfo() {
     if (!ext_tooling_info) {
         return;
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 67d70cd22..96c0f8c60 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -45,6 +45,9 @@ public:
     /// Reports a shader to Nsight Aftermath.
     void SaveShader(const std::vector<u32>& spirv) const;
 
+    /// Returns the name of the VkDriverId reported from Vulkan.
+    std::string GetDriverName() const;
+
     /// Returns the dispatch loader with direct function pointers of the device.
     const vk::DeviceDispatch& GetDispatchLoader() const {
         return dld;
@@ -225,6 +228,10 @@ public:
         return use_asynchronous_shaders;
     }
 
+    u64 GetDeviceLocalMemory() const {
+        return device_access_memory;
+    }
+
 private:
     /// Checks if the physical device is suitable.
     void CheckSuitability(bool requires_swapchain) const;
@@ -244,6 +251,9 @@ private:
     /// Collects information about attached tools.
     void CollectToolingInfo();
 
+    /// Collects information about the device's local memory.
+    void CollectPhysicalMemoryInfo();
+
     /// Returns a list of queue initialization descriptors.
     std::vector<VkDeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const;
 
@@ -257,21 +267,22 @@ private:
     bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage,
                            FormatType format_type) const;
 
-    VkInstance instance;                        ///< Vulkan instance.
-    vk::DeviceDispatch dld;                     ///< Device function pointers.
-    vk::PhysicalDevice physical;                ///< Physical device.
-    VkPhysicalDeviceProperties properties;      ///< Device properties.
-    vk::Device logical;                         ///< Logical device.
-    vk::Queue graphics_queue;                   ///< Main graphics queue.
-    vk::Queue present_queue;                    ///< Main present queue.
-    u32 instance_version{};                     ///< Vulkan onstance version.
-    u32 graphics_family{};                      ///< Main graphics queue family index.
-    u32 present_family{};                       ///< Main present queue family index.
-    VkDriverIdKHR driver_id{};                  ///< Driver ID.
-    VkShaderStageFlags guest_warp_stages{};     ///< Stages where the guest warp size can be forced.ed
-    bool is_optimal_astc_supported{};           ///< Support for native ASTC.
-    bool is_float16_supported{};                ///< Support for float16 arithmetics.
-    bool is_warp_potentially_bigger{};          ///< Host warp size can be bigger than guest.
+    VkInstance instance;                         ///< Vulkan instance.
+    vk::DeviceDispatch dld;                      ///< Device function pointers.
+    vk::PhysicalDevice physical;                 ///< Physical device.
+    VkPhysicalDeviceProperties properties;       ///< Device properties.
+    vk::Device logical;                          ///< Logical device.
+    vk::Queue graphics_queue;                    ///< Main graphics queue.
+    vk::Queue present_queue;                     ///< Main present queue.
+    u32 instance_version{};                      ///< Vulkan instance version.
+    u32 graphics_family{};                       ///< Main graphics queue family index.
+    u32 present_family{};                        ///< Main present queue family index.
+    VkDriverIdKHR driver_id{};                   ///< Driver ID.
+    VkShaderStageFlags guest_warp_stages{};      ///< Stages where the guest warp size can be forced.
+    u64 device_access_memory{};                  ///< Total size of device local memory in bytes.
+    bool is_optimal_astc_supported{};            ///< Support for native ASTC.
+    bool is_float16_supported{};                 ///< Support for float16 arithmetics.
+    bool is_warp_potentially_bigger{};           ///< Host warp size can be bigger than guest.
     bool is_formatless_image_load_supported{};   ///< Support for shader image read without format.
     bool is_shader_storage_image_multisample{};  ///< Support for image operations on MSAA images.
     bool is_blit_depth_stencil_supported{};      ///< Support for blitting from and to depth stencil.
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
index 5edd06ebc..aa173d19e 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
@@ -69,10 +69,10 @@ constexpr VkExportMemoryAllocateInfo EXPORT_ALLOCATE_INFO{
 
 class MemoryAllocation {
 public:
-    explicit MemoryAllocation(vk::DeviceMemory memory_, VkMemoryPropertyFlags properties,
-                              u64 allocation_size_, u32 type)
-        : memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties},
-          shifted_memory_type{1U << type} {}
+    explicit MemoryAllocation(MemoryAllocator* const allocator_, vk::DeviceMemory memory_,
+                              VkMemoryPropertyFlags properties, u64 allocation_size_, u32 type)
+        : allocator{allocator_}, memory{std::move(memory_)}, allocation_size{allocation_size_},
+          property_flags{properties}, shifted_memory_type{1U << type} {}
 
 #if defined(_WIN32) || defined(__unix__)
     ~MemoryAllocation() {
@@ -106,6 +106,10 @@ public:
         const auto it = std::ranges::find(commits, begin, &Range::begin);
         ASSERT_MSG(it != commits.end(), "Invalid commit");
         commits.erase(it);
+        if (commits.empty()) {
+            // Do not call any code involving 'this' after this call, the object will be destroyed
+            allocator->ReleaseMemory(this);
+        }
     }
 
     [[nodiscard]] std::span<u8> Map() {
@@ -171,6 +175,7 @@ private:
         return candidate;
     }
 
+    MemoryAllocator* const allocator;          ///< Parent memory allocator.
     const vk::DeviceMemory memory;             ///< Vulkan memory allocation handler.
     const u64 allocation_size;                 ///< Size of this allocation.
     const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags.
@@ -275,10 +280,17 @@ bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask,
             return false;
         }
     }
-    allocations.push_back(std::make_unique<MemoryAllocation>(std::move(memory), flags, size, type));
+    allocations.push_back(
+        std::make_unique<MemoryAllocation>(this, std::move(memory), flags, size, type));
     return true;
 }
 
+void MemoryAllocator::ReleaseMemory(MemoryAllocation* alloc) {
+    const auto it = std::ranges::find(allocations, alloc, &std::unique_ptr<MemoryAllocation>::get);
+    ASSERT(it != allocations.end());
+    allocations.erase(it);
+}
+
 std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements,
                                                        VkMemoryPropertyFlags flags) {
     for (auto& allocation : allocations) {
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h
index db12d02f4..b61e931e0 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.h
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h
@@ -69,6 +69,8 @@ private:
 /// Memory allocator container.
 /// Allocates and releases memory allocations on demand.
 class MemoryAllocator {
+    friend MemoryAllocation;
+
 public:
     /**
      * Construct memory allocator
@@ -104,6 +106,9 @@ private:
     /// Tries to allocate a chunk of memory.
     bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size);
 
+    /// Releases a chunk of memory.
+    void ReleaseMemory(MemoryAllocation* alloc);
+
     /// Tries to allocate a memory commit.
     std::optional<MemoryCommit> TryCommit(const VkMemoryRequirements& requirements,
                                           VkMemoryPropertyFlags flags);
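
The Free/ReleaseMemory interplay above means a MemoryAllocation now destroys itself through its parent allocator once its last commit is returned. A reduced sketch of that ownership pattern, with hypothetical Pool/Chunk names standing in for MemoryAllocator/MemoryAllocation:

    #include <algorithm>
    #include <cassert>
    #include <memory>
    #include <vector>

    class Pool;

    class Chunk {
    public:
        explicit Chunk(Pool* pool_) : pool{pool_} {}
        void Release(); // Defined after Pool; may destroy *this.

    private:
        Pool* const pool;     // Parent pool; set once at construction.
        int live_commits = 1; // Stand-in for the commits list.
    };

    class Pool {
    public:
        Chunk* Allocate() {
            chunks.push_back(std::make_unique<Chunk>(this));
            return chunks.back().get();
        }

        // Erasing the unique_ptr runs ~Chunk, so callers must not touch
        // the pointer afterwards (same contract as ReleaseMemory above).
        void ReleaseChunk(Chunk* chunk) {
            const auto it = std::ranges::find(chunks, chunk, &std::unique_ptr<Chunk>::get);
            assert(it != chunks.end());
            chunks.erase(it);
        }

    private:
        std::vector<std::unique_ptr<Chunk>> chunks;
    };

    void Chunk::Release() {
        if (--live_commits == 0) {
            // As in MemoryAllocation::Free: this must be the last statement
            // executed on the object, since the parent destroys it here.
            pool->ReleaseChunk(this);
        }
    }

    int main() {
        Pool pool;
        Chunk* chunk = pool.Allocate();
        chunk->Release(); // Chunk removes itself from the pool and is destroyed.
    }

The friend declaration in the header exists precisely so the child can reach the otherwise-private release hook; the design trades a back-pointer per allocation for eager reclamation of empty VkDeviceMemory blocks.
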
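Relatedly, the CollectPhysicalMemoryInfo() hunk in vulkan_device.cpp sums only heaps flagged VK_MEMORY_HEAP_DEVICE_LOCAL_BIT. Expressed against the raw Vulkan API (yuzu goes through its vk::PhysicalDevice wrapper instead), the equivalent query looks roughly like this:

    #include <cstdint>
    #include <vulkan/vulkan.h>

    // Sum the sizes of all device-local heaps, mirroring CollectPhysicalMemoryInfo().
    // Note a heap shared with host-visible types still counts; this is an upper
    // bound on dedicated VRAM, not a measure of free memory.
    uint64_t QueryDeviceLocalMemory(VkPhysicalDevice physical) {
        VkPhysicalDeviceMemoryProperties props;
        vkGetPhysicalDeviceMemoryProperties(physical, &props);
        uint64_t total = 0;
        for (uint32_t i = 0; i < props.memoryHeapCount; ++i) {
            if ((props.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) {
                total += props.memoryHeaps[i].size;
            }
        }
        return total;
    }
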
