diff options
Diffstat (limited to 'src/video_core')
45 files changed, 427 insertions, 238 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 58a45ab67..6ed4b78f2 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -115,7 +115,34 @@ void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { template <class P> void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { - memory_tracker.CachedCpuWrite(cpu_addr, size); + const bool is_dirty = IsRegionRegistered(cpu_addr, size); + if (!is_dirty) { + return; + } + VAddr aligned_start = Common::AlignDown(cpu_addr, YUZU_PAGESIZE); + VAddr aligned_end = Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE); + if (!IsRegionGpuModified(aligned_start, aligned_end - aligned_start)) { + WriteMemory(cpu_addr, size); + return; + } + + tmp_buffer.resize_destructive(size); + cpu_memory.ReadBlockUnsafe(cpu_addr, tmp_buffer.data(), size); + + InlineMemoryImplementation(cpu_addr, size, tmp_buffer); +} + +template <class P> +bool BufferCache<P>::OnCPUWrite(VAddr cpu_addr, u64 size) { + const bool is_dirty = IsRegionRegistered(cpu_addr, size); + if (!is_dirty) { + return false; + } + if (memory_tracker.IsRegionGpuModified(cpu_addr, size)) { + return true; + } + WriteMemory(cpu_addr, size); + return false; } template <class P> @@ -207,9 +234,10 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am if (has_new_downloads) { memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } - tmp_buffer.resize_destructive(amount); - cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); - cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount); + + Core::Memory::CpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::UnsafeReadWrite> tmp( + cpu_memory, *cpu_src_address, amount, &tmp_buffer); + tmp.SetAddressAndSize(*cpu_dest_address, amount); return true; } @@ -1553,6 +1581,14 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size, return false; } + InlineMemoryImplementation(dest_address, copy_size, inlined_buffer); + + return true; +} + +template <class P> +void BufferCache<P>::InlineMemoryImplementation(VAddr dest_address, size_t copy_size, + std::span<const u8> inlined_buffer) { const IntervalType subtract_interval{dest_address, dest_address + copy_size}; ClearDownload(subtract_interval); common_ranges.subtract(subtract_interval); @@ -1574,8 +1610,6 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size, } else { buffer.ImmediateUpload(buffer.Offset(dest_address), inlined_buffer.first(copy_size)); } - - return true; } template <class P> diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index fe6068cfe..460fc7551 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -245,6 +245,8 @@ public: void CachedWriteMemory(VAddr cpu_addr, u64 size); + bool OnCPUWrite(VAddr cpu_addr, u64 size); + void DownloadMemory(VAddr cpu_addr, u64 size); std::optional<VideoCore::RasterizerDownloadArea> GetFlushArea(VAddr cpu_addr, u64 size); @@ -543,6 +545,9 @@ private: void ClearDownload(IntervalType subtract_interval); + void InlineMemoryImplementation(VAddr dest_address, size_t copy_size, + std::span<const u8> inlined_buffer); + VideoCore::RasterizerInterface& rasterizer; Core::Memory::Memory& cpu_memory; diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp index ab4f4d407..87d69ebc5 100644 --- a/src/video_core/compatible_formats.cpp +++ b/src/video_core/compatible_formats.cpp @@ -272,6 +272,9 @@ constexpr Table MakeNonNativeBgrCopyTable() { bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views, bool native_bgr) { + if (format_a == format_b) { + return true; + } if (broken_views) { // If format views are broken, only accept formats that are identical. return format_a == format_b; @@ -282,6 +285,9 @@ bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_vi } bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b, bool native_bgr) { + if (format_a == format_b) { + return true; + } static constexpr Table BGR_TABLE = MakeNativeBgrCopyTable(); static constexpr Table NO_BGR_TABLE = MakeNonNativeBgrCopyTable(); return IsSupported(native_bgr ? BGR_TABLE : NO_BGR_TABLE, format_a, format_b); diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 551929824..9f1b340a9 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -5,6 +5,7 @@ #include "common/microprofile.h" #include "common/settings.h" #include "core/core.h" +#include "core/memory.h" #include "video_core/dma_pusher.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" @@ -12,6 +13,8 @@ namespace Tegra { +constexpr u32 MacroRegistersStart = 0xE00; + DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_, MemoryManager& memory_manager_, Control::ChannelState& channel_state_) : gpu{gpu_}, system{system_}, memory_manager{memory_manager_}, puller{gpu_, memory_manager_, @@ -74,25 +77,16 @@ bool DmaPusher::Step() { } // Push buffer non-empty, read a word - command_headers.resize_destructive(command_list_header.size); - constexpr u32 MacroRegistersStart = 0xE00; - if (dma_state.method < MacroRegistersStart) { - if (Settings::IsGPULevelHigh()) { - memory_manager.ReadBlock(dma_state.dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); - } else { - memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); - } - } else { - const size_t copy_size = command_list_header.size * sizeof(u32); + if (dma_state.method >= MacroRegistersStart) { if (subchannels[dma_state.subchannel]) { - subchannels[dma_state.subchannel]->current_dirty = - memory_manager.IsMemoryDirty(dma_state.dma_get, copy_size); + subchannels[dma_state.subchannel]->current_dirty = memory_manager.IsMemoryDirty( + dma_state.dma_get, command_list_header.size * sizeof(u32)); } - memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(), copy_size); } - ProcessCommands(command_headers); + Core::Memory::GpuGuestMemory<Tegra::CommandHeader, + Core::Memory::GuestMemoryFlags::UnsafeRead> + headers(memory_manager, dma_state.dma_get, command_list_header.size, &command_headers); + ProcessCommands(headers); } return true; diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp index 7f5a0c29d..bc64d4486 100644 --- a/src/video_core/engines/engine_upload.cpp +++ b/src/video_core/engines/engine_upload.cpp @@ -5,6 +5,7 @@ #include "common/algorithm.h" #include "common/assert.h" +#include "core/memory.h" #include "video_core/engines/engine_upload.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" @@ -46,15 +47,11 @@ void State::ProcessData(const u32* data, size_t num_data) { void State::ProcessData(std::span<const u8> read_buffer) { const GPUVAddr address{regs.dest.Address()}; if (is_linear) { - if (regs.line_count == 1) { - rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer); - } else { - for (size_t line = 0; line < regs.line_count; ++line) { - const GPUVAddr dest_line = address + line * regs.dest.pitch; - std::span<const u8> buffer(read_buffer.data() + line * regs.line_length_in, - regs.line_length_in); - rasterizer->AccelerateInlineToMemory(dest_line, regs.line_length_in, buffer); - } + for (size_t line = 0; line < regs.line_count; ++line) { + const GPUVAddr dest_line = address + line * regs.dest.pitch; + std::span<const u8> buffer(read_buffer.data() + line * regs.line_length_in, + regs.line_length_in); + rasterizer->AccelerateInlineToMemory(dest_line, regs.line_length_in, buffer); } } else { u32 width = regs.dest.width; @@ -70,13 +67,14 @@ void State::ProcessData(std::span<const u8> read_buffer) { const std::size_t dst_size = Tegra::Texture::CalculateSize( true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth, regs.dest.BlockHeight(), regs.dest.BlockDepth()); - tmp_buffer.resize_destructive(dst_size); - memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size); - Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width, - regs.dest.height, regs.dest.depth, x_offset, regs.dest.y, - x_elements, regs.line_count, regs.dest.BlockHeight(), + + Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite> + tmp(memory_manager, address, dst_size, &tmp_buffer); + + Tegra::Texture::SwizzleSubrect(tmp, read_buffer, bytes_per_pixel, width, regs.dest.height, + regs.dest.depth, x_offset, regs.dest.y, x_elements, + regs.line_count, regs.dest.BlockHeight(), regs.dest.BlockDepth(), regs.line_length_in); - memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size); } } diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 601095f03..a38d9528a 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -84,7 +84,6 @@ Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const { Texture::TICEntry tic_entry; memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); - return tic_entry; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 62d70e9f3..c3696096d 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -9,6 +9,7 @@ #include "common/settings.h" #include "core/core.h" #include "core/core_timing.h" +#include "core/memory.h" #include "video_core/dirty_flags.h" #include "video_core/engines/draw_manager.h" #include "video_core/engines/maxwell_3d.h" @@ -679,17 +680,14 @@ void Maxwell3D::ProcessCBData(u32 value) { Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { const GPUVAddr tic_address_gpu{regs.tex_header.Address() + tic_index * sizeof(Texture::TICEntry)}; - Texture::TICEntry tic_entry; memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); - return tic_entry; } Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const { const GPUVAddr tsc_address_gpu{regs.tex_sampler.Address() + tsc_index * sizeof(Texture::TSCEntry)}; - Texture::TSCEntry tsc_entry; memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); return tsc_entry; diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index a290d6ea7..cd8e24b0b 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -7,6 +7,7 @@ #include "common/microprofile.h" #include "common/settings.h" #include "core/core.h" +#include "core/memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_dma.h" #include "video_core/memory_manager.h" @@ -130,11 +131,12 @@ void MaxwellDMA::Launch() { UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { - memory_manager.ReadBlock( - convert_linear_2_blocklinear_addr(regs.offset_in + offset), - read_buffer.data(), read_buffer.size()); - memory_manager.WriteBlockCached(regs.offset_out + offset, read_buffer.data(), - read_buffer.size()); + Core::Memory::GpuGuestMemoryScoped< + u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite> + tmp_write_buffer(memory_manager, + convert_linear_2_blocklinear_addr(regs.offset_in + offset), + 16, &read_buffer); + tmp_write_buffer.SetAddressAndSize(regs.offset_out + offset, 16); } } else if (is_src_pitch && !is_dst_pitch) { UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); @@ -142,20 +144,19 @@ void MaxwellDMA::Launch() { UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { - memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), - read_buffer.size()); - memory_manager.WriteBlockCached( - convert_linear_2_blocklinear_addr(regs.offset_out + offset), - read_buffer.data(), read_buffer.size()); + Core::Memory::GpuGuestMemoryScoped< + u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite> + tmp_write_buffer(memory_manager, regs.offset_in + offset, 16, &read_buffer); + tmp_write_buffer.SetAddressAndSize( + convert_linear_2_blocklinear_addr(regs.offset_out + offset), 16); } } else { if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { - read_buffer.resize_destructive(regs.line_length_in); - memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), - regs.line_length_in, - VideoCommon::CacheType::NoBufferCache); - memory_manager.WriteBlockCached(regs.offset_out, read_buffer.data(), - regs.line_length_in); + Core::Memory::GpuGuestMemoryScoped< + u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite> + tmp_write_buffer(memory_manager, regs.offset_in, regs.line_length_in, + &read_buffer); + tmp_write_buffer.SetAddressAndSize(regs.offset_out, regs.line_length_in); } } } @@ -174,8 +175,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { src_operand.address = regs.offset_in; DMA::BufferOperand dst_operand; - u32 abs_pitch_out = std::abs(static_cast<s32>(regs.pitch_out)); - dst_operand.pitch = abs_pitch_out; + dst_operand.pitch = static_cast<u32>(std::abs(regs.pitch_out)); dst_operand.width = regs.line_length_in; dst_operand.height = regs.line_count; dst_operand.address = regs.offset_out; @@ -222,18 +222,16 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); - const size_t dst_size = static_cast<size_t>(abs_pitch_out) * regs.line_count; - read_buffer.resize_destructive(src_size); - write_buffer.resize_destructive(dst_size); + const size_t dst_size = dst_operand.pitch * regs.line_count; - memory_manager.ReadBlock(src_operand.address, read_buffer.data(), src_size); - memory_manager.ReadBlock(dst_operand.address, write_buffer.data(), dst_size); + Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::SafeRead> tmp_read_buffer( + memory_manager, src_operand.address, src_size, &read_buffer); + Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite> + tmp_write_buffer(memory_manager, dst_operand.address, dst_size, &write_buffer); - UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, - src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, - abs_pitch_out); - - memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); + UnswizzleSubrect(tmp_write_buffer, tmp_read_buffer, bytes_per_pixel, width, height, depth, + x_offset, src_params.origin.y, x_elements, regs.line_count, block_height, + block_depth, dst_operand.pitch); } void MaxwellDMA::CopyPitchToBlockLinear() { @@ -288,18 +286,17 @@ void MaxwellDMA::CopyPitchToBlockLinear() { CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count; - read_buffer.resize_destructive(src_size); - write_buffer.resize_destructive(dst_size); - - memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); - memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); - - // If the input is linear and the output is tiled, swizzle the input and copy it over. - SwizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, - dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth, - regs.pitch_in); - - memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); + GPUVAddr src_addr = regs.offset_in; + GPUVAddr dst_addr = regs.offset_out; + Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::SafeRead> tmp_read_buffer( + memory_manager, src_addr, src_size, &read_buffer); + Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite> + tmp_write_buffer(memory_manager, dst_addr, dst_size, &write_buffer); + + // If the input is linear and the output is tiled, swizzle the input and copy it over. + SwizzleSubrect(tmp_write_buffer, tmp_read_buffer, bytes_per_pixel, width, height, depth, + x_offset, dst_params.origin.y, x_elements, regs.line_count, block_height, + block_depth, regs.pitch_in); } void MaxwellDMA::CopyBlockLinearToBlockLinear() { @@ -343,23 +340,20 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() { const u32 pitch = x_elements * bytes_per_pixel; const size_t mid_buffer_size = pitch * regs.line_count; - read_buffer.resize_destructive(src_size); - write_buffer.resize_destructive(dst_size); - intermediate_buffer.resize_destructive(mid_buffer_size); - memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); - memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); + Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::SafeRead> tmp_read_buffer( + memory_manager, regs.offset_in, src_size, &read_buffer); + Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite> + tmp_write_buffer(memory_manager, regs.offset_out, dst_size, &write_buffer); - UnswizzleSubrect(intermediate_buffer, read_buffer, bytes_per_pixel, src_width, src.height, + UnswizzleSubrect(intermediate_buffer, tmp_read_buffer, bytes_per_pixel, src_width, src.height, src.depth, src_x_offset, src.origin.y, x_elements, regs.line_count, src.block_size.height, src.block_size.depth, pitch); - SwizzleSubrect(write_buffer, intermediate_buffer, bytes_per_pixel, dst_width, dst.height, + SwizzleSubrect(tmp_write_buffer, intermediate_buffer, bytes_per_pixel, dst_width, dst.height, dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count, dst.block_size.height, dst.block_size.depth, pitch); - - memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::ReleaseSemaphore() { diff --git a/src/video_core/engines/sw_blitter/blitter.cpp b/src/video_core/engines/sw_blitter/blitter.cpp index ff88cd03d..3a599f466 100644 --- a/src/video_core/engines/sw_blitter/blitter.cpp +++ b/src/video_core/engines/sw_blitter/blitter.cpp @@ -159,11 +159,11 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, const auto src_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format)); const auto dst_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(dst.format)); const size_t src_size = get_surface_size(src, src_bytes_per_pixel); - impl->tmp_buffer.resize_destructive(src_size); - memory_manager.ReadBlock(src.Address(), impl->tmp_buffer.data(), src_size); - const size_t src_copy_size = src_extent_x * src_extent_y * src_bytes_per_pixel; + Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::SafeRead> tmp_buffer( + memory_manager, src.Address(), src_size, &impl->tmp_buffer); + const size_t src_copy_size = src_extent_x * src_extent_y * src_bytes_per_pixel; const size_t dst_copy_size = dst_extent_x * dst_extent_y * dst_bytes_per_pixel; impl->src_buffer.resize_destructive(src_copy_size); @@ -200,12 +200,11 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, impl->dst_buffer.resize_destructive(dst_copy_size); if (src.linear == Fermi2D::MemoryLayout::BlockLinear) { - UnswizzleSubrect(impl->src_buffer, impl->tmp_buffer, src_bytes_per_pixel, src.width, - src.height, src.depth, config.src_x0, config.src_y0, src_extent_x, - src_extent_y, src.block_height, src.block_depth, - src_extent_x * src_bytes_per_pixel); + UnswizzleSubrect(impl->src_buffer, tmp_buffer, src_bytes_per_pixel, src.width, src.height, + src.depth, config.src_x0, config.src_y0, src_extent_x, src_extent_y, + src.block_height, src.block_depth, src_extent_x * src_bytes_per_pixel); } else { - process_pitch_linear(false, impl->tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y, + process_pitch_linear(false, tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y, src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel); } @@ -221,20 +220,18 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, } const size_t dst_size = get_surface_size(dst, dst_bytes_per_pixel); - impl->tmp_buffer.resize_destructive(dst_size); - memory_manager.ReadBlock(dst.Address(), impl->tmp_buffer.data(), dst_size); + Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadWrite> + tmp_buffer2(memory_manager, dst.Address(), dst_size, &impl->tmp_buffer); if (dst.linear == Fermi2D::MemoryLayout::BlockLinear) { - SwizzleSubrect(impl->tmp_buffer, impl->dst_buffer, dst_bytes_per_pixel, dst.width, - dst.height, dst.depth, config.dst_x0, config.dst_y0, dst_extent_x, - dst_extent_y, dst.block_height, dst.block_depth, - dst_extent_x * dst_bytes_per_pixel); + SwizzleSubrect(tmp_buffer2, impl->dst_buffer, dst_bytes_per_pixel, dst.width, dst.height, + dst.depth, config.dst_x0, config.dst_y0, dst_extent_x, dst_extent_y, + dst.block_height, dst.block_depth, dst_extent_x * dst_bytes_per_pixel); } else { - process_pitch_linear(true, impl->dst_buffer, impl->tmp_buffer, dst_extent_x, dst_extent_y, + process_pitch_linear(true, impl->dst_buffer, tmp_buffer2, dst_extent_x, dst_extent_y, dst.pitch, config.dst_x0, config.dst_y0, static_cast<size_t>(dst_bytes_per_pixel)); } - memory_manager.WriteBlock(dst.Address(), impl->tmp_buffer.data(), dst_size); return true; } diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index 35d699bbf..ab20ff30f 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -69,7 +69,6 @@ public: } void SignalFence(std::function<void()>&& func) { - rasterizer.InvalidateGPUCache(); bool delay_fence = Settings::IsGPULevelHigh(); if constexpr (!can_async_check) { TryReleasePendingFences<false>(); @@ -96,6 +95,7 @@ public: guard.unlock(); cv.notify_all(); } + rasterizer.InvalidateGPUCache(); } void SignalSyncPoint(u32 value) { diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index db385076d..c192e33b2 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -95,7 +95,9 @@ struct GPU::Impl { /// Synchronizes CPU writes with Host GPU memory. void InvalidateGPUCache() { - rasterizer->InvalidateGPUCache(); + std::function<void(VAddr, size_t)> callback_writes( + [this](VAddr address, size_t size) { rasterizer->OnCacheInvalidation(address, size); }); + system.GatherGPUDirtyMemory(callback_writes); } /// Signal the ending of command list. @@ -299,6 +301,10 @@ struct GPU::Impl { gpu_thread.InvalidateRegion(addr, size); } + bool OnCPUWrite(VAddr addr, u64 size) { + return rasterizer->OnCPUWrite(addr, size); + } + /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size) { gpu_thread.FlushAndInvalidateRegion(addr, size); @@ -561,6 +567,10 @@ void GPU::InvalidateRegion(VAddr addr, u64 size) { impl->InvalidateRegion(addr, size); } +bool GPU::OnCPUWrite(VAddr addr, u64 size) { + return impl->OnCPUWrite(addr, size); +} + void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { impl->FlushAndInvalidateRegion(addr, size); } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index e49c40cf2..ba2838b89 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -250,6 +250,10 @@ public: /// Notify rasterizer that any caches of the specified region should be invalidated void InvalidateRegion(VAddr addr, u64 size); + /// Notify rasterizer that CPU is trying to write this area. It returns true if the area is + /// sensible, false otherwise + bool OnCPUWrite(VAddr addr, u64 size); + /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size); diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 889144f38..2f0f9f593 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -47,7 +47,7 @@ static void RunThread(std::stop_token stop_token, Core::System& system, } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) { rasterizer->FlushRegion(flush->addr, flush->size); } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) { - rasterizer->OnCPUWrite(invalidate->addr, invalidate->size); + rasterizer->OnCacheInvalidation(invalidate->addr, invalidate->size); } else { ASSERT(false); } @@ -102,12 +102,12 @@ void ThreadManager::TickGPU() { } void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { - rasterizer->OnCPUWrite(addr, size); + rasterizer->OnCacheInvalidation(addr, size); } void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important - rasterizer->OnCPUWrite(addr, size); + rasterizer->OnCacheInvalidation(addr, size); } u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) { diff --git a/src/video_core/host1x/codecs/codec.cpp b/src/video_core/host1x/codecs/codec.cpp index cd6a3a9b8..da07a556f 100644 --- a/src/video_core/host1x/codecs/codec.cpp +++ b/src/video_core/host1x/codecs/codec.cpp @@ -290,7 +290,7 @@ void Codec::Decode() { return vp9_decoder->GetFrameBytes(); default: ASSERT(false); - return std::vector<u8>{}; + return std::span<const u8>{}; } }(); AVPacketPtr packet{av_packet_alloc(), AVPacketDeleter}; diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index ce827eb6c..862904e39 100644 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -29,15 +29,15 @@ H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {} H264::~H264() = default; -const std::vector<u8>& H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, - bool is_first_frame) { +std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, + bool is_first_frame) { H264DecoderContext context; host1x.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); const s64 frame_number = context.h264_parameter_set.frame_number.Value(); if (!is_first_frame && frame_number != 0) { - frame.resize(context.stream_len); + frame.resize_destructive(context.stream_len); host1x.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); return frame; } @@ -135,14 +135,14 @@ const std::vector<u8>& H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegist for (s32 index = 0; index < 6; index++) { writer.WriteBit(true); std::span<const u8> matrix{context.weight_scale}; - writer.WriteScalingList(matrix, index * 16, 16); + writer.WriteScalingList(scan, matrix, index * 16, 16); } if (context.h264_parameter_set.transform_8x8_mode_flag) { for (s32 index = 0; index < 2; index++) { writer.WriteBit(true); std::span<const u8> matrix{context.weight_scale_8x8}; - writer.WriteScalingList(matrix, index * 64, 64); + writer.WriteScalingList(scan, matrix, index * 64, 64); } } @@ -188,8 +188,8 @@ void H264BitWriter::WriteBit(bool state) { WriteBits(state ? 1 : 0, 1); } -void H264BitWriter::WriteScalingList(std::span<const u8> list, s32 start, s32 count) { - static Common::ScratchBuffer<u8> scan{}; +void H264BitWriter::WriteScalingList(Common::ScratchBuffer<u8>& scan, std::span<const u8> list, + s32 start, s32 count) { scan.resize_destructive(count); if (count == 16) { std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h index 5cc86454e..d6b556322 100644 --- a/src/video_core/host1x/codecs/h264.h +++ b/src/video_core/host1x/codecs/h264.h @@ -5,9 +5,11 @@ #include <span> #include <vector> + #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "common/scratch_buffer.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -37,7 +39,8 @@ public: /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification /// Writes the scaling matrices of the sream - void WriteScalingList(std::span<const u8> list, s32 start, s32 count); + void WriteScalingList(Common::ScratchBuffer<u8>& scan, std::span<const u8> list, s32 start, + s32 count); /// Return the bitstream as a vector. [[nodiscard]] std::vector<u8>& GetByteArray(); @@ -63,11 +66,12 @@ public: ~H264(); /// Compose the H264 frame for FFmpeg decoding - [[nodiscard]] const std::vector<u8>& ComposeFrame( - const Host1x::NvdecCommon::NvdecRegisters& state, bool is_first_frame = false); + [[nodiscard]] std::span<const u8> ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, + bool is_first_frame = false); private: - std::vector<u8> frame; + Common::ScratchBuffer<u8> frame; + Common::ScratchBuffer<u8> scan; Host1x::Host1x& host1x; struct H264ParameterSet { diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp index 28fb12cb8..ee6392ff9 100644 --- a/src/video_core/host1x/codecs/vp8.cpp +++ b/src/video_core/host1x/codecs/vp8.cpp @@ -12,7 +12,7 @@ VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {} VP8::~VP8() = default; -const std::vector<u8>& VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { +std::span<const u8> VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { VP8PictureInfo info; host1x.MemoryManager().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo)); diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h index 5bf07ecab..7926b73f3 100644 --- a/src/video_core/host1x/codecs/vp8.h +++ b/src/video_core/host1x/codecs/vp8.h @@ -4,10 +4,11 @@ #pragma once #include <array> -#include <vector> +#include <span> #include "common/common_funcs.h" #include "common/common_types.h" +#include "common/scratch_buffer.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -24,11 +25,11 @@ public: ~VP8(); /// Compose the VP8 frame for FFmpeg decoding - [[nodiscard]] const std::vector<u8>& ComposeFrame( + [[nodiscard]] std::span<const u8> ComposeFrame( const Host1x::NvdecCommon::NvdecRegisters& state); private: - std::vector<u8> frame; + Common::ScratchBuffer<u8> frame; Host1x::Host1x& host1x; struct VP8PictureInfo { diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp index cf40c9012..306c3d0e8 100644 --- a/src/video_core/host1x/codecs/vp9.cpp +++ b/src/video_core/host1x/codecs/vp9.cpp @@ -3,6 +3,7 @@ #include <algorithm> // for std::copy #include <numeric> + #include "common/assert.h" #include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h index d4083e8d3..f1ed19508 100644 --- a/src/video_core/host1x/codecs/vp9.h +++ b/src/video_core/host1x/codecs/vp9.h @@ -4,9 +4,11 @@ #pragma once #include <array> +#include <span> #include <vector> #include "common/common_types.h" +#include "common/scratch_buffer.h" #include "common/stream.h" #include "video_core/host1x/codecs/vp9_types.h" #include "video_core/host1x/nvdec_common.h" @@ -128,8 +130,8 @@ public: return !current_frame_info.show_frame; } - /// Returns a const reference to the composed frame data. - [[nodiscard]] const std::vector<u8>& GetFrameBytes() const { + /// Returns a const span to the composed frame data. + [[nodiscard]] std::span<const u8> GetFrameBytes() const { return frame; } @@ -181,7 +183,7 @@ private: [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); Host1x::Host1x& host1x; - std::vector<u8> frame; + Common::ScratchBuffer<u8> frame; std::array<s8, 4> loop_filter_ref_deltas{}; std::array<s8, 2> loop_filter_mode_deltas{}; diff --git a/src/video_core/host1x/codecs/vp9_types.h b/src/video_core/host1x/codecs/vp9_types.h index adad8ed7e..cc9b25690 100644 --- a/src/video_core/host1x/codecs/vp9_types.h +++ b/src/video_core/host1x/codecs/vp9_types.h @@ -5,6 +5,7 @@ #include <array> #include <vector> + #include "common/common_funcs.h" #include "common/common_types.h" diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 45141e488..d16040613 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -10,13 +10,13 @@ #include "core/device_memory.h" #include "core/hle/kernel/k_page_table.h" #include "core/hle/kernel/k_process.h" -#include "core/memory.h" #include "video_core/invalidation_accumulator.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" namespace Tegra { +using Core::Memory::GuestMemoryFlags; std::atomic<size_t> MemoryManager::unique_identifier_generator{}; @@ -587,13 +587,10 @@ void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size, void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size, VideoCommon::CacheType which) { - tmp_buffer.resize_destructive(size); - ReadBlock(gpu_src_addr, tmp_buffer.data(), size, which); - - // The output block must be flushed in case it has data modified from the GPU. - // Fixes NPC geometry in Zombie Panic in Wonderland DX + Core::Memory::GpuGuestMemoryScoped<u8, GuestMemoryFlags::SafeReadWrite> data( + *this, gpu_src_addr, size); + data.SetAddressAndSize(gpu_dest_addr, size); FlushRegion(gpu_dest_addr, size, which); - WriteBlock(gpu_dest_addr, tmp_buffer.data(), size, which); } bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const { @@ -758,4 +755,23 @@ void MemoryManager::FlushCaching() { accumulator->Clear(); } +const u8* MemoryManager::GetSpan(const GPUVAddr src_addr, const std::size_t size) const { + auto cpu_addr = GpuToCpuAddress(src_addr); + if (cpu_addr) { + return memory.GetSpan(*cpu_addr, size); + } + return nullptr; +} + +u8* MemoryManager::GetSpan(const GPUVAddr src_addr, const std::size_t size) { + if (!IsContinuousRange(src_addr, size)) { + return nullptr; + } + auto cpu_addr = GpuToCpuAddress(src_addr); + if (cpu_addr) { + return memory.GetSpan(*cpu_addr, size); + } + return nullptr; +} + } // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 4202c26ff..9b311b9e5 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -15,6 +15,7 @@ #include "common/range_map.h" #include "common/scratch_buffer.h" #include "common/virtual_buffer.h" +#include "core/memory.h" #include "video_core/cache_types.h" #include "video_core/pte_kind.h" @@ -62,6 +63,20 @@ public: [[nodiscard]] u8* GetPointer(GPUVAddr addr); [[nodiscard]] const u8* GetPointer(GPUVAddr addr) const; + template <typename T> + [[nodiscard]] T* GetPointer(GPUVAddr addr) { + const auto address{GpuToCpuAddress(addr)}; + if (!address) { + return {}; + } + return memory.GetPointer(*address); + } + + template <typename T> + [[nodiscard]] const T* GetPointer(GPUVAddr addr) const { + return GetPointer<T*>(addr); + } + /** * ReadBlock and WriteBlock are full read and write operations over virtual * GPU Memory. It's important to use these when GPU memory may not be continuous @@ -139,6 +154,9 @@ public: void FlushCaching(); + const u8* GetSpan(const GPUVAddr src_addr, const std::size_t size) const; + u8* GetSpan(const GPUVAddr src_addr, const std::size_t size); + private: template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped> inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped, diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 7566a8c4e..cb8029a4f 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -109,7 +109,9 @@ public: } /// Notify rasterizer that any caches of the specified region are desync with guest - virtual void OnCPUWrite(VAddr addr, u64 size) = 0; + virtual void OnCacheInvalidation(VAddr addr, u64 size) = 0; + + virtual bool OnCPUWrite(VAddr addr, u64 size) = 0; /// Sync memory between guest and host. virtual void InvalidateGPUCache() = 0; diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp index bf2ce4c49..92ecf6682 100644 --- a/src/video_core/renderer_null/null_rasterizer.cpp +++ b/src/video_core/renderer_null/null_rasterizer.cpp @@ -47,7 +47,10 @@ bool RasterizerNull::MustFlushRegion(VAddr addr, u64 size, VideoCommon::CacheTyp return false; } void RasterizerNull::InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} -void RasterizerNull::OnCPUWrite(VAddr addr, u64 size) {} +bool RasterizerNull::OnCPUWrite(VAddr addr, u64 size) { + return false; +} +void RasterizerNull::OnCacheInvalidation(VAddr addr, u64 size) {} VideoCore::RasterizerDownloadArea RasterizerNull::GetFlushArea(VAddr addr, u64 size) { VideoCore::RasterizerDownloadArea new_area{ .start_address = Common::AlignDown(addr, Core::Memory::YUZU_PAGESIZE), diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index a8d35d2c1..93b9a6971 100644 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -53,7 +53,8 @@ public: VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; - void OnCPUWrite(VAddr addr, u64 size) override; + void OnCacheInvalidation(VAddr addr, u64 size) override; + bool OnCPUWrite(VAddr addr, u64 size) override; VideoCore::RasterizerDownloadArea GetFlushArea(VAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(VAddr addr, u64 size) override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index edf527f2d..aadd6967c 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -485,12 +485,33 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache } } -void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { +bool RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + if (addr == 0 || size == 0) { + return false; + } + + { + std::scoped_lock lock{buffer_cache.mutex}; + if (buffer_cache.OnCPUWrite(addr, size)) { + return true; + } + } + + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.WriteMemory(addr, size); + } + + shader_cache.InvalidateRegion(addr, size); + return false; +} + +void RasterizerOpenGL::OnCacheInvalidation(VAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); if (addr == 0 || size == 0) { return; } - shader_cache.OnCPUWrite(addr, size); { std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); @@ -499,15 +520,11 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.CachedWriteMemory(addr, size); } + shader_cache.InvalidateRegion(addr, size); } void RasterizerOpenGL::InvalidateGPUCache() { - MICROPROFILE_SCOPE(OpenGL_CacheManagement); - shader_cache.SyncGuestHost(); - { - std::scoped_lock lock{buffer_cache.mutex}; - buffer_cache.FlushCachedWrites(); - } + gpu.InvalidateGPUCache(); } void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { @@ -519,7 +536,7 @@ void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.WriteMemory(addr, size); } - shader_cache.OnCPUWrite(addr, size); + shader_cache.OnCacheInvalidation(addr, size); } void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index a73ad15c1..8eda2ddba 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -98,7 +98,8 @@ public: VideoCore::RasterizerDownloadArea GetFlushArea(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; - void OnCPUWrite(VAddr addr, u64 size) override; + void OnCacheInvalidation(VAddr addr, u64 size) override; + bool OnCPUWrite(VAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(VAddr addr, u64 size) override; void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index b72f95235..51df18ec3 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -591,7 +591,7 @@ void BufferCacheRuntime::ReserveNullBuffer() { .flags = 0, .size = 4, .usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, @@ -599,7 +599,6 @@ void BufferCacheRuntime::ReserveNullBuffer() { if (device.IsExtTransformFeedbackSupported()) { create_info.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } - create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; null_buffer = memory_allocator.CreateBuffer(create_info, MemoryUsage::DeviceLocal); if (device.HasDebuggingToolAttached()) { null_buffer.SetObjectNameEXT("Null buffer"); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index f7c0d939a..456bb040e 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -566,11 +566,32 @@ void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::s } } -void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { +bool RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { + if (addr == 0 || size == 0) { + return false; + } + + { + std::scoped_lock lock{buffer_cache.mutex}; + if (buffer_cache.OnCPUWrite(addr, size)) { + return true; + } + } + + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.WriteMemory(addr, size); + } + + pipeline_cache.InvalidateRegion(addr, size); + return false; +} + +void RasterizerVulkan::OnCacheInvalidation(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - pipeline_cache.OnCPUWrite(addr, size); + { std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); @@ -579,14 +600,11 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.CachedWriteMemory(addr, size); } + pipeline_cache.InvalidateRegion(addr, size); } void RasterizerVulkan::InvalidateGPUCache() { - pipeline_cache.SyncGuestHost(); - { - std::scoped_lock lock{buffer_cache.mutex}; - buffer_cache.FlushCachedWrites(); - } + gpu.InvalidateGPUCache(); } void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) { @@ -598,7 +616,7 @@ void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) { std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.WriteMemory(addr, size); } - pipeline_cache.OnCPUWrite(addr, size); + pipeline_cache.OnCacheInvalidation(addr, size); } void RasterizerVulkan::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index b39710b3c..73257d964 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -96,7 +96,8 @@ public: void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override; - void OnCPUWrite(VAddr addr, u64 size) override; + void OnCacheInvalidation(VAddr addr, u64 size) override; + bool OnCPUWrite(VAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(VAddr addr, u64 size) override; void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 8385b5509..3aac3cfab 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -36,8 +36,10 @@ using VideoCommon::ImageFlagBits; using VideoCommon::ImageInfo; using VideoCommon::ImageType; using VideoCommon::SubresourceRange; +using VideoCore::Surface::BytesPerBlock; using VideoCore::Surface::IsPixelFormatASTC; using VideoCore::Surface::IsPixelFormatInteger; +using VideoCore::Surface::SurfaceType; namespace { constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { @@ -130,7 +132,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { [[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info) { const PixelFormat format = StorageFormat(info.format); const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); - VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + VkImageCreateFlags flags{}; if (info.type == ImageType::e2D && info.resources.layers >= 6 && info.size.width == info.size.height && !device.HasBrokenCubeImageCompability()) { flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; @@ -163,11 +165,24 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { } [[nodiscard]] vk::Image MakeImage(const Device& device, const MemoryAllocator& allocator, - const ImageInfo& info) { + const ImageInfo& info, std::span<const VkFormat> view_formats) { if (info.type == ImageType::Buffer) { return vk::Image{}; } - return allocator.CreateImage(MakeImageCreateInfo(device, info)); + VkImageCreateInfo image_ci = MakeImageCreateInfo(device, info); + const VkImageFormatListCreateInfo image_format_list = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO, + .pNext = nullptr, + .viewFormatCount = static_cast<u32>(view_formats.size()), + .pViewFormats = view_formats.data(), + }; + if (view_formats.size() > 1) { + image_ci.flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + if (device.IsKhrImageFormatListSupported()) { + image_ci.pNext = &image_format_list; + } + } + return allocator.CreateImage(image_ci); } [[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) { @@ -806,6 +821,23 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched astc_decoder_pass.emplace(device, scheduler, descriptor_pool, staging_buffer_pool, compute_pass_descriptor_queue, memory_allocator); } + if (!device.IsKhrImageFormatListSupported()) { + return; + } + for (size_t index_a = 0; index_a < VideoCore::Surface::MaxPixelFormat; index_a++) { + const auto image_format = static_cast<PixelFormat>(index_a); + if (IsPixelFormatASTC(image_format) && !device.IsOptimalAstcSupported()) { + view_formats[index_a].push_back(VK_FORMAT_A8B8G8R8_UNORM_PACK32); + } + for (size_t index_b = 0; index_b < VideoCore::Surface::MaxPixelFormat; index_b++) { + const auto view_format = static_cast<PixelFormat>(index_b); + if (VideoCore::Surface::IsViewCompatible(image_format, view_format, false, true)) { + const auto view_info = + MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, true, view_format); + view_formats[index_a].push_back(view_info.format); + } + } + } } void TextureCacheRuntime::Finish() { @@ -1265,8 +1297,8 @@ void TextureCacheRuntime::TickFrame() {} Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime_.scheduler}, - runtime{&runtime_}, - original_image(MakeImage(runtime_.device, runtime_.memory_allocator, info)), + runtime{&runtime_}, original_image(MakeImage(runtime_.device, runtime_.memory_allocator, info, + runtime->ViewFormats(info.format))), aspect_mask(ImageAspectMask(info.format)) { if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { if (Settings::values.async_astc.GetValue()) { @@ -1471,7 +1503,8 @@ bool Image::ScaleUp(bool ignore) { auto scaled_info = info; scaled_info.size.width = scaled_width; scaled_info.size.height = scaled_height; - scaled_image = MakeImage(runtime->device, runtime->memory_allocator, scaled_info); + scaled_image = MakeImage(runtime->device, runtime->memory_allocator, scaled_info, + runtime->ViewFormats(info.format)); ignore = false; } current_image = *scaled_image; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 220943116..6621210ea 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -103,6 +103,10 @@ public: [[nodiscard]] VkBuffer GetTemporaryBuffer(size_t needed_size); + std::span<const VkFormat> ViewFormats(PixelFormat format) { + return view_formats[static_cast<std::size_t>(format)]; + } + void BarrierFeedbackLoop(); const Device& device; @@ -113,6 +117,7 @@ public: RenderPassCache& render_pass_cache; std::optional<ASTCDecoderPass> astc_decoder_pass; const Settings::ResolutionScalingInfo& resolution; + std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats; static constexpr size_t indexing_slots = 8 * sizeof(size_t); std::array<vk::Buffer, indexing_slots> buffers{}; diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index 4db948b6d..01701201d 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -24,7 +24,7 @@ void ShaderCache::InvalidateRegion(VAddr addr, size_t size) { RemovePendingShaders(); } -void ShaderCache::OnCPUWrite(VAddr addr, size_t size) { +void ShaderCache::OnCacheInvalidation(VAddr addr, size_t size) { std::scoped_lock lock{invalidation_mutex}; InvalidatePagesInRegion(addr, size); } diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h index f3cc4c70b..de8e08002 100644 --- a/src/video_core/shader_cache.h +++ b/src/video_core/shader_cache.h @@ -62,7 +62,7 @@ public: /// @brief Unmarks a memory region as cached and marks it for removal /// @param addr Start address of the CPU write operation /// @param size Number of bytes of the CPU write operation - void OnCPUWrite(VAddr addr, size_t size); + void OnCacheInvalidation(VAddr addr, size_t size); /// @brief Flushes delayed removal operations void SyncGuestHost(); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 8190f3ba1..4457b366f 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -8,6 +8,7 @@ #include "common/alignment.h" #include "common/settings.h" +#include "core/memory.h" #include "video_core/control/channel_state.h" #include "video_core/dirty_flags.h" #include "video_core/engines/kepler_compute.h" @@ -598,6 +599,10 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { Image& image = slot_images[id]; + if (True(image.flags & ImageFlagBits::CpuModified)) { + continue; + } + image.flags |= ImageFlagBits::CpuModified; if (True(image.flags & ImageFlagBits::Remapped)) { continue; } @@ -865,11 +870,15 @@ void TextureCache<P>::PopAsyncFlushes() { template <class P> ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, bool is_upload) { const ImageInfo dst_info(operand); - const ImageId image_id = FindDMAImage(dst_info, operand.address); - if (!image_id) { + const ImageId dst_id = FindDMAImage(dst_info, operand.address); + if (!dst_id) { + return NULL_IMAGE_ID; + } + auto& image = slot_images[dst_id]; + if (False(image.flags & ImageFlagBits::GpuModified)) { + // No need to waste time on an image that's synced with guest return NULL_IMAGE_ID; } - auto& image = slot_images[image_id]; if (image.info.type == ImageType::e3D) { // Don't accelerate 3D images. return NULL_IMAGE_ID; @@ -883,7 +892,7 @@ ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, boo if (!base) { return NULL_IMAGE_ID; } - return image_id; + return dst_id; } template <class P> @@ -1018,19 +1027,19 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) runtime.AccelerateImageUpload(image, staging, uploads); return; } - const size_t guest_size_bytes = image.guest_size_bytes; - swizzle_data_buffer.resize_destructive(guest_size_bytes); - gpu_memory->ReadBlockUnsafe(gpu_addr, swizzle_data_buffer.data(), guest_size_bytes); + + Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::UnsafeRead> swizzle_data( + *gpu_memory, gpu_addr, image.guest_size_bytes, &swizzle_data_buffer); if (True(image.flags & ImageFlagBits::Converted)) { unswizzle_data_buffer.resize_destructive(image.unswizzled_size_bytes); - auto copies = UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data_buffer, - unswizzle_data_buffer); + auto copies = + UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data, unswizzle_data_buffer); ConvertImage(unswizzle_data_buffer, image.info, mapped_span, copies); image.UploadMemory(staging, copies); } else { const auto copies = - UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data_buffer, mapped_span); + UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data, mapped_span); image.UploadMemory(staging, copies); } } @@ -1223,11 +1232,12 @@ void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) { decode->image_id = image_id; async_decodes.push_back(std::move(decode)); - Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes); - const size_t guest_size_bytes = image.guest_size_bytes; - swizzle_data_buffer.resize_destructive(guest_size_bytes); - gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes); - auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer, + static Common::ScratchBuffer<u8> local_unswizzle_data_buffer; + local_unswizzle_data_buffer.resize_destructive(image.unswizzled_size_bytes); + Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::UnsafeRead> swizzle_data( + *gpu_memory, image.gpu_addr, image.guest_size_bytes, &swizzle_data_buffer); + + auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data, local_unswizzle_data_buffer); const size_t out_size = MapSizeBytes(image); diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h index a0e10643f..0453456b4 100644 --- a/src/video_core/texture_cache/types.h +++ b/src/video_core/texture_cache/types.h @@ -54,7 +54,6 @@ enum class RelaxedOptions : u32 { Format = 1 << 1, Samples = 1 << 2, ForceBrokenViews = 1 << 3, - FormatBpp = 1 << 4, }; DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions) diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 9a618a57a..a83f5d41c 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -20,6 +20,7 @@ #include "common/div_ceil.h" #include "common/scratch_buffer.h" #include "common/settings.h" +#include "core/memory.h" #include "video_core/compatible_formats.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" @@ -544,17 +545,15 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr tile_size.height, info.tile_width_spacing); const size_t subresource_size = sizes[level]; - tmp_buffer.resize_destructive(subresource_size); - const std::span<u8> dst(tmp_buffer); - for (s32 layer = 0; layer < info.resources.layers; ++layer) { const std::span<const u8> src = input.subspan(host_offset); - gpu_memory.ReadBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes()); - - SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height, - num_tiles.depth, block.height, block.depth); + { + Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::UnsafeReadWrite> + dst(gpu_memory, gpu_addr + guest_offset, subresource_size, &tmp_buffer); - gpu_memory.WriteBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes()); + SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height, + num_tiles.depth, block.height, block.depth); + } host_offset += host_bytes_per_layer; guest_offset += layer_stride; @@ -837,6 +836,7 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory const Extent3D size = info.size; if (info.type == ImageType::Linear) { + ASSERT(output.size_bytes() >= guest_size_bytes); gpu_memory.ReadBlockUnsafe(gpu_addr, output.data(), guest_size_bytes); ASSERT((info.pitch >> bpp_log2) << bpp_log2 == info.pitch); @@ -904,16 +904,6 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory return copies; } -BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, - const ImageBase& image, std::span<u8> output) { - gpu_memory.ReadBlockUnsafe(gpu_addr, output.data(), image.guest_size_bytes); - return BufferCopy{ - .src_offset = 0, - .dst_offset = 0, - .size = image.guest_size_bytes, - }; -} - void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output, std::span<BufferImageCopy> copies) { u32 output_offset = 0; @@ -1201,8 +1191,7 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const // Format checking is relaxed, but we still have to check for matching bytes per block. // This avoids creating a view for blits on UE4 titles where formats with different bytes // per block are aliased. - if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format) && - False(options & RelaxedOptions::FormatBpp)) { + if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) { return std::nullopt; } } else { @@ -1233,11 +1222,7 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const } const bool strict_size = False(options & RelaxedOptions::Size); if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) { - if (False(options & RelaxedOptions::FormatBpp)) { - return std::nullopt; - } else if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) { - return std::nullopt; - } + return std::nullopt; } // TODO: compare block sizes return base; diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index ab45a43c4..5a0649d24 100644 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -66,9 +66,6 @@ struct OverlapResult { Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, std::span<const u8> input, std::span<u8> output); -[[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, - const ImageBase& image, std::span<u8> output); - void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output, std::span<BufferImageCopy> copies); diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 421e71e5a..e04852e01 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -485,7 +485,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME); } } - if (extensions.extended_dynamic_state2 && (is_radv || is_qualcomm)) { + if (extensions.extended_dynamic_state2 && is_radv) { const u32 version = (properties.properties.driverVersion << 3) >> 3; if (version < VK_MAKE_API_VERSION(0, 22, 3, 1)) { LOG_WARNING( @@ -498,6 +498,20 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); } } + if (extensions.extended_dynamic_state2 && is_qualcomm) { + const u32 version = (properties.properties.driverVersion << 3) >> 3; + if (version >= VK_MAKE_API_VERSION(0, 0, 676, 0) && + version < VK_MAKE_API_VERSION(0, 0, 680, 0)) { + // Qualcomm Adreno 7xx drivers do not properly support extended_dynamic_state2. + LOG_WARNING(Render_Vulkan, + "Qualcomm Adreno 7xx drivers have broken VK_EXT_extended_dynamic_state2"); + features.extended_dynamic_state2.extendedDynamicState2 = false; + features.extended_dynamic_state2.extendedDynamicState2LogicOp = false; + features.extended_dynamic_state2.extendedDynamicState2PatchControlPoints = false; + extensions.extended_dynamic_state2 = false; + loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); + } + } if (extensions.extended_dynamic_state3 && is_radv) { LOG_WARNING(Render_Vulkan, "RADV has broken extendedDynamicState3ColorBlendEquation"); features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false; @@ -512,8 +526,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR dynamic_state3_enables = false; } } - if (extensions.vertex_input_dynamic_state && (is_radv || is_qualcomm)) { - // Qualcomm S8gen2 drivers do not properly support vertex_input_dynamic_state. + if (extensions.vertex_input_dynamic_state && is_radv) { // TODO(ameerj): Blacklist only offending driver versions // TODO(ameerj): Confirm if RDNA1 is affected const bool is_rdna2 = @@ -526,6 +539,19 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR loaded_extensions.erase(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); } } + if (extensions.vertex_input_dynamic_state && is_qualcomm) { + const u32 version = (properties.properties.driverVersion << 3) >> 3; + if (version >= VK_MAKE_API_VERSION(0, 0, 676, 0) && + version < VK_MAKE_API_VERSION(0, 0, 680, 0)) { + // Qualcomm Adreno 7xx drivers do not properly support vertex_input_dynamic_state. + LOG_WARNING( + Render_Vulkan, + "Qualcomm Adreno 7xx drivers have broken VK_EXT_vertex_input_dynamic_state"); + features.vertex_input_dynamic_state.vertexInputDynamicState = false; + extensions.vertex_input_dynamic_state = false; + loaded_extensions.erase(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); + } + } sets_per_pool = 64; if (extensions.extended_dynamic_state3 && is_amd_driver && @@ -774,6 +800,17 @@ bool Device::ShouldBoostClocks() const { return validated_driver && !is_steam_deck && !is_debugging; } +bool Device::HasTimelineSemaphore() const { + if (GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY || + GetDriverID() == VK_DRIVER_ID_MESA_TURNIP) { + // Timeline semaphores do not work properly on all Qualcomm drivers. + // They generally work properly with Turnip drivers, but are problematic on some devices + // (e.g. ZTE handsets with Snapdragon 870). + return false; + } + return features.timeline_semaphore.timelineSemaphore; +} + bool Device::GetSuitability(bool requires_swapchain) { // Assume we will be suitable. bool suitable = true; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 1f17265d5..be3ed45ff 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -77,6 +77,7 @@ VK_DEFINE_HANDLE(VmaAllocator) EXTENSION(KHR, SPIRV_1_4, spirv_1_4) \ EXTENSION(KHR, SWAPCHAIN, swapchain) \ EXTENSION(KHR, SWAPCHAIN_MUTABLE_FORMAT, swapchain_mutable_format) \ + EXTENSION(KHR, IMAGE_FORMAT_LIST, image_format_list) \ EXTENSION(NV, DEVICE_DIAGNOSTICS_CONFIG, device_diagnostics_config) \ EXTENSION(NV, GEOMETRY_SHADER_PASSTHROUGH, geometry_shader_passthrough) \ EXTENSION(NV, VIEWPORT_ARRAY2, viewport_array2) \ @@ -408,6 +409,11 @@ public: return extensions.workgroup_memory_explicit_layout; } + /// Returns true if the device supports VK_KHR_image_format_list. + bool IsKhrImageFormatListSupported() const { + return extensions.image_format_list || instance_version >= VK_API_VERSION_1_2; + } + /// Returns true if the device supports VK_EXT_primitive_topology_list_restart. bool IsTopologyListPrimitiveRestartSupported() const { return features.primitive_topology_list_restart.primitiveTopologyListRestart; @@ -522,13 +528,7 @@ public: return extensions.shader_atomic_int64; } - bool HasTimelineSemaphore() const { - if (GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY) { - // Timeline semaphores do not work properly on all Qualcomm drivers. - return false; - } - return features.timeline_semaphore.timelineSemaphore; - } + bool HasTimelineSemaphore() const; /// Returns the minimum supported version of SPIR-V. u32 SupportedSpirvVersion() const { diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp index 7624a9b32..6a294c1da 100644 --- a/src/video_core/vulkan_common/vulkan_instance.cpp +++ b/src/video_core/vulkan_common/vulkan_instance.cpp @@ -19,11 +19,9 @@ #include <windows.h> // ensure include order #include <vulkan/vulkan_win32.h> -#elif defined(__APPLE__) -#include <vulkan/vulkan_macos.h> #elif defined(__ANDROID__) #include <vulkan/vulkan_android.h> -#else +#elif !defined(__APPLE__) #include <X11/Xlib.h> #include <vulkan/vulkan_wayland.h> #include <vulkan/vulkan_xlib.h> @@ -68,7 +66,7 @@ namespace { break; #elif defined(__APPLE__) case Core::Frontend::WindowSystemType::Cocoa: - extensions.push_back(VK_MVK_MACOS_SURFACE_EXTENSION_NAME); + extensions.push_back(VK_EXT_METAL_SURFACE_EXTENSION_NAME); break; #elif defined(__ANDROID__) case Core::Frontend::WindowSystemType::Android: diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index a2ef0efa4..42f3ee0b4 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -221,8 +221,8 @@ vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const { const VmaAllocationCreateInfo alloc_ci = { .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT, .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, - .requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, - .preferredFlags = 0, + .requiredFlags = 0, + .preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, .memoryTypeBits = 0, .pool = VK_NULL_HANDLE, .pUserData = nullptr, diff --git a/src/video_core/vulkan_common/vulkan_surface.cpp b/src/video_core/vulkan_common/vulkan_surface.cpp index c34599365..cfea4cd7b 100644 --- a/src/video_core/vulkan_common/vulkan_surface.cpp +++ b/src/video_core/vulkan_common/vulkan_surface.cpp @@ -11,11 +11,9 @@ #include <windows.h> // ensure include order #include <vulkan/vulkan_win32.h> -#elif defined(__APPLE__) -#include <vulkan/vulkan_macos.h> #elif defined(__ANDROID__) #include <vulkan/vulkan_android.h> -#else +#elif !defined(__APPLE__) #include <X11/Xlib.h> #include <vulkan/vulkan_wayland.h> #include <vulkan/vulkan_xlib.h> @@ -44,12 +42,13 @@ vk::SurfaceKHR CreateSurface( } #elif defined(__APPLE__) if (window_info.type == Core::Frontend::WindowSystemType::Cocoa) { - const VkMacOSSurfaceCreateInfoMVK mvk_ci{VK_STRUCTURE_TYPE_MACOS_SURFACE_CREATE_INFO_MVK, - nullptr, 0, window_info.render_surface}; - const auto vkCreateMacOSSurfaceMVK = reinterpret_cast<PFN_vkCreateMacOSSurfaceMVK>( - dld.vkGetInstanceProcAddr(*instance, "vkCreateMacOSSurfaceMVK")); - if (!vkCreateMacOSSurfaceMVK || - vkCreateMacOSSurfaceMVK(*instance, &mvk_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { + const VkMetalSurfaceCreateInfoEXT macos_ci = { + .pLayer = static_cast<const CAMetalLayer*>(window_info.render_surface), + }; + const auto vkCreateMetalSurfaceEXT = reinterpret_cast<PFN_vkCreateMetalSurfaceEXT>( + dld.vkGetInstanceProcAddr(*instance, "vkCreateMetalSurfaceEXT")); + if (!vkCreateMetalSurfaceEXT || + vkCreateMetalSurfaceEXT(*instance, &macos_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { LOG_ERROR(Render_Vulkan, "Failed to initialize Metal surface"); throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index b5e70fcd4..32bd75ad8 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -15,6 +15,8 @@ #define VK_NO_PROTOTYPES #ifdef _WIN32 #define VK_USE_PLATFORM_WIN32_KHR +#elif defined(__APPLE__) +#define VK_USE_PLATFORM_METAL_EXT #endif #include <vulkan/vulkan.h> |
