From 82c2601555b59a94d7160f2fd686cb63d32dd423 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 16 Jan 2021 20:48:58 -0300 Subject: video_core: Reimplement the buffer cache Reimplement the buffer cache using cached bindings and page level granularity for modification tracking. This also drops the usage of shared pointers and virtual functions from the cache. - Bindings are cached, allowing to skip work when the game changes few bits between draws. - OpenGL Assembly shaders no longer copy when a region has been modified from the GPU to emulate constant buffers, instead GL_EXT_memory_object is used to alias sub-buffers within the same allocation. - OpenGL Assembly shaders stream constant buffer data using glProgramBufferParametersIuivNV, from NV_parameter_buffer_object. In theory this should save one hash table resolve inside the driver compared to glBufferSubData. - A new OpenGL stream buffer is implemented based on fences for drivers that are not Nvidia's proprietary, due to their low performance on partial glBufferSubData calls synchronized with 3D rendering (that some games use a lot). - Most optimizations are shared between APIs now, allowing Vulkan to cache more bindings than before, skipping unnecesarry work. This commit adds the necessary infrastructure to use Vulkan object from OpenGL. Overall, it improves performance and fixes some bugs present on the old cache. There are still some edge cases hit by some games that harm performance on some vendors, this are planned to be fixed in later commits. --- src/video_core/texture_cache/texture_cache.h | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'src/video_core/texture_cache') diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d1080300f..f336b705f 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -103,9 +103,6 @@ public: /// Notify the cache that a new frame has been queued void TickFrame(); - /// Return an unique mutually exclusive lock for the cache - [[nodiscard]] std::unique_lock AcquireLock(); - /// Return a constant reference to the given image view id [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept; @@ -179,6 +176,8 @@ public: /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + std::mutex mutex; + private: /// Iterate over all page indices in a range template @@ -212,8 +211,8 @@ private: void RefreshContents(Image& image); /// Upload data from guest to an image - template - void UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset); + template + void UploadImageContents(Image& image, StagingBuffer& staging_buffer, size_t buffer_offset); /// Find or create an image view from a guest descriptor [[nodiscard]] ImageViewId FindImageView(const TICEntry& config); @@ -325,8 +324,6 @@ private: RenderTargets render_targets; - std::mutex mutex; - std::unordered_map image_views; std::unordered_map samplers; std::unordered_map framebuffers; @@ -385,11 +382,6 @@ void TextureCache

::TickFrame() { ++frame_tick; } -template -std::unique_lock TextureCache

::AcquireLock() { - return std::unique_lock{mutex}; -} - template const typename P::ImageView& TextureCache

::GetImageView(ImageViewId id) const noexcept { return slot_image_views[id]; @@ -598,11 +590,11 @@ void TextureCache

::DownloadMemory(VAddr cpu_addr, size_t size) { }); for (const ImageId image_id : images) { Image& image = slot_images[image_id]; - auto map = runtime.MapDownloadBuffer(image.unswizzled_size_bytes); + auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes); const auto copies = FullDownloadCopies(image.info); image.DownloadMemory(map, 0, copies); runtime.Finish(); - SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.Span()); + SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span); } } @@ -757,7 +749,7 @@ void TextureCache

::PopAsyncFlushes() { for (const ImageId image_id : download_ids) { total_size_bytes += slot_images[image_id].unswizzled_size_bytes; } - auto download_map = runtime.MapDownloadBuffer(total_size_bytes); + auto download_map = runtime.DownloadStagingBuffer(total_size_bytes); size_t buffer_offset = 0; for (const ImageId image_id : download_ids) { Image& image = slot_images[image_id]; @@ -769,7 +761,7 @@ void TextureCache

::PopAsyncFlushes() { runtime.Finish(); buffer_offset = 0; - const std::span download_span = download_map.Span(); + const std::span download_span = download_map.mapped_span; for (const ImageId image_id : download_ids) { const ImageBase& image = slot_images[image_id]; const auto copies = FullDownloadCopies(image.info); @@ -806,7 +798,7 @@ void TextureCache

::RefreshContents(Image& image) { LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); return; } - auto map = runtime.MapUploadBuffer(MapSizeBytes(image)); + auto map = runtime.UploadStagingBuffer(MapSizeBytes(image)); UploadImageContents(image, map, 0); runtime.InsertUploadMemoryBarrier(); } @@ -814,7 +806,7 @@ void TextureCache

::RefreshContents(Image& image) { template template void TextureCache

::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) { - const std::span mapped_span = map.Span().subspan(buffer_offset); + const std::span mapped_span = map.mapped_span.subspan(buffer_offset); const GPUVAddr gpu_addr = image.gpu_addr; if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { -- cgit v1.2.3 From 35df1d1864ba721ea7b1cebf9a106dd771cde4f5 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 16 Jan 2021 16:20:18 -0300 Subject: vk_staging_buffer_pool: Add stream buffer for small uploads This uses a ring buffer similar to OpenGL's stream buffer for small uploads. This stops us from allocating several small buffers, reducing memory fragmentation and cache locality. It uses dedicated allocations when possible. --- src/video_core/buffer_cache/buffer_cache.h | 28 ++-- .../renderer_opengl/gl_texture_cache.cpp | 21 ++- src/video_core/renderer_opengl/gl_texture_cache.h | 11 +- src/video_core/renderer_opengl/util_shaders.cpp | 18 +-- src/video_core/renderer_opengl/util_shaders.h | 6 +- src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 23 ++-- src/video_core/renderer_vulkan/vk_compute_pass.cpp | 61 ++++----- src/video_core/renderer_vulkan/vk_compute_pass.h | 9 +- .../renderer_vulkan/vk_staging_buffer_pool.cpp | 142 ++++++++++++++++++++- .../renderer_vulkan/vk_staging_buffer_pool.h | 20 +++ .../renderer_vulkan/vk_texture_cache.cpp | 14 +- src/video_core/renderer_vulkan/vk_texture_cache.h | 9 +- src/video_core/texture_cache/texture_cache.h | 38 +++--- src/video_core/vulkan_common/vulkan_wrapper.cpp | 20 ++- src/video_core/vulkan_common/vulkan_wrapper.h | 5 +- 15 files changed, 298 insertions(+), 127 deletions(-) (limited to 'src/video_core/texture_cache') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index e4f3c8e35..d6399bf24 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -239,8 +239,7 @@ private: void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, std::span copies); - void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, - std::span copies); + void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span copies); void DeleteBuffer(BufferId buffer_id); @@ -362,11 +361,17 @@ void BufferCache

::DownloadMemory(VAddr cpu_addr, u64 size) { auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); const u8* const mapped_memory = download_staging.mapped_span.data(); const std::span copies_span(copies.data(), copies.data() + copies.size()); + for (BufferCopy& copy : copies) { + // Modify copies to have the staging offset in mind + copy.dst_offset += download_staging.offset; + } runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); runtime.Finish(); for (const BufferCopy& copy : copies) { const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - const u8* copy_mapped_memory = mapped_memory + copy.dst_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* copy_mapped_memory = mapped_memory + dst_offset; cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); } } else { @@ -554,7 +559,9 @@ void BufferCache

::PopAsyncFlushes() { } if constexpr (USE_MEMORY_MAPS) { auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - for (const auto [copy, buffer_id] : downloads) { + for (auto& [copy, buffer_id] : downloads) { + // Have in mind the staging buffer offset for the copy + copy.dst_offset += download_staging.offset; const std::array copies{copy}; runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); } @@ -562,7 +569,9 @@ void BufferCache

::PopAsyncFlushes() { for (const auto [copy, buffer_id] : downloads) { const Buffer& buffer = slot_buffers[buffer_id]; const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); } } else { @@ -1117,13 +1126,16 @@ void BufferCache

::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, template void BufferCache

::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, - std::span copies) { + std::span copies) { auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); const std::span staging_pointer = upload_staging.mapped_span; - for (const BufferCopy& copy : copies) { - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + for (BufferCopy& copy : copies) { u8* const src_pointer = staging_pointer.data() + copy.src_offset; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); + + // Apply the staging offset + copy.src_offset += upload_staging.offset; } runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 37572ab28..31eb54123 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -550,15 +550,14 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, } void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, - size_t buffer_offset, std::span swizzles) { switch (image.info.type) { case ImageType::e2D: - return util_shaders.BlockLinearUpload2D(image, map, buffer_offset, swizzles); + return util_shaders.BlockLinearUpload2D(image, map, swizzles); case ImageType::e3D: - return util_shaders.BlockLinearUpload3D(image, map, buffer_offset, swizzles); + return util_shaders.BlockLinearUpload3D(image, map, swizzles); case ImageType::Linear: - return util_shaders.PitchUpload(image, map, buffer_offset, swizzles); + return util_shaders.PitchUpload(image, map, swizzles); default: UNREACHABLE(); break; @@ -710,10 +709,10 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, } } -void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, +void Image::UploadMemory(const ImageBufferMap& map, std::span copies) { glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); - glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); + glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); @@ -729,19 +728,19 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, current_image_height = copy.buffer_image_height; glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); } - CopyBufferToImage(copy, buffer_offset); + CopyBufferToImage(copy, map.offset); } } -void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, +void Image::UploadMemory(const ImageBufferMap& map, std::span copies) { for (const VideoCommon::BufferCopy& copy : copies) { - glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + buffer_offset, + glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + map.offset, copy.dst_offset, copy.size); } } -void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset, +void Image::DownloadMemory(ImageBufferMap& map, std::span copies) { glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API @@ -760,7 +759,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset, current_image_height = copy.buffer_image_height; glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); } - CopyImageToBuffer(copy, buffer_offset); + CopyImageToBuffer(copy, map.offset); } } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 60d08d6d6..874cf54f4 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -35,6 +35,7 @@ struct ImageBufferMap { ~ImageBufferMap(); std::span mapped_span; + size_t offset = 0; OGLSync* sync; GLuint buffer; }; @@ -78,7 +79,7 @@ public: Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); - void AccelerateImageUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, + void AccelerateImageUpload(Image& image, const ImageBufferMap& map, std::span swizzles); void InsertUploadMemoryBarrier(); @@ -137,14 +138,12 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); - void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + void UploadMemory(const ImageBufferMap& map, std::span copies); - void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, - std::span copies); + void UploadMemory(const ImageBufferMap& map, std::span copies); - void DownloadMemory(ImageBufferMap& map, size_t buffer_offset, - std::span copies); + void DownloadMemory(ImageBufferMap& map, std::span copies); GLuint Handle() const noexcept { return texture.handle; diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index aeb36551c..1b58e8617 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -63,7 +63,7 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) UtilShaders::~UtilShaders() = default; -void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset, +void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, std::span swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; @@ -71,13 +71,13 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle); - glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); for (const SwizzleParameters& swizzle : swizzles) { const Extent3D num_tiles = swizzle.num_tiles; - const size_t input_offset = swizzle.buffer_offset + buffer_offset; + const size_t input_offset = swizzle.buffer_offset + map.offset; const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); @@ -100,7 +100,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s program_manager.RestoreGuestCompute(); } -void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset, +void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, std::span swizzles) { static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8}; @@ -108,14 +108,14 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s static constexpr GLuint BINDING_INPUT_BUFFER = 1; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; - glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); for (const SwizzleParameters& swizzle : swizzles) { const Extent3D num_tiles = swizzle.num_tiles; - const size_t input_offset = swizzle.buffer_offset + buffer_offset; + const size_t input_offset = swizzle.buffer_offset + map.offset; const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); @@ -141,7 +141,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s program_manager.RestoreGuestCompute(); } -void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, +void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, std::span swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr GLuint BINDING_INPUT_BUFFER = 0; @@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu "Non-power of two images are not implemented"); program_manager.BindHostCompute(pitch_unswizzle_program.handle); - glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(LOC_ORIGIN, 0, 0); glUniform2i(LOC_DESTINATION, 0, 0); glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block); @@ -167,7 +167,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, format); for (const SwizzleParameters& swizzle : swizzles) { const Extent3D num_tiles = swizzle.num_tiles; - const size_t input_offset = swizzle.buffer_offset + buffer_offset; + const size_t input_offset = swizzle.buffer_offset + map.offset; const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index bec026bc3..7b1d16b09 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -24,13 +24,13 @@ public: explicit UtilShaders(ProgramManager& program_manager); ~UtilShaders(); - void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset, + void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, std::span swizzles); - void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset, + void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, std::span swizzles); - void PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, + void PitchUpload(Image& image, const ImageBufferMap& map, std::span swizzles); void CopyBC4(Image& dst_image, Image& src_image, diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 48fc5d966..4f1e4ec28 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -138,17 +138,18 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, u32 base_vertex, u32 num_indices, VkBuffer buffer, u32 offset, [[maybe_unused]] u32 size) { - VkIndexType index_type = MaxwellToVK::IndexFormat(index_format); + VkIndexType vk_index_type = MaxwellToVK::IndexFormat(index_format); + VkDeviceSize vk_offset = offset; if (topology == PrimitiveTopology::Quads) { - index_type = VK_INDEX_TYPE_UINT32; - std::tie(buffer, offset) = + vk_index_type = VK_INDEX_TYPE_UINT32; + std::tie(buffer, vk_offset) = quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset); - } else if (index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) { - index_type = VK_INDEX_TYPE_UINT16; - std::tie(buffer, offset) = uint8_pass.Assemble(num_indices, buffer, offset); + } else if (vk_index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) { + vk_index_type = VK_INDEX_TYPE_UINT16; + std::tie(buffer, vk_offset) = uint8_pass.Assemble(num_indices, buffer, offset); } - scheduler.Record([buffer, offset, index_type](vk::CommandBuffer cmdbuf) { - cmdbuf.BindIndexBuffer(buffer, offset, index_type); + scheduler.Record([buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(buffer, vk_offset, vk_index_type); }); } @@ -251,10 +252,10 @@ void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle } } scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer = staging.buffer, dst_buffer = *quad_array_lut, - size_bytes](vk::CommandBuffer cmdbuf) { + scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset, + dst_buffer = *quad_array_lut, size_bytes](vk::CommandBuffer cmdbuf) { const VkBufferCopy copy{ - .srcOffset = 0, + .srcOffset = src_offset, .dstOffset = 0, .size = size_bytes, }; diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index a4fdcdf81..2f9a7b028 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -10,6 +10,7 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" +#include "common/div_ceil.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" @@ -148,38 +149,33 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_, Uint8Pass::~Uint8Pass() = default; -std::pair Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, - u32 src_offset) { +std::pair Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, + u32 src_offset) { const u32 staging_size = static_cast(num_vertices * sizeof(u16)); const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); - update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size); + update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set, num_vertices](vk::CommandBuffer cmdbuf) { - constexpr u32 dispatch_size = 1024; + static constexpr u32 DISPATCH_SIZE = 1024; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + }; cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); - cmdbuf.Dispatch(Common::AlignUp(num_vertices, dispatch_size) / dispatch_size, 1, 1); - - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = 0; - barrier.size = static_cast(num_vertices * sizeof(u16)); + cmdbuf.Dispatch(Common::DivCeil(num_vertices, DISPATCH_SIZE), 1, 1); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER); }); - return {staging.buffer, 0}; + return {staging.buffer, staging.offset}; } QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, @@ -194,7 +190,7 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, QuadIndexedPass::~QuadIndexedPass() = default; -std::pair QuadIndexedPass::Assemble( +std::pair QuadIndexedPass::Assemble( Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, u32 src_offset) { const u32 index_shift = [index_format] { @@ -217,34 +213,29 @@ std::pair QuadIndexedPass::Assemble( update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); - update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size); + update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set, num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) { - static constexpr u32 dispatch_size = 1024; + static constexpr u32 DISPATCH_SIZE = 1024; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + }; const std::array push_constants = {base_vertex, index_shift}; cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants); - cmdbuf.Dispatch(Common::AlignUp(num_tri_vertices, dispatch_size) / dispatch_size, 1, 1); - - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = 0; - barrier.size = static_cast(num_tri_vertices * sizeof(u32)); + cmdbuf.Dispatch(Common::DivCeil(num_tri_vertices, DISPATCH_SIZE), 1, 1); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER); }); - return {staging.buffer, 0}; + return {staging.buffer, staging.offset}; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 4904019f5..17d781d99 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -50,7 +50,8 @@ public: /// Assemble uint8 indices into an uint16 index buffer /// Returns a pair with the staging buffer, and the offset where the assembled data is - std::pair Assemble(u32 num_vertices, VkBuffer src_buffer, u32 src_offset); + std::pair Assemble(u32 num_vertices, VkBuffer src_buffer, + u32 src_offset); private: VKScheduler& scheduler; @@ -66,9 +67,9 @@ public: VKUpdateDescriptorQueue& update_descriptor_queue_); ~QuadIndexedPass(); - std::pair Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, - u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, - u32 src_offset); + std::pair Assemble( + Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, + u32 base_vertex, VkBuffer src_buffer, u32 src_offset); private: VKScheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 97fd41cc1..275d740b8 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -8,6 +8,7 @@ #include +#include "common/alignment.h" #include "common/assert.h" #include "common/bit_util.h" #include "common/common_types.h" @@ -17,14 +18,117 @@ #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { +namespace { +// Maximum potential alignment of a Vulkan buffer +constexpr VkDeviceSize MAX_ALIGNMENT = 256; +// Maximum size to put elements in the stream buffer +constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024; +// Stream buffer size in bytes +constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024; +constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; + +constexpr VkMemoryPropertyFlags HOST_FLAGS = + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; +constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS; + +bool IsStreamHeap(VkMemoryHeap heap) noexcept { + return STREAM_BUFFER_SIZE < (heap.size * 2) / 3; +} + +std::optional FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask, + VkMemoryPropertyFlags flags) noexcept { + for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) { + if (((type_mask >> type_index) & 1) == 0) { + // Memory type is incompatible + continue; + } + const VkMemoryType& memory_type = props.memoryTypes[type_index]; + if ((memory_type.propertyFlags & flags) != flags) { + // Memory type doesn't have the flags we want + continue; + } + if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) { + // Memory heap is not suitable for streaming + continue; + } + // Success! + return type_index; + } + return std::nullopt; +} + +u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask) { + // Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this + std::optional type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS); + if (type) { + return *type; + } + // Otherwise try without the DEVICE_LOCAL_BIT + type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS); + if (type) { + return *type; + } + // This should never happen, and in case it does, signal it as an out of memory situation + throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY); +} + +size_t Region(size_t iterator) noexcept { + return iterator / REGION_SIZE; +} +} // Anonymous namespace StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, VKScheduler& scheduler_) - : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {} + : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { + const vk::Device& dev = device.GetLogical(); + stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = STREAM_BUFFER_SIZE, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + if (device.HasDebuggingToolAttached()) { + stream_buffer.SetObjectNameEXT("Stream Buffer"); + } + VkMemoryDedicatedRequirements dedicated_reqs{ + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS, + .pNext = nullptr, + .prefersDedicatedAllocation = VK_FALSE, + .requiresDedicatedAllocation = VK_FALSE, + }; + const auto requirements = dev.GetBufferMemoryRequirements(*stream_buffer, &dedicated_reqs); + const bool make_dedicated = dedicated_reqs.prefersDedicatedAllocation == VK_TRUE || + dedicated_reqs.requiresDedicatedAllocation == VK_TRUE; + const VkMemoryDedicatedAllocateInfo dedicated_info{ + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, + .pNext = nullptr, + .image = nullptr, + .buffer = *stream_buffer, + }; + const auto memory_properties = device.GetPhysical().GetMemoryProperties(); + stream_memory = dev.AllocateMemory(VkMemoryAllocateInfo{ + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = make_dedicated ? &dedicated_info : nullptr, + .allocationSize = requirements.size, + .memoryTypeIndex = FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits), + }); + if (device.HasDebuggingToolAttached()) { + stream_memory.SetObjectNameEXT("Stream Buffer Memory"); + } + stream_buffer.BindMemory(*stream_memory, 0); + stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE); +} StagingBufferPool::~StagingBufferPool() = default; StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage) { + if (usage == MemoryUsage::Upload && size <= MAX_STREAM_BUFFER_REQUEST_SIZE) { + return GetStreamBuffer(size); + } if (const std::optional ref = TryGetReservedBuffer(size, usage)) { return *ref; } @@ -39,6 +143,42 @@ void StagingBufferPool::TickFrame() { ReleaseCache(MemoryUsage::Download); } +StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) { + for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end; + ++region) { + sync_ticks[region] = scheduler.CurrentTick(); + } + used_iterator = iterator; + + for (size_t region = Region(free_iterator) + 1, + region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); + region < region_end; ++region) { + scheduler.Wait(sync_ticks[region]); + } + if (iterator + size > free_iterator) { + free_iterator = iterator + size; + } + if (iterator + size > STREAM_BUFFER_SIZE) { + for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) { + sync_ticks[region] = scheduler.CurrentTick(); + } + used_iterator = 0; + iterator = 0; + free_iterator = size; + + for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { + scheduler.Wait(sync_ticks[region]); + } + } + const size_t offset = iterator; + iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); + return StagingBufferRef{ + .buffer = *stream_buffer, + .offset = static_cast(offset), + .mapped_span = std::span(stream_pointer + offset, size), + }; +} + std::optional StagingBufferPool::TryGetReservedBuffer(size_t size, MemoryUsage usage) { StagingBuffers& cache_level = GetCache(usage)[Common::Log2Ceil64(size)]; diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index d42918a47..4ed99c0df 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -19,11 +19,14 @@ class VKScheduler; struct StagingBufferRef { VkBuffer buffer; + VkDeviceSize offset; std::span mapped_span; }; class StagingBufferPool { public: + static constexpr size_t NUM_SYNCS = 16; + explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator, VKScheduler& scheduler); ~StagingBufferPool(); @@ -33,6 +36,11 @@ public: void TickFrame(); private: + struct StreamBufferCommit { + size_t upper_bound; + u64 tick; + }; + struct StagingBuffer { vk::Buffer buffer; MemoryCommit commit; @@ -42,6 +50,7 @@ private: StagingBufferRef Ref() const noexcept { return { .buffer = *buffer, + .offset = 0, .mapped_span = mapped_span, }; } @@ -56,6 +65,8 @@ private: static constexpr size_t NUM_LEVELS = sizeof(size_t) * CHAR_BIT; using StagingBuffersCache = std::array; + StagingBufferRef GetStreamBuffer(size_t size); + std::optional TryGetReservedBuffer(size_t size, MemoryUsage usage); StagingBufferRef CreateStagingBuffer(size_t size, MemoryUsage usage); @@ -70,6 +81,15 @@ private: MemoryAllocator& memory_allocator; VKScheduler& scheduler; + vk::Buffer stream_buffer; + vk::DeviceMemory stream_memory; + u8* stream_pointer = nullptr; + + size_t iterator = 0; + size_t used_iterator = 0; + size_t free_iterator = 0; + std::array sync_ticks{}; + StagingBuffersCache device_local_cache; StagingBuffersCache upload_cache; StagingBuffersCache download_cache; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 1eeb45ca9..22a1014a9 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -818,11 +818,10 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ } } -void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset, - std::span copies) { +void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { // TODO: Move this to another API scheduler->RequestOutsideRenderPassOperationContext(); - std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); + std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); const VkBuffer src_buffer = map.buffer; const VkImage vk_image = *image; const VkImageAspectFlags vk_aspect_mask = aspect_mask; @@ -833,11 +832,11 @@ void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset, }); } -void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset, +void Image::UploadMemory(const StagingBufferRef& map, std::span copies) { // TODO: Move this to another API scheduler->RequestOutsideRenderPassOperationContext(); - std::vector vk_copies = TransformBufferCopies(copies, buffer_offset); + std::vector vk_copies = TransformBufferCopies(copies, map.offset); const VkBuffer src_buffer = map.buffer; const VkBuffer dst_buffer = *buffer; scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { @@ -846,9 +845,8 @@ void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset, }); } -void Image::DownloadMemory(const StagingBufferRef& map, size_t buffer_offset, - std::span copies) { - std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); +void Image::DownloadMemory(const StagingBufferRef& map, std::span copies) { + std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask); scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { const VkImageMemoryBarrier read_barrier{ diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 4558c3297..b08c23459 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -82,7 +82,7 @@ struct TextureCacheRuntime { return false; } - void AccelerateImageUpload(Image&, const StagingBufferRef&, size_t, + void AccelerateImageUpload(Image&, const StagingBufferRef&, std::span) { UNREACHABLE(); } @@ -100,13 +100,12 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); - void UploadMemory(const StagingBufferRef& map, size_t buffer_offset, + void UploadMemory(const StagingBufferRef& map, std::span copies); - void UploadMemory(const StagingBufferRef& map, size_t buffer_offset, - std::span copies); + void UploadMemory(const StagingBufferRef& map, std::span copies); - void DownloadMemory(const StagingBufferRef& map, size_t buffer_offset, + void DownloadMemory(const StagingBufferRef& map, std::span copies); [[nodiscard]] VkImage Handle() const noexcept { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index f336b705f..b1da69971 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -212,7 +212,7 @@ private: /// Upload data from guest to an image template - void UploadImageContents(Image& image, StagingBuffer& staging_buffer, size_t buffer_offset); + void UploadImageContents(Image& image, StagingBuffer& staging_buffer); /// Find or create an image view from a guest descriptor [[nodiscard]] ImageViewId FindImageView(const TICEntry& config); @@ -592,7 +592,7 @@ void TextureCache

::DownloadMemory(VAddr cpu_addr, size_t size) { Image& image = slot_images[image_id]; auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes); const auto copies = FullDownloadCopies(image.info); - image.DownloadMemory(map, 0, copies); + image.DownloadMemory(map, copies); runtime.Finish(); SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span); } @@ -750,24 +750,24 @@ void TextureCache

::PopAsyncFlushes() { total_size_bytes += slot_images[image_id].unswizzled_size_bytes; } auto download_map = runtime.DownloadStagingBuffer(total_size_bytes); - size_t buffer_offset = 0; + const size_t original_offset = download_map.offset; for (const ImageId image_id : download_ids) { Image& image = slot_images[image_id]; const auto copies = FullDownloadCopies(image.info); - image.DownloadMemory(download_map, buffer_offset, copies); - buffer_offset += image.unswizzled_size_bytes; + image.DownloadMemory(download_map, copies); + download_map.offset += image.unswizzled_size_bytes; } // Wait for downloads to finish runtime.Finish(); - buffer_offset = 0; - const std::span download_span = download_map.mapped_span; + download_map.offset = original_offset; + std::span download_span = download_map.mapped_span; for (const ImageId image_id : download_ids) { const ImageBase& image = slot_images[image_id]; const auto copies = FullDownloadCopies(image.info); - const std::span image_download_span = download_span.subspan(buffer_offset); - SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, image_download_span); - buffer_offset += image.unswizzled_size_bytes; + SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, download_span); + download_map.offset += image.unswizzled_size_bytes; + download_span = download_span.subspan(image.unswizzled_size_bytes); } committed_downloads.pop(); } @@ -798,32 +798,32 @@ void TextureCache

::RefreshContents(Image& image) { LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); return; } - auto map = runtime.UploadStagingBuffer(MapSizeBytes(image)); - UploadImageContents(image, map, 0); + auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image)); + UploadImageContents(image, staging); runtime.InsertUploadMemoryBarrier(); } template -template -void TextureCache

::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) { - const std::span mapped_span = map.mapped_span.subspan(buffer_offset); +template +void TextureCache

::UploadImageContents(Image& image, StagingBuffer& staging) { + const std::span mapped_span = staging.mapped_span; const GPUVAddr gpu_addr = image.gpu_addr; if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes()); const auto uploads = FullUploadSwizzles(image.info); - runtime.AccelerateImageUpload(image, map, buffer_offset, uploads); + runtime.AccelerateImageUpload(image, staging, uploads); } else if (True(image.flags & ImageFlagBits::Converted)) { std::vector unswizzled_data(image.unswizzled_size_bytes); auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data); ConvertImage(unswizzled_data, image.info, mapped_span, copies); - image.UploadMemory(map, buffer_offset, copies); + image.UploadMemory(staging, copies); } else if (image.info.type == ImageType::Buffer) { const std::array copies{UploadBufferCopy(gpu_memory, gpu_addr, image, mapped_span)}; - image.UploadMemory(map, buffer_offset, copies); + image.UploadMemory(staging, copies); } else { const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span); - image.UploadMemory(map, buffer_offset, copies); + image.UploadMemory(staging, copies); } } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index d39bbdc70..2aa0ffbe6 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -168,7 +168,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkFreeCommandBuffers); X(vkFreeDescriptorSets); X(vkFreeMemory); - X(vkGetBufferMemoryRequirements); + X(vkGetBufferMemoryRequirements2); X(vkGetDeviceQueue); X(vkGetEventStatus); X(vkGetFenceStatus); @@ -786,10 +786,20 @@ DeviceMemory Device::AllocateMemory(const VkMemoryAllocateInfo& ai) const { return DeviceMemory(memory, handle, *dld); } -VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer) const noexcept { - VkMemoryRequirements requirements; - dld->vkGetBufferMemoryRequirements(handle, buffer, &requirements); - return requirements; +VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer, + void* pnext) const noexcept { + const VkBufferMemoryRequirementsInfo2 info{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, + .pNext = nullptr, + .buffer = buffer, + }; + VkMemoryRequirements2 requirements{ + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + .pNext = pnext, + .memoryRequirements{}, + }; + dld->vkGetBufferMemoryRequirements2(handle, &info, &requirements); + return requirements.memoryRequirements; } VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noexcept { diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 7f781b081..3e36d356a 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -283,7 +283,7 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkFreeCommandBuffers vkFreeCommandBuffers{}; PFN_vkFreeDescriptorSets vkFreeDescriptorSets{}; PFN_vkFreeMemory vkFreeMemory{}; - PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements{}; + PFN_vkGetBufferMemoryRequirements2 vkGetBufferMemoryRequirements2{}; PFN_vkGetDeviceQueue vkGetDeviceQueue{}; PFN_vkGetEventStatus vkGetEventStatus{}; PFN_vkGetFenceStatus vkGetFenceStatus{}; @@ -871,7 +871,8 @@ public: DeviceMemory AllocateMemory(const VkMemoryAllocateInfo& ai) const; - VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer) const noexcept; + VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer, + void* pnext = nullptr) const noexcept; VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept; -- cgit v1.2.3