From 8e56a84566036cfff0aa5c3d80ae1b051d2bd0bf Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sun, 23 Apr 2023 00:01:08 -0400 Subject: core_timing: Use CNTPCT as the guest CPU tick Previously, we were mixing the raw CPU frequency and CNTFRQ. The raw CPU frequency (1020 MHz) should've never been used as CNTPCT (whose frequency is CNTFRQ) is the only counter available. --- src/video_core/gpu.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 456f733cf..70762c51a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -194,17 +194,17 @@ struct GPU::Impl { [[nodiscard]] u64 GetTicks() const { // This values were reversed engineered by fincs from NVN - // The gpu clock is reported in units of 385/625 nanoseconds - constexpr u64 gpu_ticks_num = 384; - constexpr u64 gpu_ticks_den = 625; + // The GPU clock is 614.4 MHz + using NsToGPUTickRatio = std::ratio<614'400'000, std::nano::den>; + static_assert(NsToGPUTickRatio::num == 384 && NsToGPUTickRatio::den == 625); + + u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); - u64 nanoseconds = system.CoreTiming().GetCPUTimeNs().count(); if (Settings::values.use_fast_gpu_time.GetValue()) { nanoseconds /= 256; } - const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; - const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; - return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; + + return nanoseconds * NsToGPUTickRatio::num / NsToGPUTickRatio::den; } [[nodiscard]] bool IsAsync() const { -- cgit v1.2.3 From 907507886d755fa56099713c4b8f05bb640a8b7d Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Sun, 28 May 2023 17:45:47 -0400 Subject: (wall, native)_clock: Add GetGPUTick Allows us to directly calculate the GPU tick without double conversion to and from the host clock tick. --- src/common/wall_clock.cpp | 8 ++++++-- src/common/wall_clock.h | 20 +++++++++++++++++++- src/common/x64/native_clock.cpp | 7 ++++++- src/common/x64/native_clock.h | 3 +++ src/core/core_timing.cpp | 7 +++++++ src/core/core_timing.h | 3 +++ src/video_core/gpu.cpp | 11 +++-------- 7 files changed, 47 insertions(+), 12 deletions(-) (limited to 'src/video_core') diff --git a/src/common/wall_clock.cpp b/src/common/wall_clock.cpp index ad8db06b0..dc0dcbd68 100644 --- a/src/common/wall_clock.cpp +++ b/src/common/wall_clock.cpp @@ -32,6 +32,10 @@ public: return GetHostTicksElapsed() * NsToCNTPCTRatio::num / NsToCNTPCTRatio::den; } + u64 GetGPUTick() const override { + return GetHostTicksElapsed() * NsToGPUTickRatio::num / NsToGPUTickRatio::den; + } + u64 GetHostTicksNow() const override { return static_cast(SteadyClock::Now().time_since_epoch().count()); } @@ -52,12 +56,12 @@ std::unique_ptr CreateOptimalClock() { #ifdef ARCHITECTURE_x86_64 const auto& caps = GetCPUCaps(); - if (caps.invariant_tsc && caps.tsc_frequency >= WallClock::CNTFRQ) { + if (caps.invariant_tsc && caps.tsc_frequency >= WallClock::GPUTickFreq) { return std::make_unique(caps.tsc_frequency); } else { // Fallback to StandardWallClock if the hardware TSC // - Is not invariant - // - Is not more precise than CNTFRQ + // - Is not more precise than GPUTickFreq return std::make_unique(); } #else diff --git a/src/common/wall_clock.h b/src/common/wall_clock.h index 56c18ca25..fcfdd637c 100644 --- a/src/common/wall_clock.h +++ b/src/common/wall_clock.h @@ -13,7 +13,8 @@ namespace Common { class WallClock { public: - static constexpr u64 CNTFRQ = 19'200'000; // CNTPCT_EL0 Frequency = 19.2 MHz + static constexpr u64 CNTFRQ = 19'200'000; // CNTPCT_EL0 Frequency = 19.2 MHz + static constexpr u64 GPUTickFreq = 614'400'000; // GM20B GPU Tick Frequency = 614.4 MHz virtual ~WallClock() = default; @@ -29,6 +30,9 @@ public: /// @returns The guest CNTPCT ticks since the construction of this clock. virtual u64 GetCNTPCT() const = 0; + /// @returns The guest GPU ticks since the construction of this clock. + virtual u64 GetGPUTick() const = 0; + /// @returns The raw host timer ticks since an indeterminate epoch. virtual u64 GetHostTicksNow() const = 0; @@ -46,6 +50,10 @@ public: return us * UsToCNTPCTRatio::num / UsToCNTPCTRatio::den; } + static inline u64 NSToGPUTick(u64 ns) { + return ns * NsToGPUTickRatio::num / NsToGPUTickRatio::den; + } + static inline u64 CNTPCTToNS(u64 cntpct) { return cntpct * NsToCNTPCTRatio::den / NsToCNTPCTRatio::num; } @@ -54,6 +62,14 @@ public: return cntpct * UsToCNTPCTRatio::den / UsToCNTPCTRatio::num; } + static inline u64 GPUTickToNS(u64 gpu_tick) { + return gpu_tick * NsToGPUTickRatio::den / NsToGPUTickRatio::num; + } + + static inline u64 CNTPCTToGPUTick(u64 cntpct) { + return cntpct * CNTPCTToGPUTickRatio::num / CNTPCTToGPUTickRatio::den; + } + protected: using NsRatio = std::nano; using UsRatio = std::micro; @@ -63,6 +79,8 @@ protected: using NsToMsRatio = std::ratio_divide; using NsToCNTPCTRatio = std::ratio; using UsToCNTPCTRatio = std::ratio; + using NsToGPUTickRatio = std::ratio; + using CNTPCTToGPUTickRatio = std::ratio; }; std::unique_ptr CreateOptimalClock(); diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index 5d1eb0590..7d2a26bd9 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -12,7 +12,8 @@ NativeClock::NativeClock(u64 rdtsc_frequency_) ns_rdtsc_factor{GetFixedPoint64Factor(NsRatio::den, rdtsc_frequency)}, us_rdtsc_factor{GetFixedPoint64Factor(UsRatio::den, rdtsc_frequency)}, ms_rdtsc_factor{GetFixedPoint64Factor(MsRatio::den, rdtsc_frequency)}, - cntpct_rdtsc_factor{GetFixedPoint64Factor(CNTFRQ, rdtsc_frequency)} {} + cntpct_rdtsc_factor{GetFixedPoint64Factor(CNTFRQ, rdtsc_frequency)}, + gputick_rdtsc_factor{GetFixedPoint64Factor(GPUTickFreq, rdtsc_frequency)} {} std::chrono::nanoseconds NativeClock::GetTimeNS() const { return std::chrono::nanoseconds{MultiplyHigh(GetHostTicksElapsed(), ns_rdtsc_factor)}; @@ -30,6 +31,10 @@ u64 NativeClock::GetCNTPCT() const { return MultiplyHigh(GetHostTicksElapsed(), cntpct_rdtsc_factor); } +u64 NativeClock::GetGPUTick() const { + return MultiplyHigh(GetHostTicksElapsed(), gputick_rdtsc_factor); +} + u64 NativeClock::GetHostTicksNow() const { return FencedRDTSC(); } diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index d6f8626c1..334415eff 100644 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -19,6 +19,8 @@ public: u64 GetCNTPCT() const override; + u64 GetGPUTick() const override; + u64 GetHostTicksNow() const override; u64 GetHostTicksElapsed() const override; @@ -33,6 +35,7 @@ private: u64 us_rdtsc_factor; u64 ms_rdtsc_factor; u64 cntpct_rdtsc_factor; + u64 gputick_rdtsc_factor; }; } // namespace Common::X64 diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index 9a1d5a69a..e57bca70a 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -197,6 +197,13 @@ u64 CoreTiming::GetClockTicks() const { return ticks; } +u64 CoreTiming::GetGPUTicks() const { + if (is_multicore) [[likely]] { + return clock->GetGPUTick(); + } + return Common::WallClock::CNTPCTToGPUTick(ticks); +} + std::optional CoreTiming::Advance() { std::scoped_lock lock{advance_lock, basic_lock}; global_timer = GetGlobalTimeNs().count(); diff --git a/src/core/core_timing.h b/src/core/core_timing.h index fdacdd94a..1873852c4 100644 --- a/src/core/core_timing.h +++ b/src/core/core_timing.h @@ -119,6 +119,9 @@ public: /// Returns the current CNTPCT tick value. u64 GetClockTicks() const; + /// Returns the current GPU tick value. + u64 GetGPUTicks() const; + /// Returns current time in microseconds. std::chrono::microseconds GetGlobalTimeUs() const; diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 70762c51a..db385076d 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -193,18 +193,13 @@ struct GPU::Impl { } [[nodiscard]] u64 GetTicks() const { - // This values were reversed engineered by fincs from NVN - // The GPU clock is 614.4 MHz - using NsToGPUTickRatio = std::ratio<614'400'000, std::nano::den>; - static_assert(NsToGPUTickRatio::num == 384 && NsToGPUTickRatio::den == 625); - - u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); + u64 gpu_tick = system.CoreTiming().GetGPUTicks(); if (Settings::values.use_fast_gpu_time.GetValue()) { - nanoseconds /= 256; + gpu_tick /= 256; } - return nanoseconds * NsToGPUTickRatio::num / NsToGPUTickRatio::den; + return gpu_tick; } [[nodiscard]] bool IsAsync() const { -- cgit v1.2.3 From 8d6aefdcc452b602d94a84d13bbbc15f806b689c Mon Sep 17 00:00:00 2001 From: Liam Date: Wed, 14 Jun 2023 14:11:46 -0400 Subject: video_core: optionally skip barriers on feedback loops --- src/common/settings.h | 1 + src/video_core/texture_cache/texture_cache.h | 4 ++++ src/yuzu/configuration/config.cpp | 2 ++ src/yuzu/configuration/configure_graphics_advanced.cpp | 10 ++++++++++ src/yuzu/configuration/configure_graphics_advanced.h | 1 + src/yuzu/configuration/configure_graphics_advanced.ui | 10 ++++++++++ 6 files changed, 28 insertions(+) (limited to 'src/video_core') diff --git a/src/common/settings.h b/src/common/settings.h index 9682281b0..3aedf3850 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -483,6 +483,7 @@ struct Values { AstcRecompression::Uncompressed, AstcRecompression::Uncompressed, AstcRecompression::Bc3, "astc_recompression"}; SwitchableSetting use_video_framerate{false, "use_video_framerate"}; + SwitchableSetting barrier_feedback_loops{true, "barrier_feedback_loops"}; SwitchableSetting bg_red{0, "bg_red"}; SwitchableSetting bg_green{0, "bg_green"}; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c7f7448e9..43b7ac0a6 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -186,6 +186,10 @@ void TextureCache

::FillComputeImageViews(std::span views) { template void TextureCache

::CheckFeedbackLoop(std::span views) { + if (!Settings::values.barrier_feedback_loops.GetValue()) { + return; + } + const bool requires_barrier = [&] { for (const auto& view : views) { if (!view.id) { diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index bac9dff90..edc206a25 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -761,6 +761,7 @@ void Config::ReadRendererValues() { ReadGlobalSetting(Settings::values.use_vulkan_driver_pipeline_cache); ReadGlobalSetting(Settings::values.enable_compute_pipelines); ReadGlobalSetting(Settings::values.use_video_framerate); + ReadGlobalSetting(Settings::values.barrier_feedback_loops); ReadGlobalSetting(Settings::values.bg_red); ReadGlobalSetting(Settings::values.bg_green); ReadGlobalSetting(Settings::values.bg_blue); @@ -1417,6 +1418,7 @@ void Config::SaveRendererValues() { WriteGlobalSetting(Settings::values.use_vulkan_driver_pipeline_cache); WriteGlobalSetting(Settings::values.enable_compute_pipelines); WriteGlobalSetting(Settings::values.use_video_framerate); + WriteGlobalSetting(Settings::values.barrier_feedback_loops); WriteGlobalSetting(Settings::values.bg_red); WriteGlobalSetting(Settings::values.bg_green); WriteGlobalSetting(Settings::values.bg_blue); diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp index 0463ac8b9..c0a044767 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.cpp +++ b/src/yuzu/configuration/configure_graphics_advanced.cpp @@ -43,6 +43,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { ui->enable_compute_pipelines_checkbox->setChecked( Settings::values.enable_compute_pipelines.GetValue()); ui->use_video_framerate_checkbox->setChecked(Settings::values.use_video_framerate.GetValue()); + ui->barrier_feedback_loops_checkbox->setChecked( + Settings::values.barrier_feedback_loops.GetValue()); if (Settings::IsConfiguringGlobal()) { ui->gpu_accuracy->setCurrentIndex( @@ -94,6 +96,9 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() { enable_compute_pipelines); ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_video_framerate, ui->use_video_framerate_checkbox, use_video_framerate); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.barrier_feedback_loops, + ui->barrier_feedback_loops_checkbox, + barrier_feedback_loops); } void ConfigureGraphicsAdvanced::changeEvent(QEvent* event) { @@ -130,6 +135,8 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { Settings::values.enable_compute_pipelines.UsingGlobal()); ui->use_video_framerate_checkbox->setEnabled( Settings::values.use_video_framerate.UsingGlobal()); + ui->barrier_feedback_loops_checkbox->setEnabled( + Settings::values.barrier_feedback_loops.UsingGlobal()); return; } @@ -157,6 +164,9 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { ConfigurationShared::SetColoredTristate(ui->use_video_framerate_checkbox, Settings::values.use_video_framerate, use_video_framerate); + ConfigurationShared::SetColoredTristate(ui->barrier_feedback_loops_checkbox, + Settings::values.barrier_feedback_loops, + barrier_feedback_loops); ConfigurationShared::SetColoredComboBox( ui->gpu_accuracy, ui->label_gpu_accuracy, static_cast(Settings::values.gpu_accuracy.GetValue(true))); diff --git a/src/yuzu/configuration/configure_graphics_advanced.h b/src/yuzu/configuration/configure_graphics_advanced.h index a4dc8ceb0..369a7c83e 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.h +++ b/src/yuzu/configuration/configure_graphics_advanced.h @@ -48,6 +48,7 @@ private: ConfigurationShared::CheckState use_vulkan_driver_pipeline_cache; ConfigurationShared::CheckState enable_compute_pipelines; ConfigurationShared::CheckState use_video_framerate; + ConfigurationShared::CheckState barrier_feedback_loops; const Core::System& system; }; diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui index e7f0ef6be..d527a6f38 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.ui +++ b/src/yuzu/configuration/configure_graphics_advanced.ui @@ -201,6 +201,16 @@ Compute pipelines are always enabled on all other drivers. + + + + Improves rendering of transparency effects in specific games. + + + Barrier feedback loops + + + -- cgit v1.2.3 From 76a676883a17523fb12eeac6f2b9702e4916b2c2 Mon Sep 17 00:00:00 2001 From: FengChen Date: Sat, 17 Jun 2023 23:26:39 +0800 Subject: video_core: add samples check when find render target --- src/video_core/texture_cache/texture_cache.h | 22 ++++++++++------------ src/video_core/texture_cache/texture_cache_base.h | 10 ++++------ 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c7f7448e9..f11998e20 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -280,7 +280,7 @@ void TextureCache

::SynchronizeComputeDescriptors() { } template -bool TextureCache

::RescaleRenderTargets(bool is_clear) { +bool TextureCache

::RescaleRenderTargets() { auto& flags = maxwell3d->dirty.flags; u32 scale_rating = 0; bool rescaled = false; @@ -318,13 +318,13 @@ bool TextureCache

::RescaleRenderTargets(bool is_clear) { ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; if (flags[Dirty::ColorBuffer0 + index] || force) { flags[Dirty::ColorBuffer0 + index] = false; - BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); + BindRenderTarget(&color_buffer_id, FindColorBuffer(index)); } check_rescale(color_buffer_id, tmp_color_images[index]); } if (flags[Dirty::ZetaBuffer] || force) { flags[Dirty::ZetaBuffer] = false; - BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer()); } check_rescale(render_targets.depth_buffer_id, tmp_depth_image); @@ -389,7 +389,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { return; } - const bool rescaled = RescaleRenderTargets(is_clear); + const bool rescaled = RescaleRenderTargets(); if (is_rescaling != rescaled) { flags[Dirty::RescaleViewports] = true; flags[Dirty::RescaleScissors] = true; @@ -1658,7 +1658,7 @@ SamplerId TextureCache

::FindSampler(const TSCEntry& config) { } template -ImageViewId TextureCache

::FindColorBuffer(size_t index, bool is_clear) { +ImageViewId TextureCache

::FindColorBuffer(size_t index) { const auto& regs = maxwell3d->regs; if (index >= regs.rt_control.count) { return ImageViewId{}; @@ -1672,11 +1672,11 @@ ImageViewId TextureCache

::FindColorBuffer(size_t index, bool is_clear) { return ImageViewId{}; } const ImageInfo info(regs.rt[index], regs.anti_alias_samples_mode); - return FindRenderTargetView(info, gpu_addr, is_clear); + return FindRenderTargetView(info, gpu_addr); } template -ImageViewId TextureCache

::FindDepthBuffer(bool is_clear) { +ImageViewId TextureCache

::FindDepthBuffer() { const auto& regs = maxwell3d->regs; if (!regs.zeta_enable) { return ImageViewId{}; @@ -1686,18 +1686,16 @@ ImageViewId TextureCache

::FindDepthBuffer(bool is_clear) { return ImageViewId{}; } const ImageInfo info(regs.zeta, regs.zeta_size, regs.anti_alias_samples_mode); - return FindRenderTargetView(info, gpu_addr, is_clear); + return FindRenderTargetView(info, gpu_addr); } template -ImageViewId TextureCache

::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, - bool is_clear) { - const auto options = is_clear ? RelaxedOptions::Samples : RelaxedOptions{}; +ImageViewId TextureCache

::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr) { ImageId image_id{}; bool delete_state = has_deleted_images; do { has_deleted_images = false; - image_id = FindOrInsertImage(info, gpu_addr, options); + image_id = FindOrInsertImage(info, gpu_addr); delete_state |= has_deleted_images; } while (has_deleted_images); has_deleted_images = delete_state; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 3bfa92154..c347eccd6 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -166,9 +166,8 @@ public: void SynchronizeComputeDescriptors(); /// Updates the Render Targets if they can be rescaled - /// @param is_clear True when the render targets are being used for clears /// @retval True if the Render Targets have been rescaled. - bool RescaleRenderTargets(bool is_clear); + bool RescaleRenderTargets(); /// Update bound render targets and upload memory if necessary /// @param is_clear True when the render targets are being used for clears @@ -324,14 +323,13 @@ private: [[nodiscard]] SamplerId FindSampler(const TSCEntry& config); /// Find or create an image view for the given color buffer index - [[nodiscard]] ImageViewId FindColorBuffer(size_t index, bool is_clear); + [[nodiscard]] ImageViewId FindColorBuffer(size_t index); /// Find or create an image view for the depth buffer - [[nodiscard]] ImageViewId FindDepthBuffer(bool is_clear); + [[nodiscard]] ImageViewId FindDepthBuffer(); /// Find or create a view for a render target with the given image parameters - [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, - bool is_clear); + [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr); /// Iterates over all the images in a region calling func template -- cgit v1.2.3 From 6448eade2ef126a88068cde66b77e7788c3fab08 Mon Sep 17 00:00:00 2001 From: lat9nq <22451773+lat9nq@users.noreply.github.com> Date: Sun, 18 Jun 2023 04:59:12 -0400 Subject: externals: Add vma and initialize it video_core: Move vma implementation to library --- .gitmodules | 3 +++ externals/CMakeLists.txt | 5 +++++ externals/vma/vma | 1 + externals/vma/vma.cpp | 5 +++++ src/video_core/CMakeLists.txt | 2 +- src/video_core/vulkan_common/vulkan_device.cpp | 23 ++++++++++++++++++++++- src/video_core/vulkan_common/vulkan_device.h | 3 +++ 7 files changed, 40 insertions(+), 2 deletions(-) create mode 160000 externals/vma/vma create mode 100644 externals/vma/vma.cpp (limited to 'src/video_core') diff --git a/.gitmodules b/.gitmodules index 89f2ad924..cc0e97a85 100644 --- a/.gitmodules +++ b/.gitmodules @@ -55,3 +55,6 @@ [submodule "tzdb_to_nx"] path = externals/nx_tzdb/tzdb_to_nx url = https://github.com/lat9nq/tzdb_to_nx.git +[submodule "externals/vma/vma"] + path = externals/vma/vma + url = https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 7cce27d51..ca4ebe4b9 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -143,6 +143,11 @@ endif() # TZDB (Time Zone Database) add_subdirectory(nx_tzdb) +# VMA +add_library(vma vma/vma.cpp) +target_include_directories(vma PUBLIC ./vma/vma/include) +target_link_libraries(vma PRIVATE Vulkan::Headers) + if (NOT TARGET LLVM::Demangle) add_library(demangle demangle/ItaniumDemangle.cpp) target_include_directories(demangle PUBLIC ./demangle) diff --git a/externals/vma/vma b/externals/vma/vma new file mode 160000 index 000000000..0aa3989b8 --- /dev/null +++ b/externals/vma/vma @@ -0,0 +1 @@ +Subproject commit 0aa3989b8f382f185fdf646cc83a1d16fa31d6ab diff --git a/externals/vma/vma.cpp b/externals/vma/vma.cpp new file mode 100644 index 000000000..7f7df9cff --- /dev/null +++ b/externals/vma/vma.cpp @@ -0,0 +1,5 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#define VMA_IMPLEMENTATION +#include \ No newline at end of file diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index bf6439530..e9e6f278d 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -291,7 +291,7 @@ target_link_options(video_core PRIVATE ${FFmpeg_LDFLAGS}) add_dependencies(video_core host_shaders) target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) -target_link_libraries(video_core PRIVATE sirit Vulkan::Headers) +target_link_libraries(video_core PRIVATE sirit Vulkan::Headers vma) if (ENABLE_NSIGHT_AFTERMATH) if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK}) diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 3d2e9a16a..631d5e378 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -22,6 +22,10 @@ #include #endif +#define VMA_STATIC_VULKAN_FUNCTIONS 0 +#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 +#include + namespace Vulkan { using namespace Common::Literals; namespace { @@ -592,9 +596,26 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR graphics_queue = logical.GetQueue(graphics_family); present_queue = logical.GetQueue(present_family); + + const VmaVulkanFunctions functions = { + .vkGetInstanceProcAddr = dld.vkGetInstanceProcAddr, + .vkGetDeviceProcAddr = dld.vkGetDeviceProcAddr, + }; + + const VmaAllocatorCreateInfo allocator_info = { + .physicalDevice = physical, + .device = *logical, + .pVulkanFunctions = &functions, + .instance = instance, + .vulkanApiVersion = VK_API_VERSION_1_1, + }; + + vk::Check(vmaCreateAllocator(&allocator_info, &allocator)); } -Device::~Device() = default; +Device::~Device() { + vmaDestroyAllocator(allocator); +} VkFormat Device::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, FormatType format_type) const { diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index f314d0ffe..123d3b1c4 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -13,6 +13,8 @@ #include "common/settings.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +VK_DEFINE_HANDLE(VmaAllocator) + // Define all features which may be used by the implementation here. // Vulkan version in the macro describes the minimum version required for feature availability. // If the Vulkan version is lower than the required version, the named extension is required. @@ -618,6 +620,7 @@ private: private: VkInstance instance; ///< Vulkan instance. + VmaAllocator allocator; ///< VMA allocator. vk::DeviceDispatch dld; ///< Device function pointers. vk::PhysicalDevice physical; ///< Physical device. vk::Device logical; ///< Logical device. -- cgit v1.2.3 From c60eed36b7439a7921ea5e86e1300e96e30c8f8a Mon Sep 17 00:00:00 2001 From: GPUCode Date: Wed, 24 May 2023 20:32:12 +0300 Subject: memory_allocator: Remove OpenGL interop * Appears to be unused atm --- src/video_core/renderer_vulkan/renderer_vulkan.cpp | 4 +- src/video_core/renderer_vulkan/vk_turbo_mode.cpp | 2 +- .../vulkan_common/vulkan_memory_allocator.cpp | 58 +--------------------- .../vulkan_common/vulkan_memory_allocator.h | 11 ++-- 4 files changed, 8 insertions(+), 67 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 77128c6e2..5bae8d24f 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -89,8 +89,8 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, Settings::values.renderer_debug.GetValue())), debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), surface(CreateSurface(instance, render_window.GetWindowInfo())), - device(CreateDevice(instance, dld, *surface)), memory_allocator(device, false), - state_tracker(), scheduler(device, state_tracker), + device(CreateDevice(instance, dld, *surface)), memory_allocator(device), state_tracker(), + scheduler(device, state_tracker), swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width, render_window.GetFramebufferLayout().height, false), present_manager(instance, render_window, device, memory_allocator, scheduler, swapchain, diff --git a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp index a802d3c49..6417d7e31 100644 --- a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp +++ b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp @@ -18,7 +18,7 @@ using namespace Common::Literals; TurboMode::TurboMode(const vk::Instance& instance, const vk::InstanceDispatch& dld) #ifndef ANDROID - : m_device{CreateDevice(instance, dld, VK_NULL_HANDLE)}, m_allocator{m_device, false} + : m_device{CreateDevice(instance, dld, VK_NULL_HANDLE)}, m_allocator{m_device} #endif { { diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index e28a556f8..f87c99603 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -6,8 +6,6 @@ #include #include -#include - #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" @@ -54,17 +52,6 @@ struct Range { return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } -constexpr VkExportMemoryAllocateInfo EXPORT_ALLOCATE_INFO{ - .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, - .pNext = nullptr, -#ifdef _WIN32 - .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, -#elif __unix__ - .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, -#else - .handleTypes = 0, -#endif -}; } // Anonymous namespace class MemoryAllocation { @@ -74,14 +61,6 @@ public: : allocator{allocator_}, memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties}, shifted_memory_type{1U << type} {} -#if defined(_WIN32) || defined(__unix__) - ~MemoryAllocation() { - if (owning_opengl_handle != 0) { - glDeleteMemoryObjectsEXT(1, &owning_opengl_handle); - } - } -#endif - MemoryAllocation& operator=(const MemoryAllocation&) = delete; MemoryAllocation(const MemoryAllocation&) = delete; @@ -120,31 +99,6 @@ public: return memory_mapped_span; } -#ifdef _WIN32 - [[nodiscard]] u32 ExportOpenGLHandle() { - if (!owning_opengl_handle) { - glCreateMemoryObjectsEXT(1, &owning_opengl_handle); - glImportMemoryWin32HandleEXT(owning_opengl_handle, allocation_size, - GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, - memory.GetMemoryWin32HandleKHR()); - } - return owning_opengl_handle; - } -#elif __unix__ - [[nodiscard]] u32 ExportOpenGLHandle() { - if (!owning_opengl_handle) { - glCreateMemoryObjectsEXT(1, &owning_opengl_handle); - glImportMemoryFdEXT(owning_opengl_handle, allocation_size, GL_HANDLE_TYPE_OPAQUE_FD_EXT, - memory.GetMemoryFdKHR()); - } - return owning_opengl_handle; - } -#else - [[nodiscard]] u32 ExportOpenGLHandle() { - return 0; - } -#endif - /// Returns whether this allocation is compatible with the arguments. [[nodiscard]] bool IsCompatible(VkMemoryPropertyFlags flags, u32 type_mask) const { return (flags & property_flags) == flags && (type_mask & shifted_memory_type) != 0; @@ -182,9 +136,6 @@ private: const u32 shifted_memory_type; ///< Shifted Vulkan memory type. std::vector commits; ///< All commit ranges done from this allocation. std::span memory_mapped_span; ///< Memory mapped span. Empty if not queried before. -#if defined(_WIN32) || defined(__unix__) - u32 owning_opengl_handle{}; ///< Owning OpenGL memory object handle. -#endif }; MemoryCommit::MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_, @@ -216,19 +167,14 @@ std::span MemoryCommit::Map() { return span; } -u32 MemoryCommit::ExportOpenGLHandle() const { - return allocation->ExportOpenGLHandle(); -} - void MemoryCommit::Release() { if (allocation) { allocation->Free(begin); } } -MemoryAllocator::MemoryAllocator(const Device& device_, bool export_allocations_) +MemoryAllocator::MemoryAllocator(const Device& device_) : device{device_}, properties{device_.GetPhysical().GetMemoryProperties().memoryProperties}, - export_allocations{export_allocations_}, buffer_image_granularity{ device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {} @@ -271,7 +217,7 @@ bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, const u32 type = FindType(flags, type_mask).value(); vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .pNext = export_allocations ? &EXPORT_ALLOCATE_INFO : nullptr, + .pNext = nullptr, .allocationSize = size, .memoryTypeIndex = type, }); diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h index a5bff03fe..494f30f51 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.h +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h @@ -41,9 +41,6 @@ public: /// It will map the backing allocation if it hasn't been mapped before. std::span Map(); - /// Returns an non-owning OpenGL handle, creating one if it doesn't exist. - u32 ExportOpenGLHandle() const; - /// Returns the Vulkan memory handler. VkDeviceMemory Memory() const { return memory; @@ -74,11 +71,10 @@ public: * Construct memory allocator * * @param device_ Device to allocate from - * @param export_allocations_ True when allocations have to be exported * * @throw vk::Exception on failure */ - explicit MemoryAllocator(const Device& device_, bool export_allocations_); + explicit MemoryAllocator(const Device& device_); ~MemoryAllocator(); MemoryAllocator& operator=(const MemoryAllocator&) = delete; @@ -117,9 +113,8 @@ private: /// Returns index to the fastest memory type compatible with the passed requirements. std::optional FindType(VkMemoryPropertyFlags flags, u32 type_mask) const; - const Device& device; ///< Device handle. - const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. - const bool export_allocations; ///< True when memory allocations have to be exported. + const Device& device; ///< Device handle. + const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. std::vector> allocations; ///< Current allocations. VkDeviceSize buffer_image_granularity; // The granularity for adjacent offsets between buffers // and optimal images -- cgit v1.2.3 From 48e39756f1ec6e6b0ef48f2444ce38a4e861e898 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Wed, 24 May 2023 22:39:58 +0300 Subject: renderer_vulkan: Use VMA for images --- src/video_core/renderer_vulkan/renderer_vulkan.cpp | 3 +- src/video_core/renderer_vulkan/vk_blit_screen.cpp | 12 +--- src/video_core/renderer_vulkan/vk_blit_screen.h | 2 - src/video_core/renderer_vulkan/vk_fsr.cpp | 4 +- src/video_core/renderer_vulkan/vk_fsr.h | 1 - .../renderer_vulkan/vk_present_manager.cpp | 4 +- .../renderer_vulkan/vk_present_manager.h | 1 - src/video_core/renderer_vulkan/vk_smaa.cpp | 27 +++------ src/video_core/renderer_vulkan/vk_smaa.h | 2 - .../renderer_vulkan/vk_texture_cache.cpp | 14 ++--- src/video_core/renderer_vulkan/vk_texture_cache.h | 2 - src/video_core/vulkan_common/vulkan_device.h | 5 ++ .../vulkan_common/vulkan_memory_allocator.cpp | 30 +++++++--- .../vulkan_common/vulkan_memory_allocator.h | 5 +- src/video_core/vulkan_common/vulkan_wrapper.cpp | 28 ++++----- src/video_core/vulkan_common/vulkan_wrapper.h | 70 ++++++++++++++++++---- 16 files changed, 119 insertions(+), 91 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 5bae8d24f..e569523b6 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -173,7 +173,7 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr return; } const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; - vk::Image staging_image = device.GetLogical().CreateImage(VkImageCreateInfo{ + vk::Image staging_image = memory_allocator.CreateImage(VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, @@ -196,7 +196,6 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr .pQueueFamilyIndices = nullptr, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); - const auto image_commit = memory_allocator.Commit(staging_image, MemoryUsage::DeviceLocal); const vk::ImageView dst_view = device.GetLogical().CreateImageView(VkImageViewCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index acb143fc7..82ca81c7e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -1071,12 +1071,8 @@ void BlitScreen::ReleaseRawImages() { scheduler.Wait(tick); } raw_images.clear(); - raw_buffer_commits.clear(); - aa_image_view.reset(); aa_image.reset(); - aa_commit = MemoryCommit{}; - buffer.reset(); buffer_commit = MemoryCommit{}; } @@ -1101,13 +1097,12 @@ void BlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { raw_images.resize(image_count); raw_image_views.resize(image_count); - raw_buffer_commits.resize(image_count); const auto create_image = [&](bool used_on_framebuffer = false, u32 up_scale = 1, u32 down_shift = 0) { u32 extra_usages = used_on_framebuffer ? VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT : VK_IMAGE_USAGE_TRANSFER_DST_BIT; - return device.GetLogical().CreateImage(VkImageCreateInfo{ + return memory_allocator.CreateImage(VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -1130,9 +1125,6 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); }; - const auto create_commit = [&](vk::Image& image) { - return memory_allocator.Commit(image, MemoryUsage::DeviceLocal); - }; const auto create_image_view = [&](vk::Image& image, bool used_on_framebuffer = false) { return device.GetLogical().CreateImageView(VkImageViewCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, @@ -1161,7 +1153,6 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { for (size_t i = 0; i < image_count; ++i) { raw_images[i] = create_image(); - raw_buffer_commits[i] = create_commit(raw_images[i]); raw_image_views[i] = create_image_view(raw_images[i]); } @@ -1169,7 +1160,6 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { const u32 up_scale = Settings::values.resolution_info.up_scale; const u32 down_shift = Settings::values.resolution_info.down_shift; aa_image = create_image(true, up_scale, down_shift); - aa_commit = create_commit(aa_image); aa_image_view = create_image_view(aa_image, true); VkExtent2D size{ .width = (up_scale * framebuffer.width) >> down_shift, diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 68ec20253..7fcfa9976 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -148,7 +148,6 @@ private: std::vector raw_images; std::vector raw_image_views; - std::vector raw_buffer_commits; vk::DescriptorPool aa_descriptor_pool; vk::DescriptorSetLayout aa_descriptor_set_layout; @@ -159,7 +158,6 @@ private: vk::DescriptorSets aa_descriptor_sets; vk::Image aa_image; vk::ImageView aa_image_view; - MemoryCommit aa_commit; u32 raw_width = 0; u32 raw_height = 0; diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp index df972cd54..9bcdca2fb 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.cpp +++ b/src/video_core/renderer_vulkan/vk_fsr.cpp @@ -205,10 +205,9 @@ void FSR::CreateDescriptorSets() { void FSR::CreateImages() { images.resize(image_count * 2); image_views.resize(image_count * 2); - buffer_commits.resize(image_count * 2); for (size_t i = 0; i < image_count * 2; ++i) { - images[i] = device.GetLogical().CreateImage(VkImageCreateInfo{ + images[i] = memory_allocator.CreateImage(VkImageCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -231,7 +230,6 @@ void FSR::CreateImages() { .pQueueFamilyIndices = nullptr, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); - buffer_commits[i] = memory_allocator.Commit(images[i], MemoryUsage::DeviceLocal); image_views[i] = device.GetLogical().CreateImageView(VkImageViewCreateInfo{ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_fsr.h b/src/video_core/renderer_vulkan/vk_fsr.h index 5d872861f..8bb9fc23a 100644 --- a/src/video_core/renderer_vulkan/vk_fsr.h +++ b/src/video_core/renderer_vulkan/vk_fsr.h @@ -47,7 +47,6 @@ private: vk::Sampler sampler; std::vector images; std::vector image_views; - std::vector buffer_commits; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp index 10ace0420..d681bd22a 100644 --- a/src/video_core/renderer_vulkan/vk_present_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp @@ -181,7 +181,7 @@ void PresentManager::RecreateFrame(Frame* frame, u32 width, u32 height, bool is_ frame->height = height; frame->is_srgb = is_srgb; - frame->image = dld.CreateImage({ + frame->image = memory_allocator.CreateImage({ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, @@ -204,8 +204,6 @@ void PresentManager::RecreateFrame(Frame* frame, u32 width, u32 height, bool is_ .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }); - frame->image_commit = memory_allocator.Commit(frame->image, MemoryUsage::DeviceLocal); - frame->image_view = dld.CreateImageView({ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_present_manager.h b/src/video_core/renderer_vulkan/vk_present_manager.h index 4ac2e2395..83e859416 100644 --- a/src/video_core/renderer_vulkan/vk_present_manager.h +++ b/src/video_core/renderer_vulkan/vk_present_manager.h @@ -29,7 +29,6 @@ struct Frame { vk::Image image; vk::ImageView image_view; vk::Framebuffer framebuffer; - MemoryCommit image_commit; vk::CommandBuffer cmdbuf; vk::Semaphore render_ready; vk::Fence present_done; diff --git a/src/video_core/renderer_vulkan/vk_smaa.cpp b/src/video_core/renderer_vulkan/vk_smaa.cpp index f8735189d..ff7c3a419 100644 --- a/src/video_core/renderer_vulkan/vk_smaa.cpp +++ b/src/video_core/renderer_vulkan/vk_smaa.cpp @@ -25,9 +25,7 @@ namespace { #define ARRAY_TO_SPAN(a) std::span(a, (sizeof(a) / sizeof(a[0]))) -std::pair CreateWrappedImage(const Device& device, - MemoryAllocator& allocator, - VkExtent2D dimensions, VkFormat format) { +vk::Image CreateWrappedImage(MemoryAllocator& allocator, VkExtent2D dimensions, VkFormat format) { const VkImageCreateInfo image_ci{ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, @@ -46,11 +44,7 @@ std::pair CreateWrappedImage(const Device& device, .pQueueFamilyIndices = nullptr, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }; - - auto image = device.GetLogical().CreateImage(image_ci); - auto commit = allocator.Commit(image, Vulkan::MemoryUsage::DeviceLocal); - - return std::make_pair(std::move(image), std::move(commit)); + return allocator.CreateImage(image_ci); } void TransitionImageLayout(vk::CommandBuffer& cmdbuf, VkImage image, VkImageLayout target_layout, @@ -531,10 +525,8 @@ void SMAA::CreateImages() { static constexpr VkExtent2D area_extent{AREATEX_WIDTH, AREATEX_HEIGHT}; static constexpr VkExtent2D search_extent{SEARCHTEX_WIDTH, SEARCHTEX_HEIGHT}; - std::tie(m_static_images[Area], m_static_buffer_commits[Area]) = - CreateWrappedImage(m_device, m_allocator, area_extent, VK_FORMAT_R8G8_UNORM); - std::tie(m_static_images[Search], m_static_buffer_commits[Search]) = - CreateWrappedImage(m_device, m_allocator, search_extent, VK_FORMAT_R8_UNORM); + m_static_images[Area] = CreateWrappedImage(m_allocator, area_extent, VK_FORMAT_R8G8_UNORM); + m_static_images[Search] = CreateWrappedImage(m_allocator, search_extent, VK_FORMAT_R8_UNORM); m_static_image_views[Area] = CreateWrappedImageView(m_device, m_static_images[Area], VK_FORMAT_R8G8_UNORM); @@ -544,12 +536,11 @@ void SMAA::CreateImages() { for (u32 i = 0; i < m_image_count; i++) { Images& images = m_dynamic_images.emplace_back(); - std::tie(images.images[Blend], images.buffer_commits[Blend]) = - CreateWrappedImage(m_device, m_allocator, m_extent, VK_FORMAT_R16G16B16A16_SFLOAT); - std::tie(images.images[Edges], images.buffer_commits[Edges]) = - CreateWrappedImage(m_device, m_allocator, m_extent, VK_FORMAT_R16G16_SFLOAT); - std::tie(images.images[Output], images.buffer_commits[Output]) = - CreateWrappedImage(m_device, m_allocator, m_extent, VK_FORMAT_R16G16B16A16_SFLOAT); + images.images[Blend] = + CreateWrappedImage(m_allocator, m_extent, VK_FORMAT_R16G16B16A16_SFLOAT); + images.images[Edges] = CreateWrappedImage(m_allocator, m_extent, VK_FORMAT_R16G16_SFLOAT); + images.images[Output] = + CreateWrappedImage(m_allocator, m_extent, VK_FORMAT_R16G16B16A16_SFLOAT); images.image_views[Blend] = CreateWrappedImageView(m_device, images.images[Blend], VK_FORMAT_R16G16B16A16_SFLOAT); diff --git a/src/video_core/renderer_vulkan/vk_smaa.h b/src/video_core/renderer_vulkan/vk_smaa.h index 99a369148..0e214258a 100644 --- a/src/video_core/renderer_vulkan/vk_smaa.h +++ b/src/video_core/renderer_vulkan/vk_smaa.h @@ -66,13 +66,11 @@ private: std::array m_pipelines{}; std::array m_renderpasses{}; - std::array m_static_buffer_commits; std::array m_static_images{}; std::array m_static_image_views{}; struct Images { vk::DescriptorSets descriptor_sets{}; - std::array buffer_commits; std::array images{}; std::array image_views{}; std::array framebuffers{}; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index f025f618b..10defe6cb 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -15,7 +15,6 @@ #include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" -#include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_render_pass_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -163,11 +162,12 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { }; } -[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info) { +[[nodiscard]] vk::Image MakeImage(const Device& device, const MemoryAllocator& allocator, + const ImageInfo& info) { if (info.type == ImageType::Buffer) { return vk::Image{}; } - return device.GetLogical().CreateImage(MakeImageCreateInfo(device, info)); + return allocator.CreateImage(MakeImageCreateInfo(device, info)); } [[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) { @@ -1266,8 +1266,8 @@ void TextureCacheRuntime::TickFrame() {} Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime_.scheduler}, - runtime{&runtime_}, original_image(MakeImage(runtime_.device, info)), - commit(runtime_.memory_allocator.Commit(original_image, MemoryUsage::DeviceLocal)), + runtime{&runtime_}, + original_image(MakeImage(runtime_.device, runtime_.memory_allocator, info)), aspect_mask(ImageAspectMask(info.format)) { if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) { if (Settings::values.async_astc.GetValue()) { @@ -1467,9 +1467,7 @@ bool Image::ScaleUp(bool ignore) { auto scaled_info = info; scaled_info.size.width = scaled_width; scaled_info.size.height = scaled_height; - scaled_image = MakeImage(runtime->device, scaled_info); - auto& allocator = runtime->memory_allocator; - scaled_commit = MemoryCommit(allocator.Commit(scaled_image, MemoryUsage::DeviceLocal)); + scaled_image = MakeImage(runtime->device, runtime->memory_allocator, scaled_info); ignore = false; } current_image = *scaled_image; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index f14525dcb..9481f2531 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -180,12 +180,10 @@ private: TextureCacheRuntime* runtime{}; vk::Image original_image; - MemoryCommit commit; std::vector storage_image_views; VkImageAspectFlags aspect_mask = 0; bool initialized = false; vk::Image scaled_image{}; - MemoryCommit scaled_commit{}; VkImage current_image{}; std::unique_ptr scale_framebuffer; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 123d3b1c4..9936f5658 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -210,6 +210,11 @@ public: return dld; } + /// Returns the VMA allocator. + VmaAllocator GetAllocator() const { + return allocator; + } + /// Returns the logical device. const vk::Device& GetLogical() const { return logical; diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index f87c99603..7f860cccd 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -15,6 +15,10 @@ #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +#define VMA_STATIC_VULKAN_FUNCTIONS 0 +#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 +#include + namespace Vulkan { namespace { struct Range { @@ -180,6 +184,24 @@ MemoryAllocator::MemoryAllocator(const Device& device_) MemoryAllocator::~MemoryAllocator() = default; +vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const { + const VmaAllocationCreateInfo alloc_info = { + .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT, + .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, + .requiredFlags = 0, + .preferredFlags = 0, + .pool = VK_NULL_HANDLE, + .pUserData = nullptr, + }; + + VkImage handle{}; + VmaAllocation allocation{}; + vk::Check( + vmaCreateImage(device.GetAllocator(), &ci, &alloc_info, &handle, &allocation, nullptr)); + return vk::Image(handle, *device.GetLogical(), device.GetAllocator(), allocation, + device.GetDispatchLoader()); +} + MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, MemoryUsage usage) { // Find the fastest memory flags we can afford with the current requirements const u32 type_mask = requirements.memoryTypeBits; @@ -205,14 +227,6 @@ MemoryCommit MemoryAllocator::Commit(const vk::Buffer& buffer, MemoryUsage usage return commit; } -MemoryCommit MemoryAllocator::Commit(const vk::Image& image, MemoryUsage usage) { - VkMemoryRequirements requirements = device.GetLogical().GetImageMemoryRequirements(*image); - requirements.size = Common::AlignUp(requirements.size, buffer_image_granularity); - auto commit = Commit(requirements, usage); - image.BindMemory(commit.Memory(), commit.Offset()); - return commit; -} - bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) { const u32 type = FindType(flags, type_mask).value(); vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({ diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h index 494f30f51..f9ee53cfb 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.h +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h @@ -80,6 +80,8 @@ public: MemoryAllocator& operator=(const MemoryAllocator&) = delete; MemoryAllocator(const MemoryAllocator&) = delete; + vk::Image CreateImage(const VkImageCreateInfo& ci) const; + /** * Commits a memory with the specified requirements. * @@ -93,9 +95,6 @@ public: /// Commits memory required by the buffer and binds it. MemoryCommit Commit(const vk::Buffer& buffer, MemoryUsage usage); - /// Commits memory required by the image and binds it. - MemoryCommit Commit(const vk::Image& image, MemoryUsage usage); - private: /// Tries to allocate a chunk of memory. bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 336f53700..5d088dc58 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -12,6 +12,10 @@ #include "video_core/vulkan_common/vulkan_wrapper.h" +#define VMA_STATIC_VULKAN_FUNCTIONS 0 +#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 +#include + namespace Vulkan::vk { namespace { @@ -547,6 +551,16 @@ DebugUtilsMessenger Instance::CreateDebugUtilsMessenger( return DebugUtilsMessenger(object, handle, *dld); } +void Image::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE, name); +} + +void Image::Release() const noexcept { + if (handle) { + vmaDestroyImage(allocator, handle, allocation); + } +} + void Buffer::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { Check(dld->vkBindBufferMemory(owner, handle, memory, offset)); } @@ -559,14 +573,6 @@ void BufferView::SetObjectNameEXT(const char* name) const { SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER_VIEW, name); } -void Image::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { - Check(dld->vkBindImageMemory(owner, handle, memory, offset)); -} - -void Image::SetObjectNameEXT(const char* name) const { - SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE, name); -} - void ImageView::SetObjectNameEXT(const char* name) const { SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE_VIEW, name); } @@ -713,12 +719,6 @@ BufferView Device::CreateBufferView(const VkBufferViewCreateInfo& ci) const { return BufferView(object, handle, *dld); } -Image Device::CreateImage(const VkImageCreateInfo& ci) const { - VkImage object; - Check(dld->vkCreateImage(handle, &ci, nullptr, &object)); - return Image(object, handle, *dld); -} - ImageView Device::CreateImageView(const VkImageViewCreateInfo& ci) const { VkImageView object; Check(dld->vkCreateImageView(handle, &ci, nullptr, &object)); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 4ff328a21..8ec708774 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -32,6 +32,9 @@ #pragma warning(disable : 26812) // Disable prefer enum class over enum #endif +VK_DEFINE_HANDLE(VmaAllocator) +VK_DEFINE_HANDLE(VmaAllocation) + namespace Vulkan::vk { /** @@ -616,6 +619,60 @@ public: } }; +class Image { +public: + explicit Image(VkImage handle_, VkDevice owner_, VmaAllocator allocator_, + VmaAllocation allocation_, const DeviceDispatch& dld_) noexcept + : handle{handle_}, owner{owner_}, allocator{allocator_}, + allocation{allocation_}, dld{&dld_} {} + Image() = default; + + Image(const Image&) = delete; + Image& operator=(const Image&) = delete; + + Image(Image&& rhs) noexcept + : handle{std::exchange(rhs.handle, nullptr)}, owner{rhs.owner}, allocator{rhs.allocator}, + allocation{rhs.allocation}, dld{rhs.dld} {} + + Image& operator=(Image&& rhs) noexcept { + Release(); + handle = std::exchange(rhs.handle, nullptr); + owner = rhs.owner; + allocator = rhs.allocator; + allocation = rhs.allocation; + dld = rhs.dld; + return *this; + } + + ~Image() noexcept { + Release(); + } + + VkImage operator*() const noexcept { + return handle; + } + + void reset() noexcept { + Release(); + handle = nullptr; + } + + explicit operator bool() const noexcept { + return handle != nullptr; + } + + void SetObjectNameEXT(const char* name) const; + +private: + void Release() const noexcept; + + VkImage handle = nullptr; + VkDevice owner = nullptr; + VmaAllocator allocator = nullptr; + VmaAllocation allocation = nullptr; + const DeviceDispatch* dld = nullptr; +}; + class Queue { public: /// Construct an empty queue handle. @@ -658,17 +715,6 @@ public: void SetObjectNameEXT(const char* name) const; }; -class Image : public Handle { - using Handle::Handle; - -public: - /// Attaches a memory allocation. - void BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const; - - /// Set object name. - void SetObjectNameEXT(const char* name) const; -}; - class ImageView : public Handle { using Handle::Handle; @@ -844,8 +890,6 @@ public: BufferView CreateBufferView(const VkBufferViewCreateInfo& ci) const; - Image CreateImage(const VkImageCreateInfo& ci) const; - ImageView CreateImageView(const VkImageViewCreateInfo& ci) const; Semaphore CreateSemaphore() const; -- cgit v1.2.3 From 7b2f680468bbac206f96b26a1300939be90f5f1b Mon Sep 17 00:00:00 2001 From: GPUCode Date: Sat, 27 May 2023 17:09:17 +0300 Subject: renderer_vulkan: Use VMA for buffers --- src/video_core/renderer_vulkan/renderer_vulkan.cpp | 9 +- src/video_core/renderer_vulkan/vk_blit_screen.cpp | 6 +- src/video_core/renderer_vulkan/vk_blit_screen.h | 1 - src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 81 +++++++++------- src/video_core/renderer_vulkan/vk_buffer_cache.h | 2 - src/video_core/renderer_vulkan/vk_smaa.cpp | 12 +-- .../renderer_vulkan/vk_staging_buffer_pool.cpp | 105 ++------------------ .../renderer_vulkan/vk_staging_buffer_pool.h | 4 +- .../renderer_vulkan/vk_texture_cache.cpp | 9 +- src/video_core/renderer_vulkan/vk_texture_cache.h | 1 - src/video_core/renderer_vulkan/vk_turbo_mode.cpp | 8 +- src/video_core/vulkan_common/vulkan_device.cpp | 1 + .../vulkan_common/vulkan_memory_allocator.cpp | 107 ++++++++++++++++----- .../vulkan_common/vulkan_memory_allocator.h | 12 ++- src/video_core/vulkan_common/vulkan_wrapper.cpp | 24 +++-- src/video_core/vulkan_common/vulkan_wrapper.h | 91 +++++++++++++++--- 16 files changed, 262 insertions(+), 211 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index e569523b6..ddf28ca28 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -233,8 +233,8 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; - const vk::Buffer dst_buffer = device.GetLogical().CreateBuffer(dst_buffer_info); - MemoryCommit dst_buffer_memory = memory_allocator.Commit(dst_buffer, MemoryUsage::Download); + const vk::Buffer dst_buffer = + memory_allocator.CreateBuffer(dst_buffer_info, MemoryUsage::Download); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([&](vk::CommandBuffer cmdbuf) { @@ -308,8 +308,9 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr scheduler.Finish(); // Copy backing image data to the QImage screenshot buffer - const auto dst_memory_map = dst_buffer_memory.Map(); - std::memcpy(renderer_settings.screenshot_bits, dst_memory_map.data(), dst_memory_map.size()); + dst_buffer.Invalidate(); + std::memcpy(renderer_settings.screenshot_bits, dst_buffer.Mapped().data(), + dst_buffer.Mapped().size()); renderer_settings.screenshot_complete_callback(false); renderer_settings.screenshot_requested = false; } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 82ca81c7e..ad3b29f0e 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -162,7 +162,7 @@ void BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, SetUniformData(data, layout); SetVertexData(data, framebuffer, layout); - const std::span mapped_span = buffer_commit.Map(); + const std::span mapped_span = buffer.Mapped(); std::memcpy(mapped_span.data(), &data, sizeof(data)); if (!use_accelerated) { @@ -1074,7 +1074,6 @@ void BlitScreen::ReleaseRawImages() { aa_image_view.reset(); aa_image.reset(); buffer.reset(); - buffer_commit = MemoryCommit{}; } void BlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer) { @@ -1090,8 +1089,7 @@ void BlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer .pQueueFamilyIndices = nullptr, }; - buffer = device.GetLogical().CreateBuffer(ci); - buffer_commit = memory_allocator.Commit(buffer, MemoryUsage::Upload); + buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::Upload); } void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index 7fcfa9976..8365b5668 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -142,7 +142,6 @@ private: vk::Sampler sampler; vk::Buffer buffer; - MemoryCommit buffer_commit; std::vector resource_ticks; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 8c33722d3..67356c679 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -50,7 +50,7 @@ size_t BytesPerIndex(VkIndexType index_type) { } } -vk::Buffer CreateBuffer(const Device& device, u64 size) { +vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allocator, u64 size) { VkBufferUsageFlags flags = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | @@ -60,7 +60,7 @@ vk::Buffer CreateBuffer(const Device& device, u64 size) { if (device.IsExtTransformFeedbackSupported()) { flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } - return device.GetLogical().CreateBuffer({ + const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -69,7 +69,8 @@ vk::Buffer CreateBuffer(const Device& device, u64 size) { .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); + }; + return memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); } } // Anonymous namespace @@ -79,8 +80,8 @@ Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params) Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_) : VideoCommon::BufferBase(rasterizer_, cpu_addr_, size_bytes_), - device{&runtime.device}, buffer{CreateBuffer(*device, SizeBytes())}, - commit{runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal)} { + device{&runtime.device}, buffer{ + CreateBuffer(*device, runtime.memory_allocator, SizeBytes())} { if (runtime.device.HasDebuggingToolAttached()) { buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str()); } @@ -138,7 +139,7 @@ public: const u32 num_first_offset_copies = 4; const size_t bytes_per_index = BytesPerIndex(index_type); const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies; - buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -147,14 +148,21 @@ public: .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); + }; + buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); if (device.HasDebuggingToolAttached()) { buffer.SetObjectNameEXT("Quad LUT"); } - memory_commit = memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); - const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload); - u8* staging_data = staging.mapped_span.data(); + const bool host_visible = buffer.IsHostVisible(); + const StagingBufferRef staging = [&] { + if (host_visible) { + return StagingBufferRef{}; + } + return staging_pool.Request(size_bytes, MemoryUsage::Upload); + }(); + + u8* staging_data = host_visible ? buffer.Mapped().data() : staging.mapped_span.data(); const size_t quad_size = bytes_per_index * 6; for (u32 first = 0; first < num_first_offset_copies; ++first) { @@ -164,29 +172,33 @@ public: } } - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset, - dst_buffer = *buffer, size_bytes](vk::CommandBuffer cmdbuf) { - const VkBufferCopy copy{ - .srcOffset = src_offset, - .dstOffset = 0, - .size = size_bytes, - }; - const VkBufferMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_INDEX_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = dst_buffer, - .offset = 0, - .size = size_bytes, - }; - cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, write_barrier); - }); + if (!host_visible) { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset, + dst_buffer = *buffer, size_bytes](vk::CommandBuffer cmdbuf) { + const VkBufferCopy copy{ + .srcOffset = src_offset, + .dstOffset = 0, + .size = size_bytes, + }; + const VkBufferMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_INDEX_READ_BIT, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = dst_buffer, + .offset = 0, + .size = size_bytes, + }; + cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, write_barrier); + }); + } else { + buffer.Flush(); + } } void BindBuffer(u32 first) { @@ -587,11 +599,10 @@ void BufferCacheRuntime::ReserveNullBuffer() { create_info.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; - null_buffer = device.GetLogical().CreateBuffer(create_info); + null_buffer = memory_allocator.CreateBuffer(create_info, MemoryUsage::DeviceLocal); if (device.HasDebuggingToolAttached()) { null_buffer.SetObjectNameEXT("Null buffer"); } - null_buffer_commit = memory_allocator.Commit(null_buffer, MemoryUsage::DeviceLocal); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([buffer = *null_buffer](vk::CommandBuffer cmdbuf) { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index cdeef8846..95446c732 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -48,7 +48,6 @@ private: const Device* device{}; vk::Buffer buffer; - MemoryCommit commit; std::vector views; }; @@ -142,7 +141,6 @@ private: std::shared_ptr quad_strip_index_buffer; vk::Buffer null_buffer; - MemoryCommit null_buffer_commit; std::unique_ptr uint8_pass; QuadIndexedPass quad_index_pass; diff --git a/src/video_core/renderer_vulkan/vk_smaa.cpp b/src/video_core/renderer_vulkan/vk_smaa.cpp index ff7c3a419..5efd7d66e 100644 --- a/src/video_core/renderer_vulkan/vk_smaa.cpp +++ b/src/video_core/renderer_vulkan/vk_smaa.cpp @@ -76,7 +76,7 @@ void TransitionImageLayout(vk::CommandBuffer& cmdbuf, VkImage image, VkImageLayo void UploadImage(const Device& device, MemoryAllocator& allocator, Scheduler& scheduler, vk::Image& image, VkExtent2D dimensions, VkFormat format, std::span initial_contents = {}) { - auto upload_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + const VkBufferCreateInfo upload_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -85,9 +85,10 @@ void UploadImage(const Device& device, MemoryAllocator& allocator, Scheduler& sc .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); - auto upload_commit = allocator.Commit(upload_buffer, MemoryUsage::Upload); - std::ranges::copy(initial_contents, upload_commit.Map().begin()); + }; + auto upload_buffer = allocator.CreateBuffer(upload_ci, MemoryUsage::Upload); + std::ranges::copy(initial_contents, upload_buffer.Mapped().begin()); + upload_buffer.Flush(); const std::array regions{{{ .bufferOffset = 0, @@ -111,9 +112,6 @@ void UploadImage(const Device& device, MemoryAllocator& allocator, Scheduler& sc VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); }); scheduler.Finish(); - - // This should go out of scope before the commit - auto upload_buffer2 = std::move(upload_buffer); } vk::ImageView CreateWrappedImageView(const Device& device, vk::Image& image, VkFormat format) { diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 74ca77216..62b251a9b 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -30,55 +30,6 @@ constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB; constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB; constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; -constexpr VkMemoryPropertyFlags HOST_FLAGS = - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; -constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS; - -bool IsStreamHeap(VkMemoryHeap heap) noexcept { - return STREAM_BUFFER_SIZE < (heap.size * 2) / 3; -} - -std::optional FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask, - VkMemoryPropertyFlags flags) noexcept { - for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) { - if (((type_mask >> type_index) & 1) == 0) { - // Memory type is incompatible - continue; - } - const VkMemoryType& memory_type = props.memoryTypes[type_index]; - if ((memory_type.propertyFlags & flags) != flags) { - // Memory type doesn't have the flags we want - continue; - } - if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) { - // Memory heap is not suitable for streaming - continue; - } - // Success! - return type_index; - } - return std::nullopt; -} - -u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask, - bool try_device_local) { - std::optional type; - if (try_device_local) { - // Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this - type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS); - if (type) { - return *type; - } - } - // Otherwise try without the DEVICE_LOCAL_BIT - type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS); - if (type) { - return *type; - } - // This should never happen, and in case it does, signal it as an out of memory situation - throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY); -} - size_t Region(size_t iterator) noexcept { return iterator / REGION_SIZE; } @@ -87,8 +38,7 @@ size_t Region(size_t iterator) noexcept { StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, Scheduler& scheduler_) : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { - const vk::Device& dev = device.GetLogical(); - stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{ + const VkBufferCreateInfo stream_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -99,46 +49,13 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); - if (device.HasDebuggingToolAttached()) { - stream_buffer.SetObjectNameEXT("Stream Buffer"); - } - VkMemoryDedicatedRequirements dedicated_reqs{ - .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS, - .pNext = nullptr, - .prefersDedicatedAllocation = VK_FALSE, - .requiresDedicatedAllocation = VK_FALSE, - }; - const auto requirements = dev.GetBufferMemoryRequirements(*stream_buffer, &dedicated_reqs); - const bool make_dedicated = dedicated_reqs.prefersDedicatedAllocation == VK_TRUE || - dedicated_reqs.requiresDedicatedAllocation == VK_TRUE; - const VkMemoryDedicatedAllocateInfo dedicated_info{ - .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, - .pNext = nullptr, - .image = nullptr, - .buffer = *stream_buffer, }; - const auto memory_properties = device.GetPhysical().GetMemoryProperties().memoryProperties; - VkMemoryAllocateInfo stream_memory_info{ - .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .pNext = make_dedicated ? &dedicated_info : nullptr, - .allocationSize = requirements.size, - .memoryTypeIndex = - FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, true), - }; - stream_memory = dev.TryAllocateMemory(stream_memory_info); - if (!stream_memory) { - LOG_INFO(Render_Vulkan, "Dynamic memory allocation failed, trying with system memory"); - stream_memory_info.memoryTypeIndex = - FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits, false); - stream_memory = dev.AllocateMemory(stream_memory_info); - } - + stream_buffer = memory_allocator.CreateBuffer(stream_ci, MemoryUsage::Stream); if (device.HasDebuggingToolAttached()) { - stream_memory.SetObjectNameEXT("Stream Buffer Memory"); + stream_buffer.SetObjectNameEXT("Stream Buffer"); } - stream_buffer.BindMemory(*stream_memory, 0); - stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE); + stream_pointer = stream_buffer.Mapped(); + ASSERT_MSG(!stream_pointer.empty(), "Stream buffer must be host visible!"); } StagingBufferPool::~StagingBufferPool() = default; @@ -199,7 +116,7 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) { return StagingBufferRef{ .buffer = *stream_buffer, .offset = static_cast(offset), - .mapped_span = std::span(stream_pointer + offset, size), + .mapped_span = stream_pointer.subspan(offset, size), .usage{}, .log2_level{}, .index{}, @@ -247,7 +164,7 @@ std::optional StagingBufferPool::TryGetReservedBuffer(size_t s StagingBufferRef StagingBufferPool::CreateStagingBuffer(size_t size, MemoryUsage usage, bool deferred) { const u32 log2 = Common::Log2Ceil64(size); - vk::Buffer buffer = device.GetLogical().CreateBuffer({ + const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -259,17 +176,15 @@ StagingBufferRef StagingBufferPool::CreateStagingBuffer(size_t size, MemoryUsage .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); + }; + vk::Buffer buffer = memory_allocator.CreateBuffer(buffer_ci, usage); if (device.HasDebuggingToolAttached()) { ++buffer_index; buffer.SetObjectNameEXT(fmt::format("Staging Buffer {}", buffer_index).c_str()); } - MemoryCommit commit = memory_allocator.Commit(buffer, usage); - const std::span mapped_span = IsHostVisible(usage) ? commit.Map() : std::span{}; - + const std::span mapped_span = buffer.Mapped(); StagingBuffer& entry = GetCache(usage)[log2].entries.emplace_back(StagingBuffer{ .buffer = std::move(buffer), - .commit = std::move(commit), .mapped_span = mapped_span, .usage = usage, .log2_level = log2, diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 4fd15f11a..5f69f08b1 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -46,7 +46,6 @@ private: struct StagingBuffer { vk::Buffer buffer; - MemoryCommit commit; std::span mapped_span; MemoryUsage usage; u32 log2_level; @@ -97,8 +96,7 @@ private: Scheduler& scheduler; vk::Buffer stream_buffer; - vk::DeviceMemory stream_memory; - u8* stream_pointer = nullptr; + std::span stream_pointer; size_t iterator = 0; size_t used_iterator = 0; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 10defe6cb..28985b6fe 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -839,14 +839,14 @@ bool TextureCacheRuntime::ShouldReinterpret(Image& dst, Image& src) { VkBuffer TextureCacheRuntime::GetTemporaryBuffer(size_t needed_size) { const auto level = (8 * sizeof(size_t)) - std::countl_zero(needed_size - 1ULL); - if (buffer_commits[level]) { + if (buffers[level]) { return *buffers[level]; } const auto new_size = Common::NextPow2(needed_size); static constexpr VkBufferUsageFlags flags = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; - buffers[level] = device.GetLogical().CreateBuffer({ + const VkBufferCreateInfo temp_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -855,9 +855,8 @@ VkBuffer TextureCacheRuntime::GetTemporaryBuffer(size_t needed_size) { .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); - buffer_commits[level] = std::make_unique( - memory_allocator.Commit(buffers[level], MemoryUsage::DeviceLocal)); + }; + buffers[level] = memory_allocator.CreateBuffer(temp_ci, MemoryUsage::DeviceLocal); return *buffers[level]; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 9481f2531..220943116 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -116,7 +116,6 @@ public: static constexpr size_t indexing_slots = 8 * sizeof(size_t); std::array buffers{}; - std::array, indexing_slots> buffer_commits{}; }; class Image : public VideoCommon::ImageBase { diff --git a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp index 6417d7e31..460d8d59d 100644 --- a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp +++ b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp @@ -41,7 +41,7 @@ void TurboMode::Run(std::stop_token stop_token) { auto& dld = m_device.GetLogical(); // Allocate buffer. 2MiB should be sufficient. - auto buffer = dld.CreateBuffer(VkBufferCreateInfo{ + const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -50,10 +50,8 @@ void TurboMode::Run(std::stop_token stop_token) { .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); - - // Commit some device local memory for the buffer. - auto commit = m_allocator.Commit(buffer, MemoryUsage::DeviceLocal); + }; + vk::Buffer buffer = m_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); // Create the descriptor pool to contain our descriptor. static constexpr VkDescriptorPoolSize pool_size{ diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 631d5e378..541f0c1da 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -603,6 +603,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR }; const VmaAllocatorCreateInfo allocator_info = { + .flags = VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT, .physicalDevice = physical, .device = *logical, .pVulkanFunctions = &functions, diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index 7f860cccd..d2e1ef58e 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -51,11 +51,59 @@ struct Range { case MemoryUsage::Download: return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + case MemoryUsage::Stream: + return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } ASSERT_MSG(false, "Invalid memory usage={}", usage); return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } +[[nodiscard]] VkMemoryPropertyFlags MemoryUsageRequiredVmaFlags(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::DeviceLocal: + return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + case MemoryUsage::Upload: + case MemoryUsage::Stream: + return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + case MemoryUsage::Download: + return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + ASSERT_MSG(false, "Invalid memory usage={}", usage); + return {}; +} + +[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferedVmaFlags(MemoryUsage usage) { + return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT + : VkMemoryPropertyFlags{}; +} + +[[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::Upload: + case MemoryUsage::Stream: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; + case MemoryUsage::Download: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + case MemoryUsage::DeviceLocal: + return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | + VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT; + } + return {}; +} + +[[nodiscard]] VmaMemoryUsage MemoryUsageVma(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::DeviceLocal: + case MemoryUsage::Stream: + return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + case MemoryUsage::Upload: + case MemoryUsage::Download: + return VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + } + return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; +} + } // Anonymous namespace class MemoryAllocation { @@ -178,17 +226,18 @@ void MemoryCommit::Release() { } MemoryAllocator::MemoryAllocator(const Device& device_) - : device{device_}, properties{device_.GetPhysical().GetMemoryProperties().memoryProperties}, + : device{device_}, allocator{device.GetAllocator()}, + properties{device_.GetPhysical().GetMemoryProperties().memoryProperties}, buffer_image_granularity{ device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {} MemoryAllocator::~MemoryAllocator() = default; vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const { - const VmaAllocationCreateInfo alloc_info = { + const VmaAllocationCreateInfo alloc_ci = { .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT, .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, - .requiredFlags = 0, + .requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, .preferredFlags = 0, .pool = VK_NULL_HANDLE, .pUserData = nullptr, @@ -196,12 +245,40 @@ vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const { VkImage handle{}; VmaAllocation allocation{}; - vk::Check( - vmaCreateImage(device.GetAllocator(), &ci, &alloc_info, &handle, &allocation, nullptr)); - return vk::Image(handle, *device.GetLogical(), device.GetAllocator(), allocation, + + vk::Check(vmaCreateImage(allocator, &ci, &alloc_ci, &handle, &allocation, nullptr)); + + return vk::Image(handle, *device.GetLogical(), allocator, allocation, device.GetDispatchLoader()); } +vk::Buffer MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsage usage) const { + const VmaAllocationCreateInfo alloc_ci = { + .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT | + MemoryUsageVmaFlags(usage), + .usage = MemoryUsageVma(usage), + .requiredFlags = MemoryUsageRequiredVmaFlags(usage), + .preferredFlags = MemoryUsagePreferedVmaFlags(usage), + .pool = VK_NULL_HANDLE, + .pUserData = nullptr, + }; + + VkBuffer handle{}; + VmaAllocationInfo alloc_info{}; + VmaAllocation allocation{}; + VkMemoryPropertyFlags property_flags{}; + + vk::Check(vmaCreateBuffer(allocator, &ci, &alloc_ci, &handle, &allocation, &alloc_info)); + vmaGetAllocationMemoryProperties(allocator, allocation, &property_flags); + + u8* data = reinterpret_cast(alloc_info.pMappedData); + const std::span mapped_data = data ? std::span{data, ci.size} : std::span{}; + const bool is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + + return vk::Buffer(handle, *device.GetLogical(), allocator, allocation, mapped_data, is_coherent, + device.GetDispatchLoader()); +} + MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, MemoryUsage usage) { // Find the fastest memory flags we can afford with the current requirements const u32 type_mask = requirements.memoryTypeBits; @@ -221,12 +298,6 @@ MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, M return TryCommit(requirements, flags).value(); } -MemoryCommit MemoryAllocator::Commit(const vk::Buffer& buffer, MemoryUsage usage) { - auto commit = Commit(device.GetLogical().GetBufferMemoryRequirements(*buffer), usage); - buffer.BindMemory(commit.Memory(), commit.Offset()); - return commit; -} - bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) { const u32 type = FindType(flags, type_mask).value(); vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({ @@ -302,16 +373,4 @@ std::optional MemoryAllocator::FindType(VkMemoryPropertyFlags flags, u32 ty return std::nullopt; } -bool IsHostVisible(MemoryUsage usage) noexcept { - switch (usage) { - case MemoryUsage::DeviceLocal: - return false; - case MemoryUsage::Upload: - case MemoryUsage::Download: - return true; - } - ASSERT_MSG(false, "Invalid memory usage={}", usage); - return false; -} - } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h index f9ee53cfb..f449bc8d0 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.h +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h @@ -9,6 +9,8 @@ #include "common/common_types.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +VK_DEFINE_HANDLE(VmaAllocator) + namespace Vulkan { class Device; @@ -17,9 +19,11 @@ class MemoryAllocation; /// Hints and requirements for the backing memory type of a commit enum class MemoryUsage { - DeviceLocal, ///< Hints device local usages, fastest memory type to read and write from the GPU + DeviceLocal, ///< Requests device local host visible buffer, falling back to device local + ///< memory. Upload, ///< Requires a host visible memory type optimized for CPU to GPU uploads Download, ///< Requires a host visible memory type optimized for GPU to CPU readbacks + Stream, ///< Requests device local host visible buffer, falling back host memory. }; /// Ownership handle of a memory commitment. @@ -82,6 +86,8 @@ public: vk::Image CreateImage(const VkImageCreateInfo& ci) const; + vk::Buffer CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsage usage) const; + /** * Commits a memory with the specified requirements. * @@ -113,13 +119,11 @@ private: std::optional FindType(VkMemoryPropertyFlags flags, u32 type_mask) const; const Device& device; ///< Device handle. + VmaAllocator allocator; ///< Vma allocator. const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. std::vector> allocations; ///< Current allocations. VkDeviceSize buffer_image_granularity; // The granularity for adjacent offsets between buffers // and optimal images }; -/// Returns true when a memory usage is guaranteed to be host visible. -bool IsHostVisible(MemoryUsage usage) noexcept; - } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 5d088dc58..c01a9478e 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -561,14 +561,28 @@ void Image::Release() const noexcept { } } -void Buffer::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { - Check(dld->vkBindBufferMemory(owner, handle, memory, offset)); +void Buffer::Flush() const { + if (!is_coherent) { + vmaFlushAllocation(allocator, allocation, 0, VK_WHOLE_SIZE); + } +} + +void Buffer::Invalidate() const { + if (!is_coherent) { + vmaInvalidateAllocation(allocator, allocation, 0, VK_WHOLE_SIZE); + } } void Buffer::SetObjectNameEXT(const char* name) const { SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER, name); } +void Buffer::Release() const noexcept { + if (handle) { + vmaDestroyBuffer(allocator, handle, allocation); + } +} + void BufferView::SetObjectNameEXT(const char* name) const { SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER_VIEW, name); } @@ -707,12 +721,6 @@ Queue Device::GetQueue(u32 family_index) const noexcept { return Queue(queue, *dld); } -Buffer Device::CreateBuffer(const VkBufferCreateInfo& ci) const { - VkBuffer object; - Check(dld->vkCreateBuffer(handle, &ci, nullptr, &object)); - return Buffer(object, handle, *dld); -} - BufferView Device::CreateBufferView(const VkBufferViewCreateInfo& ci) const { VkBufferView object; Check(dld->vkCreateBufferView(handle, &ci, nullptr, &object)); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 8ec708774..44fce47a5 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -673,6 +673,84 @@ private: const DeviceDispatch* dld = nullptr; }; +class Buffer { +public: + explicit Buffer(VkBuffer handle_, VkDevice owner_, VmaAllocator allocator_, + VmaAllocation allocation_, std::span mapped_, bool is_coherent_, + const DeviceDispatch& dld_) noexcept + : handle{handle_}, owner{owner_}, allocator{allocator_}, + allocation{allocation_}, mapped{mapped_}, is_coherent{is_coherent_}, dld{&dld_} {} + Buffer() = default; + + Buffer(const Buffer&) = delete; + Buffer& operator=(const Buffer&) = delete; + + Buffer(Buffer&& rhs) noexcept + : handle{std::exchange(rhs.handle, nullptr)}, owner{rhs.owner}, allocator{rhs.allocator}, + allocation{rhs.allocation}, mapped{rhs.mapped}, + is_coherent{rhs.is_coherent}, dld{rhs.dld} {} + + Buffer& operator=(Buffer&& rhs) noexcept { + Release(); + handle = std::exchange(rhs.handle, nullptr); + owner = rhs.owner; + allocator = rhs.allocator; + allocation = rhs.allocation; + mapped = rhs.mapped; + is_coherent = rhs.is_coherent; + dld = rhs.dld; + return *this; + } + + ~Buffer() noexcept { + Release(); + } + + VkBuffer operator*() const noexcept { + return handle; + } + + void reset() noexcept { + Release(); + handle = nullptr; + } + + explicit operator bool() const noexcept { + return handle != nullptr; + } + + /// Returns the host mapped memory, an empty span otherwise. + std::span Mapped() noexcept { + return mapped; + } + + std::span Mapped() const noexcept { + return mapped; + } + + /// Returns true if the buffer is mapped to the host. + bool IsHostVisible() const noexcept { + return !mapped.empty(); + } + + void Flush() const; + + void Invalidate() const; + + void SetObjectNameEXT(const char* name) const; + +private: + void Release() const noexcept; + + VkBuffer handle = nullptr; + VkDevice owner = nullptr; + VmaAllocator allocator = nullptr; + VmaAllocation allocation = nullptr; + std::span mapped = {}; + bool is_coherent = false; + const DeviceDispatch* dld = nullptr; +}; + class Queue { public: /// Construct an empty queue handle. @@ -696,17 +774,6 @@ private: const DeviceDispatch* dld = nullptr; }; -class Buffer : public Handle { - using Handle::Handle; - -public: - /// Attaches a memory allocation. - void BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const; - - /// Set object name. - void SetObjectNameEXT(const char* name) const; -}; - class BufferView : public Handle { using Handle::Handle; @@ -886,8 +953,6 @@ public: Queue GetQueue(u32 family_index) const noexcept; - Buffer CreateBuffer(const VkBufferCreateInfo& ci) const; - BufferView CreateBufferView(const VkBufferViewCreateInfo& ci) const; ImageView CreateImageView(const VkImageViewCreateInfo& ci) const; -- cgit v1.2.3 From ee0d68300e68a221d9930935f26d0c96be79357b Mon Sep 17 00:00:00 2001 From: GPUCode Date: Sun, 18 Jun 2023 12:27:31 +0300 Subject: renderer_vulkan: Add missing initializers --- externals/vma/vma.cpp | 2 ++ src/video_core/vulkan_common/vulkan_device.cpp | 12 ++++++++---- src/video_core/vulkan_common/vulkan_memory_allocator.cpp | 6 +++++- 3 files changed, 15 insertions(+), 5 deletions(-) (limited to 'src/video_core') diff --git a/externals/vma/vma.cpp b/externals/vma/vma.cpp index 7f7df9cff..ff1acc320 100644 --- a/externals/vma/vma.cpp +++ b/externals/vma/vma.cpp @@ -2,4 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #define VMA_IMPLEMENTATION +#define VMA_STATIC_VULKAN_FUNCTIONS 0 +#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 #include \ No newline at end of file diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 541f0c1da..94dd1aa14 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -597,18 +597,22 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR graphics_queue = logical.GetQueue(graphics_family); present_queue = logical.GetQueue(present_family); - const VmaVulkanFunctions functions = { - .vkGetInstanceProcAddr = dld.vkGetInstanceProcAddr, - .vkGetDeviceProcAddr = dld.vkGetDeviceProcAddr, - }; + VmaVulkanFunctions functions{}; + functions.vkGetInstanceProcAddr = dld.vkGetInstanceProcAddr; + functions.vkGetDeviceProcAddr = dld.vkGetDeviceProcAddr; const VmaAllocatorCreateInfo allocator_info = { .flags = VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT, .physicalDevice = physical, .device = *logical, + .preferredLargeHeapBlockSize = 0, + .pAllocationCallbacks = nullptr, + .pDeviceMemoryCallbacks = nullptr, + .pHeapSizeLimit = nullptr, .pVulkanFunctions = &functions, .instance = instance, .vulkanApiVersion = VK_API_VERSION_1_1, + .pTypeExternalMemoryHandleTypes = nullptr, }; vk::Check(vmaCreateAllocator(&allocator_info, &allocator)); diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index d2e1ef58e..20d36680c 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -75,7 +75,7 @@ struct Range { [[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferedVmaFlags(MemoryUsage usage) { return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT - : VkMemoryPropertyFlags{}; + : VkMemoryPropertyFlagBits{}; } [[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) { @@ -239,8 +239,10 @@ vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const { .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, .requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, .preferredFlags = 0, + .memoryTypeBits = 0, .pool = VK_NULL_HANDLE, .pUserData = nullptr, + .priority = 0.f, }; VkImage handle{}; @@ -259,8 +261,10 @@ vk::Buffer MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsa .usage = MemoryUsageVma(usage), .requiredFlags = MemoryUsageRequiredVmaFlags(usage), .preferredFlags = MemoryUsagePreferedVmaFlags(usage), + .memoryTypeBits = 0, .pool = VK_NULL_HANDLE, .pUserData = nullptr, + .priority = 0.f, }; VkBuffer handle{}; -- cgit v1.2.3 From 346c253cd2397ac152fd10f6b99d6af79349a77f Mon Sep 17 00:00:00 2001 From: lat9nq <22451773+lat9nq@users.noreply.github.com> Date: Thu, 15 Jun 2023 16:16:36 -0400 Subject: video_core: Formalize HasBrokenCompute Also limits it to only affected Intel proprietrary driver versions. vulkan_device: Move broken compute determination vk_device: Remove errant back quote --- .../renderer_vulkan/vk_pipeline_cache.cpp | 5 +---- src/video_core/vulkan_common/vulkan_device.cpp | 2 ++ src/video_core/vulkan_common/vulkan_device.h | 23 ++++++++++++++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 18e040a1b..ee2c33131 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -705,10 +705,7 @@ std::unique_ptr PipelineCache::CreateComputePipeline( std::unique_ptr PipelineCache::CreateComputePipeline( ShaderPools& pools, const ComputePipelineCacheKey& key, Shader::Environment& env, PipelineStatistics* statistics, bool build_in_parallel) try { - // TODO: Remove this when Intel fixes their shader compiler. - // https://github.com/IGCIT/Intel-GPU-Community-Issue-Tracker-IGCIT/issues/159 - if (device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS && - !Settings::values.enable_compute_pipelines.GetValue()) { + if (device.HasBrokenCompute() && !Settings::values.enable_compute_pipelines.GetValue()) { LOG_ERROR(Render_Vulkan, "Skipping 0x{:016x}", key.Hash()); return nullptr; } diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index dcedf4425..e38e34bc8 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -562,6 +562,8 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR LOG_WARNING(Render_Vulkan, "Intel proprietary drivers do not support MSAA image blits"); cant_blit_msaa = true; } + has_broken_compute = + CheckBrokenCompute(properties.driver.driverID, properties.properties.driverVersion); if (is_intel_anv || (is_qualcomm && !is_s8gen2)) { LOG_WARNING(Render_Vulkan, "Driver does not support native BGR format"); must_emulate_bgr565 = true; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 8c7e44fcb..e54828088 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -10,6 +10,7 @@ #include #include "common/common_types.h" +#include "common/logging/log.h" #include "common/settings.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -518,6 +519,11 @@ public: return has_renderdoc || has_nsight_graphics || Settings::values.renderer_debug.GetValue(); } + /// @returns True if compute pipelines can cause crashing. + bool HasBrokenCompute() const { + return has_broken_compute; + } + /// Returns true when the device does not properly support cube compatibility. bool HasBrokenCubeImageCompability() const { return has_broken_cube_compatibility; @@ -579,6 +585,22 @@ public: return supports_conditional_barriers; } + [[nodiscard]] static constexpr bool CheckBrokenCompute(VkDriverId driver_id, + u32 driver_version) { + if (driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { + const u32 major = VK_API_VERSION_MAJOR(driver_version); + const u32 minor = VK_API_VERSION_MINOR(driver_version); + const u32 patch = VK_API_VERSION_PATCH(driver_version); + if (major == 0 && minor == 405 && patch < 286) { + LOG_WARNING( + Render_Vulkan, + "Intel proprietary drivers 0.405.0 until 0.405.286 have broken compute"); + return true; + } + } + return {}; + } + private: /// Checks if the physical device is suitable and configures the object state /// with all necessary info about its properties. @@ -672,6 +694,7 @@ private: bool is_integrated{}; ///< Is GPU an iGPU. bool is_virtual{}; ///< Is GPU a virtual GPU. bool is_non_gpu{}; ///< Is SoftwareRasterizer, FPGA, non-GPU device. + bool has_broken_compute{}; ///< Compute shaders can cause crashes bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit bool has_renderdoc{}; ///< Has RenderDoc attached bool has_nsight_graphics{}; ///< Has Nsight Graphics attached -- cgit v1.2.3 From 711190bb6708f822a3bdb7afd30168d2cc3ed0e4 Mon Sep 17 00:00:00 2001 From: Kelebek1 Date: Mon, 19 Jun 2023 00:19:28 +0100 Subject: Use current GPU address when unmapping GPU pages, not the base --- src/video_core/memory_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 7b2cde7a7..b2f7e160a 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -111,7 +111,7 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cp [[maybe_unused]] const auto current_entry_type = GetEntry(current_gpu_addr); SetEntry(current_gpu_addr, entry_type); if (current_entry_type != entry_type) { - rasterizer->ModifyGPUMemory(unique_identifier, gpu_addr, page_size); + rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, page_size); } if constexpr (entry_type == EntryType::Mapped) { const VAddr current_cpu_addr = cpu_addr + offset; @@ -134,7 +134,7 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr [[maybe_unused]] const auto current_entry_type = GetEntry(current_gpu_addr); SetEntry(current_gpu_addr, entry_type); if (current_entry_type != entry_type) { - rasterizer->ModifyGPUMemory(unique_identifier, gpu_addr, big_page_size); + rasterizer->ModifyGPUMemory(unique_identifier, current_gpu_addr, big_page_size); } if constexpr (entry_type == EntryType::Mapped) { const VAddr current_cpu_addr = cpu_addr + offset; -- cgit v1.2.3 From 197e13d93d6740cda589d88804262d6bdd176a74 Mon Sep 17 00:00:00 2001 From: lat9nq <22451773+lat9nq@users.noreply.github.com> Date: Mon, 19 Jun 2023 17:33:30 -0400 Subject: video_core: Check broken compute earlier Checks it as the system is determining what settings to enable. Reduces the need to check settings while the system is running. --- src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 2 +- src/video_core/vulkan_common/vulkan_device.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index ee2c33131..a2cfb2105 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -705,7 +705,7 @@ std::unique_ptr PipelineCache::CreateComputePipeline( std::unique_ptr PipelineCache::CreateComputePipeline( ShaderPools& pools, const ComputePipelineCacheKey& key, Shader::Environment& env, PipelineStatistics* statistics, bool build_in_parallel) try { - if (device.HasBrokenCompute() && !Settings::values.enable_compute_pipelines.GetValue()) { + if (device.HasBrokenCompute()) { LOG_ERROR(Render_Vulkan, "Skipping 0x{:016x}", key.Hash()); return nullptr; } diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index e38e34bc8..fa9cde75b 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -563,7 +563,8 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR cant_blit_msaa = true; } has_broken_compute = - CheckBrokenCompute(properties.driver.driverID, properties.properties.driverVersion); + CheckBrokenCompute(properties.driver.driverID, properties.properties.driverVersion) && + !Settings::values.enable_compute_pipelines.GetValue(); if (is_intel_anv || (is_qualcomm && !is_s8gen2)) { LOG_WARNING(Render_Vulkan, "Driver does not support native BGR format"); must_emulate_bgr565 = true; -- cgit v1.2.3 From 78ff2862f6a0785247d3aa64bdc210b545e4d82d Mon Sep 17 00:00:00 2001 From: toast2903 <22451773+lat9nq@users.noreply.github.com> Date: Sun, 18 Jun 2023 23:29:27 -0400 Subject: vulkan_device: Remove brace initializer Co-authored-by: Tobias --- src/video_core/vulkan_common/vulkan_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/video_core') diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index e54828088..0b634a876 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -598,7 +598,7 @@ public: return true; } } - return {}; + return false; } private: -- cgit v1.2.3 From 5da70f719703084482933e103e561cc98163f370 Mon Sep 17 00:00:00 2001 From: Kelebek1 Date: Tue, 23 May 2023 14:45:54 +0100 Subject: Remove memory allocations in some hot paths --- src/audio_core/device/audio_buffers.h | 8 +-- src/audio_core/device/device_session.cpp | 12 ++--- src/audio_core/device/device_session.h | 7 +-- src/audio_core/in/audio_in_system.cpp | 5 +- src/audio_core/out/audio_out_system.cpp | 4 +- .../renderer/command/data_source/decode.cpp | 23 ++++----- .../renderer/command/effect/compressor.cpp | 8 +-- src/audio_core/renderer/command/effect/delay.cpp | 14 ++--- .../renderer/command/effect/i3dl2_reverb.cpp | 4 +- .../renderer/command/effect/light_limiter.cpp | 12 ++--- src/audio_core/renderer/command/effect/reverb.cpp | 12 ++--- .../renderer/command/sink/circular_buffer.cpp | 4 +- src/audio_core/renderer/command/sink/device.cpp | 5 +- src/audio_core/renderer/mix/mix_context.cpp | 6 +-- src/audio_core/renderer/nodes/node_states.cpp | 4 +- src/audio_core/renderer/nodes/node_states.h | 2 +- src/audio_core/renderer/system.cpp | 1 + src/audio_core/sink/null_sink.h | 2 +- src/audio_core/sink/sink_stream.cpp | 15 +++--- src/audio_core/sink/sink_stream.h | 5 +- src/common/ring_buffer.h | 3 +- src/common/scratch_buffer.h | 9 ++++ src/core/hle/kernel/k_synchronization_object.cpp | 3 +- src/core/hle/kernel/k_thread.cpp | 8 ++- src/core/hle/kernel/k_thread.h | 3 +- src/core/hle/kernel/svc/svc_ipc.cpp | 7 +-- src/core/hle/kernel/svc/svc_synchronization.cpp | 10 ++-- src/core/hle/kernel/svc/svc_thread.cpp | 2 +- src/core/hle/service/audio/audin_u.cpp | 16 +++--- src/core/hle/service/audio/audout_u.cpp | 15 ++---- src/core/hle/service/audio/audren_u.cpp | 22 ++++---- src/core/hle/service/audio/audren_u.h | 1 + src/core/hle/service/audio/hwopus.cpp | 9 ++-- src/core/hle/service/nvdrv/devices/nvdevice.h | 6 +-- .../hle/service/nvdrv/devices/nvdisp_disp0.cpp | 6 +-- src/core/hle/service/nvdrv/devices/nvdisp_disp0.h | 8 +-- .../hle/service/nvdrv/devices/nvhost_as_gpu.cpp | 31 ++++++------ src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h | 30 ++++++----- src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp | 19 ++++--- src/core/hle/service/nvdrv/devices/nvhost_ctrl.h | 21 ++++---- .../hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp | 32 ++++++------ .../hle/service/nvdrv/devices/nvhost_ctrl_gpu.h | 38 +++++++------- src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp | 59 +++++++++++----------- src/core/hle/service/nvdrv/devices/nvhost_gpu.h | 36 ++++++------- .../hle/service/nvdrv/devices/nvhost_nvdec.cpp | 6 +-- src/core/hle/service/nvdrv/devices/nvhost_nvdec.h | 8 +-- .../service/nvdrv/devices/nvhost_nvdec_common.cpp | 15 +++--- .../service/nvdrv/devices/nvhost_nvdec_common.h | 12 ++--- .../hle/service/nvdrv/devices/nvhost_nvjpg.cpp | 8 +-- src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h | 10 ++-- src/core/hle/service/nvdrv/devices/nvhost_vic.cpp | 6 +-- src/core/hle/service/nvdrv/devices/nvhost_vic.h | 8 +-- src/core/hle/service/nvdrv/devices/nvmap.cpp | 20 ++++---- src/core/hle/service/nvdrv/devices/nvmap.h | 20 ++++---- src/core/hle/service/nvdrv/nvdrv.cpp | 8 +-- src/core/hle/service/nvdrv/nvdrv.h | 8 +-- src/core/hle/service/nvdrv/nvdrv_interface.cpp | 24 ++++----- src/core/hle/service/nvdrv/nvdrv_interface.h | 3 ++ src/core/hle/service/nvnflinger/parcel.h | 7 +-- .../backend/glsl/glsl_emit_context.cpp | 2 +- src/shader_recompiler/backend/spirv/emit_spirv.cpp | 2 +- .../backend/spirv/spirv_emit_context.cpp | 2 +- src/shader_recompiler/runtime_info.h | 3 +- src/video_core/buffer_cache/buffer_cache.h | 4 +- src/video_core/buffer_cache/buffer_cache_base.h | 4 +- src/video_core/cdma_pusher.h | 1 - src/video_core/dma_pusher.h | 8 +-- src/video_core/engines/maxwell_dma.cpp | 35 +++++++------ src/video_core/host1x/codecs/h264.cpp | 4 +- src/video_core/memory_manager.cpp | 13 ++--- src/video_core/memory_manager.h | 15 ++++-- src/video_core/renderer_opengl/gl_shader_cache.cpp | 4 +- src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 2 +- .../renderer_vulkan/vk_pipeline_cache.cpp | 10 +++- .../renderer_vulkan/vk_texture_cache.cpp | 27 +++++----- src/video_core/shader_cache.cpp | 4 +- src/video_core/texture_cache/image_base.h | 5 +- src/video_core/texture_cache/texture_cache.h | 14 ++--- src/video_core/texture_cache/texture_cache_base.h | 4 +- src/video_core/texture_cache/util.cpp | 48 ++++++++++-------- src/video_core/texture_cache/util.h | 31 ++++++------ src/video_core/transform_feedback.cpp | 8 +-- src/video_core/transform_feedback.h | 2 +- src/video_core/vulkan_common/vulkan_device.cpp | 1 + 84 files changed, 503 insertions(+), 460 deletions(-) (limited to 'src/video_core') diff --git a/src/audio_core/device/audio_buffers.h b/src/audio_core/device/audio_buffers.h index 15082f6c6..5d8ed0ef7 100644 --- a/src/audio_core/device/audio_buffers.h +++ b/src/audio_core/device/audio_buffers.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "audio_buffer.h" #include "audio_core/device/device_session.h" @@ -48,7 +49,7 @@ public: * * @param out_buffers - The buffers which were registered. */ - void RegisterBuffers(std::vector& out_buffers) { + void RegisterBuffers(boost::container::static_vector& out_buffers) { std::scoped_lock l{lock}; const s32 to_register{std::min(std::min(appended_count, BufferAppendLimit), BufferAppendLimit - registered_count)}; @@ -162,7 +163,8 @@ public: * @param max_buffers - Maximum number of buffers to released. * @return The number of buffers released. */ - u32 GetRegisteredAppendedBuffers(std::vector& buffers_flushed, u32 max_buffers) { + u32 GetRegisteredAppendedBuffers( + boost::container::static_vector& buffers_flushed, u32 max_buffers) { std::scoped_lock l{lock}; if (registered_count + appended_count == 0) { return 0; @@ -270,7 +272,7 @@ public: */ bool FlushBuffers(u32& buffers_released) { std::scoped_lock l{lock}; - std::vector buffers_flushed{}; + boost::container::static_vector buffers_flushed{}; buffers_released = GetRegisteredAppendedBuffers(buffers_flushed, append_limit); diff --git a/src/audio_core/device/device_session.cpp b/src/audio_core/device/device_session.cpp index b5c0ef0e6..86811fcb8 100644 --- a/src/audio_core/device/device_session.cpp +++ b/src/audio_core/device/device_session.cpp @@ -79,7 +79,7 @@ void DeviceSession::ClearBuffers() { } } -void DeviceSession::AppendBuffers(std::span buffers) const { +void DeviceSession::AppendBuffers(std::span buffers) { for (const auto& buffer : buffers) { Sink::SinkBuffer new_buffer{ .frames = buffer.size / (channel_count * sizeof(s16)), @@ -88,13 +88,13 @@ void DeviceSession::AppendBuffers(std::span buffers) const { .consumed = false, }; + tmp_samples.resize_destructive(buffer.size / sizeof(s16)); if (type == Sink::StreamType::In) { - std::vector samples{}; - stream->AppendBuffer(new_buffer, samples); + stream->AppendBuffer(new_buffer, tmp_samples); } else { - std::vector samples(buffer.size / sizeof(s16)); - system.ApplicationMemory().ReadBlockUnsafe(buffer.samples, samples.data(), buffer.size); - stream->AppendBuffer(new_buffer, samples); + system.ApplicationMemory().ReadBlockUnsafe(buffer.samples, tmp_samples.data(), + buffer.size); + stream->AppendBuffer(new_buffer, tmp_samples); } } } diff --git a/src/audio_core/device/device_session.h b/src/audio_core/device/device_session.h index 75f766c68..7d52f362d 100644 --- a/src/audio_core/device/device_session.h +++ b/src/audio_core/device/device_session.h @@ -10,6 +10,7 @@ #include "audio_core/common/common.h" #include "audio_core/sink/sink.h" +#include "common/scratch_buffer.h" #include "core/hle/service/audio/errors.h" namespace Core { @@ -62,7 +63,7 @@ public: * * @param buffers - The buffers to play. */ - void AppendBuffers(std::span buffers) const; + void AppendBuffers(std::span buffers); /** * (Audio In only) Pop samples from the backend, and write them back to this buffer's address. @@ -146,8 +147,8 @@ private: std::shared_ptr thread_event; /// Is this session initialised? bool initialized{}; - /// Buffer queue - std::vector buffer_queue{}; + /// Temporary sample buffer + Common::ScratchBuffer tmp_samples{}; }; } // namespace AudioCore diff --git a/src/audio_core/in/audio_in_system.cpp b/src/audio_core/in/audio_in_system.cpp index e23e51758..579129121 100644 --- a/src/audio_core/in/audio_in_system.cpp +++ b/src/audio_core/in/audio_in_system.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include + #include "audio_core/audio_event.h" #include "audio_core/audio_manager.h" #include "audio_core/in/audio_in_system.h" @@ -89,7 +90,7 @@ Result System::Start() { session->Start(); state = State::Started; - std::vector buffers_to_flush{}; + boost::container::static_vector buffers_to_flush{}; buffers.RegisterBuffers(buffers_to_flush); session->AppendBuffers(buffers_to_flush); session->SetRingSize(static_cast(buffers_to_flush.size())); @@ -134,7 +135,7 @@ bool System::AppendBuffer(const AudioInBuffer& buffer, const u64 tag) { void System::RegisterBuffers() { if (state == State::Started) { - std::vector registered_buffers{}; + boost::container::static_vector registered_buffers{}; buffers.RegisterBuffers(registered_buffers); session->AppendBuffers(registered_buffers); } diff --git a/src/audio_core/out/audio_out_system.cpp b/src/audio_core/out/audio_out_system.cpp index bd13f7219..0adf64bd3 100644 --- a/src/audio_core/out/audio_out_system.cpp +++ b/src/audio_core/out/audio_out_system.cpp @@ -89,7 +89,7 @@ Result System::Start() { session->Start(); state = State::Started; - std::vector buffers_to_flush{}; + boost::container::static_vector buffers_to_flush{}; buffers.RegisterBuffers(buffers_to_flush); session->AppendBuffers(buffers_to_flush); session->SetRingSize(static_cast(buffers_to_flush.size())); @@ -134,7 +134,7 @@ bool System::AppendBuffer(const AudioOutBuffer& buffer, u64 tag) { void System::RegisterBuffers() { if (state == State::Started) { - std::vector registered_buffers{}; + boost::container::static_vector registered_buffers{}; buffers.RegisterBuffers(registered_buffers); session->AppendBuffers(registered_buffers); } diff --git a/src/audio_core/renderer/command/data_source/decode.cpp b/src/audio_core/renderer/command/data_source/decode.cpp index ff5d31bd6..f45933203 100644 --- a/src/audio_core/renderer/command/data_source/decode.cpp +++ b/src/audio_core/renderer/command/data_source/decode.cpp @@ -8,6 +8,7 @@ #include "audio_core/renderer/command/resample/resample.h" #include "common/fixed_point.h" #include "common/logging/log.h" +#include "common/scratch_buffer.h" #include "core/memory.h" namespace AudioCore::AudioRenderer { @@ -27,6 +28,7 @@ constexpr std::array PitchBySrcQuality = {4, 8, 4}; template static u32 DecodePcm(Core::Memory::Memory& memory, std::span out_buffer, const DecodeArg& req) { + std::array tmp_samples{}; constexpr s32 min{std::numeric_limits::min()}; constexpr s32 max{std::numeric_limits::max()}; @@ -49,18 +51,17 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span out_buffer, const u64 size{channel_count * samples_to_decode}; const u64 size_bytes{size * sizeof(T)}; - std::vector samples(size); - memory.ReadBlockUnsafe(source, samples.data(), size_bytes); + memory.ReadBlockUnsafe(source, tmp_samples.data(), size_bytes); if constexpr (std::is_floating_point_v) { for (u32 i = 0; i < samples_to_decode; i++) { - auto sample{static_cast(samples[i * channel_count + req.target_channel] * + auto sample{static_cast(tmp_samples[i * channel_count + req.target_channel] * std::numeric_limits::max())}; out_buffer[i] = static_cast(std::clamp(sample, min, max)); } } else { for (u32 i = 0; i < samples_to_decode; i++) { - out_buffer[i] = samples[i * channel_count + req.target_channel]; + out_buffer[i] = tmp_samples[i * channel_count + req.target_channel]; } } } break; @@ -73,17 +74,16 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span out_buffer, } const VAddr source{req.buffer + ((req.start_offset + req.offset) * sizeof(T))}; - std::vector samples(samples_to_decode); - memory.ReadBlockUnsafe(source, samples.data(), samples_to_decode * sizeof(T)); + memory.ReadBlockUnsafe(source, tmp_samples.data(), samples_to_decode * sizeof(T)); if constexpr (std::is_floating_point_v) { for (u32 i = 0; i < samples_to_decode; i++) { - auto sample{static_cast(samples[i * channel_count + req.target_channel] * + auto sample{static_cast(tmp_samples[i * channel_count + req.target_channel] * std::numeric_limits::max())}; out_buffer[i] = static_cast(std::clamp(sample, min, max)); } } else { - std::memcpy(out_buffer.data(), samples.data(), samples_to_decode * sizeof(s16)); + std::memcpy(out_buffer.data(), tmp_samples.data(), samples_to_decode * sizeof(s16)); } break; } @@ -101,6 +101,7 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span out_buffer, */ static u32 DecodeAdpcm(Core::Memory::Memory& memory, std::span out_buffer, const DecodeArg& req) { + std::array wavebuffer{}; constexpr u32 SamplesPerFrame{14}; constexpr u32 NibblesPerFrame{16}; @@ -138,9 +139,7 @@ static u32 DecodeAdpcm(Core::Memory::Memory& memory, std::span out_buffer, } const auto size{std::max((samples_to_process / 8U) * SamplesPerFrame, 8U)}; - std::vector wavebuffer(size); - memory.ReadBlockUnsafe(req.buffer + position_in_frame / 2, wavebuffer.data(), - wavebuffer.size()); + memory.ReadBlockUnsafe(req.buffer + position_in_frame / 2, wavebuffer.data(), size); auto context{req.adpcm_context}; auto header{context->header}; @@ -258,7 +257,7 @@ void DecodeFromWaveBuffers(Core::Memory::Memory& memory, const DecodeFromWaveBuf u32 offset{voice_state.offset}; auto output_buffer{args.output}; - std::vector temp_buffer(TempBufferSize, 0); + std::array temp_buffer{}; while (remaining_sample_count > 0) { const auto samples_to_write{std::min(remaining_sample_count, max_remaining_sample_count)}; diff --git a/src/audio_core/renderer/command/effect/compressor.cpp b/src/audio_core/renderer/command/effect/compressor.cpp index 7229618e8..ee9b68d5b 100644 --- a/src/audio_core/renderer/command/effect/compressor.cpp +++ b/src/audio_core/renderer/command/effect/compressor.cpp @@ -44,8 +44,8 @@ static void InitializeCompressorEffect(const CompressorInfo::ParameterVersion2& static void ApplyCompressorEffect(const CompressorInfo::ParameterVersion2& params, CompressorInfo::State& state, bool enabled, - std::vector> input_buffers, - std::vector> output_buffers, u32 sample_count) { + std::span> input_buffers, + std::span> output_buffers, u32 sample_count) { if (enabled) { auto state_00{state.unk_00}; auto state_04{state.unk_04}; @@ -124,8 +124,8 @@ void CompressorCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& } void CompressorCommand::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (s16 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/effect/delay.cpp b/src/audio_core/renderer/command/effect/delay.cpp index a4e408d40..e536cbb1e 100644 --- a/src/audio_core/renderer/command/effect/delay.cpp +++ b/src/audio_core/renderer/command/effect/delay.cpp @@ -51,7 +51,7 @@ static void InitializeDelayEffect(const DelayInfo::ParameterVersion1& params, state.delay_lines[channel].sample_count_max = sample_count_max.to_int_floor(); state.delay_lines[channel].sample_count = sample_count.to_int_floor(); state.delay_lines[channel].buffer.resize(state.delay_lines[channel].sample_count, 0); - if (state.delay_lines[channel].buffer.size() == 0) { + if (state.delay_lines[channel].sample_count == 0) { state.delay_lines[channel].buffer.push_back(0); } state.delay_lines[channel].buffer_pos = 0; @@ -74,8 +74,8 @@ static void InitializeDelayEffect(const DelayInfo::ParameterVersion1& params, */ template static void ApplyDelay(const DelayInfo::ParameterVersion1& params, DelayInfo::State& state, - std::vector>& inputs, - std::vector>& outputs, const u32 sample_count) { + std::span> inputs, std::span> outputs, + const u32 sample_count) { for (u32 sample_index = 0; sample_index < sample_count; sample_index++) { std::array, NumChannels> input_samples{}; for (u32 channel = 0; channel < NumChannels; channel++) { @@ -153,8 +153,8 @@ static void ApplyDelay(const DelayInfo::ParameterVersion1& params, DelayInfo::St * @param sample_count - Number of samples to process. */ static void ApplyDelayEffect(const DelayInfo::ParameterVersion1& params, DelayInfo::State& state, - const bool enabled, std::vector>& inputs, - std::vector>& outputs, const u32 sample_count) { + const bool enabled, std::span> inputs, + std::span> outputs, const u32 sample_count) { if (!IsChannelCountValid(params.channel_count)) { LOG_ERROR(Service_Audio, "Invalid delay channels {}", params.channel_count); @@ -208,8 +208,8 @@ void DelayCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& proce } void DelayCommand::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (s16 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp b/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp index 27d8b9844..d2bfb67cc 100644 --- a/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp +++ b/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp @@ -408,8 +408,8 @@ void I3dl2ReverbCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& } void I3dl2ReverbCommand::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (u32 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/effect/light_limiter.cpp b/src/audio_core/renderer/command/effect/light_limiter.cpp index e8fb0e2fc..4161a9821 100644 --- a/src/audio_core/renderer/command/effect/light_limiter.cpp +++ b/src/audio_core/renderer/command/effect/light_limiter.cpp @@ -47,8 +47,8 @@ static void InitializeLightLimiterEffect(const LightLimiterInfo::ParameterVersio */ static void ApplyLightLimiterEffect(const LightLimiterInfo::ParameterVersion2& params, LightLimiterInfo::State& state, const bool enabled, - std::vector>& inputs, - std::vector>& outputs, const u32 sample_count, + std::span> inputs, + std::span> outputs, const u32 sample_count, LightLimiterInfo::StatisticsInternal* statistics) { constexpr s64 min{std::numeric_limits::min()}; constexpr s64 max{std::numeric_limits::max()}; @@ -147,8 +147,8 @@ void LightLimiterVersion1Command::Dump([[maybe_unused]] const ADSP::CommandListP } void LightLimiterVersion1Command::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (u32 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, @@ -190,8 +190,8 @@ void LightLimiterVersion2Command::Dump([[maybe_unused]] const ADSP::CommandListP } void LightLimiterVersion2Command::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (u32 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/effect/reverb.cpp b/src/audio_core/renderer/command/effect/reverb.cpp index 8b9b65214..fc2f15a5e 100644 --- a/src/audio_core/renderer/command/effect/reverb.cpp +++ b/src/audio_core/renderer/command/effect/reverb.cpp @@ -250,8 +250,8 @@ static Common::FixedPoint<50, 14> Axfx2AllPassTick(ReverbInfo::ReverbDelayLine& */ template static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, ReverbInfo::State& state, - std::vector>& inputs, - std::vector>& outputs, const u32 sample_count) { + std::span> inputs, + std::span> outputs, const u32 sample_count) { static constexpr std::array OutTapIndexes1Ch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; @@ -369,8 +369,8 @@ static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, Rever * @param sample_count - Number of samples to process. */ static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, ReverbInfo::State& state, - const bool enabled, std::vector>& inputs, - std::vector>& outputs, const u32 sample_count) { + const bool enabled, std::span> inputs, + std::span> outputs, const u32 sample_count) { if (enabled) { switch (params.channel_count) { case 0: @@ -412,8 +412,8 @@ void ReverbCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& proc } void ReverbCommand::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (u32 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/sink/circular_buffer.cpp b/src/audio_core/renderer/command/sink/circular_buffer.cpp index ded5afc94..e2ce59792 100644 --- a/src/audio_core/renderer/command/sink/circular_buffer.cpp +++ b/src/audio_core/renderer/command/sink/circular_buffer.cpp @@ -24,7 +24,7 @@ void CircularBufferSinkCommand::Process(const ADSP::CommandListProcessor& proces constexpr s32 min{std::numeric_limits::min()}; constexpr s32 max{std::numeric_limits::max()}; - std::vector output(processor.sample_count); + std::array output{}; for (u32 channel = 0; channel < input_count; channel++) { auto input{processor.mix_buffers.subspan(inputs[channel] * processor.sample_count, processor.sample_count)}; @@ -33,7 +33,7 @@ void CircularBufferSinkCommand::Process(const ADSP::CommandListProcessor& proces } processor.memory->WriteBlockUnsafe(address + pos, output.data(), - output.size() * sizeof(s16)); + processor.sample_count * sizeof(s16)); pos += static_cast(processor.sample_count * sizeof(s16)); if (pos >= size) { pos = 0; diff --git a/src/audio_core/renderer/command/sink/device.cpp b/src/audio_core/renderer/command/sink/device.cpp index e88372a75..5f74dd7ad 100644 --- a/src/audio_core/renderer/command/sink/device.cpp +++ b/src/audio_core/renderer/command/sink/device.cpp @@ -33,8 +33,7 @@ void DeviceSinkCommand::Process(const ADSP::CommandListProcessor& processor) { .consumed{false}, }; - std::vector samples(out_buffer.frames * input_count); - + std::array samples{}; for (u32 channel = 0; channel < input_count; channel++) { const auto offset{inputs[channel] * out_buffer.frames}; @@ -45,7 +44,7 @@ void DeviceSinkCommand::Process(const ADSP::CommandListProcessor& processor) { } out_buffer.tag = reinterpret_cast(samples.data()); - stream->AppendBuffer(out_buffer, samples); + stream->AppendBuffer(out_buffer, {samples.data(), out_buffer.frames * input_count}); if (stream->IsPaused()) { stream->Start(); diff --git a/src/audio_core/renderer/mix/mix_context.cpp b/src/audio_core/renderer/mix/mix_context.cpp index 35b748ede..3a18ae7c2 100644 --- a/src/audio_core/renderer/mix/mix_context.cpp +++ b/src/audio_core/renderer/mix/mix_context.cpp @@ -125,10 +125,10 @@ bool MixContext::TSortInfo(const SplitterContext& splitter_context) { return false; } - std::vector sorted_results{node_states.GetSortedResuls()}; - const auto result_size{std::min(count, static_cast(sorted_results.size()))}; + auto sorted_results{node_states.GetSortedResuls()}; + const auto result_size{std::min(count, static_cast(sorted_results.second))}; for (s32 i = 0; i < result_size; i++) { - sorted_mix_infos[i] = &mix_infos[sorted_results[i]]; + sorted_mix_infos[i] = &mix_infos[sorted_results.first[i]]; } CalcMixBufferOffset(); diff --git a/src/audio_core/renderer/nodes/node_states.cpp b/src/audio_core/renderer/nodes/node_states.cpp index 1821a51e6..b7a44a54c 100644 --- a/src/audio_core/renderer/nodes/node_states.cpp +++ b/src/audio_core/renderer/nodes/node_states.cpp @@ -134,8 +134,8 @@ u32 NodeStates::GetNodeCount() const { return node_count; } -std::vector NodeStates::GetSortedResuls() const { - return {results.rbegin(), results.rbegin() + result_pos}; +std::pair::reverse_iterator, size_t> NodeStates::GetSortedResuls() const { + return {results.rbegin(), result_pos}; } } // namespace AudioCore::AudioRenderer diff --git a/src/audio_core/renderer/nodes/node_states.h b/src/audio_core/renderer/nodes/node_states.h index 94b1d1254..e768cd4b5 100644 --- a/src/audio_core/renderer/nodes/node_states.h +++ b/src/audio_core/renderer/nodes/node_states.h @@ -175,7 +175,7 @@ public: * * @return Vector of nodes in reverse order. */ - std::vector GetSortedResuls() const; + std::pair::reverse_iterator, size_t> GetSortedResuls() const; private: /// Number of nodes in the graph diff --git a/src/audio_core/renderer/system.cpp b/src/audio_core/renderer/system.cpp index 53b258c4f..a23627472 100644 --- a/src/audio_core/renderer/system.cpp +++ b/src/audio_core/renderer/system.cpp @@ -444,6 +444,7 @@ Result System::Update(std::span input, std::span performance, std: std::scoped_lock l{lock}; const auto start_time{core.CoreTiming().GetClockTicks()}; + std::memset(output.data(), 0, output.size()); InfoUpdater info_updater(input, output, process_handle, behavior); diff --git a/src/audio_core/sink/null_sink.h b/src/audio_core/sink/null_sink.h index 1215d3cd2..b6b43c93e 100644 --- a/src/audio_core/sink/null_sink.h +++ b/src/audio_core/sink/null_sink.h @@ -20,7 +20,7 @@ public: explicit NullSinkStreamImpl(Core::System& system_, StreamType type_) : SinkStream{system_, type_} {} ~NullSinkStreamImpl() override {} - void AppendBuffer(SinkBuffer&, std::vector&) override {} + void AppendBuffer(SinkBuffer&, std::span) override {} std::vector ReleaseBuffer(u64) override { return {}; } diff --git a/src/audio_core/sink/sink_stream.cpp b/src/audio_core/sink/sink_stream.cpp index 9a718a9cc..404dcd0e9 100644 --- a/src/audio_core/sink/sink_stream.cpp +++ b/src/audio_core/sink/sink_stream.cpp @@ -18,7 +18,7 @@ namespace AudioCore::Sink { -void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector& samples) { +void SinkStream::AppendBuffer(SinkBuffer& buffer, std::span samples) { if (type == StreamType::In) { queue.enqueue(buffer); queued_buffers++; @@ -66,15 +66,16 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector& samples) { static_cast(std::clamp(right_sample, min, max)); } - samples.resize(samples.size() / system_channels * device_channels); + samples = samples.subspan(0, samples.size() / system_channels * device_channels); } else if (system_channels == 2 && device_channels == 6) { // We need moar samples! Not all games will provide 6 channel audio. // TODO: Implement some upmixing here. Currently just passthrough, with other // channels left as silence. - std::vector new_samples(samples.size() / system_channels * device_channels, 0); + auto new_size = samples.size() / system_channels * device_channels; + tmp_samples.resize_destructive(new_size); - for (u32 read_index = 0, write_index = 0; read_index < samples.size(); + for (u32 read_index = 0, write_index = 0; read_index < new_size; read_index += system_channels, write_index += device_channels) { const auto left_sample{static_cast(std::clamp( static_cast( @@ -82,7 +83,7 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector& samples) { volume), min, max))}; - new_samples[write_index + static_cast(Channels::FrontLeft)] = left_sample; + tmp_samples[write_index + static_cast(Channels::FrontLeft)] = left_sample; const auto right_sample{static_cast(std::clamp( static_cast( @@ -90,9 +91,9 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector& samples) { volume), min, max))}; - new_samples[write_index + static_cast(Channels::FrontRight)] = right_sample; + tmp_samples[write_index + static_cast(Channels::FrontRight)] = right_sample; } - samples = std::move(new_samples); + samples = std::span(tmp_samples); } else if (volume != 1.0f) { for (u32 i = 0; i < samples.size(); i++) { diff --git a/src/audio_core/sink/sink_stream.h b/src/audio_core/sink/sink_stream.h index 41cbadc9c..98d72ace1 100644 --- a/src/audio_core/sink/sink_stream.h +++ b/src/audio_core/sink/sink_stream.h @@ -16,6 +16,7 @@ #include "common/polyfill_thread.h" #include "common/reader_writer_queue.h" #include "common/ring_buffer.h" +#include "common/scratch_buffer.h" #include "common/thread.h" namespace Core { @@ -170,7 +171,7 @@ public: * @param buffer - Audio buffer information to be queued. * @param samples - The s16 samples to be queue for playback. */ - virtual void AppendBuffer(SinkBuffer& buffer, std::vector& samples); + virtual void AppendBuffer(SinkBuffer& buffer, std::span samples); /** * Release a buffer. Audio In only, will fill a buffer with recorded samples. @@ -255,6 +256,8 @@ private: /// Signalled when ring buffer entries are consumed std::condition_variable_any release_cv; std::mutex release_mutex; + /// Temporary buffer for appending samples when upmixing + Common::ScratchBuffer tmp_samples{}; }; using SinkStreamPtr = std::unique_ptr; diff --git a/src/common/ring_buffer.h b/src/common/ring_buffer.h index 4c328ab44..416680d44 100644 --- a/src/common/ring_buffer.h +++ b/src/common/ring_buffer.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ public: return push_count; } - std::size_t Push(const std::vector& input) { + std::size_t Push(const std::span input) { return Push(input.data(), input.size()); } diff --git a/src/common/scratch_buffer.h b/src/common/scratch_buffer.h index a69a5a7af..6fe907953 100644 --- a/src/common/scratch_buffer.h +++ b/src/common/scratch_buffer.h @@ -3,6 +3,9 @@ #pragma once +#include + +#include "common/concepts.h" #include "common/make_unique_for_overwrite.h" namespace Common { @@ -16,6 +19,12 @@ namespace Common { template class ScratchBuffer { public: + using iterator = T*; + using const_iterator = const T*; + using value_type = T; + using element_type = T; + using iterator_category = std::contiguous_iterator_tag; + ScratchBuffer() = default; explicit ScratchBuffer(size_t initial_capacity) diff --git a/src/core/hle/kernel/k_synchronization_object.cpp b/src/core/hle/kernel/k_synchronization_object.cpp index b7da3eee7..3e5b735b1 100644 --- a/src/core/hle/kernel/k_synchronization_object.cpp +++ b/src/core/hle/kernel/k_synchronization_object.cpp @@ -3,6 +3,7 @@ #include "common/assert.h" #include "common/common_types.h" +#include "common/scratch_buffer.h" #include "core/hle/kernel/k_scheduler.h" #include "core/hle/kernel/k_scoped_scheduler_lock_and_sleep.h" #include "core/hle/kernel/k_synchronization_object.h" @@ -75,7 +76,7 @@ Result KSynchronizationObject::Wait(KernelCore& kernel, s32* out_index, KSynchronizationObject** objects, const s32 num_objects, s64 timeout) { // Allocate space on stack for thread nodes. - std::vector thread_nodes(num_objects); + std::array thread_nodes; // Prepare for wait. KThread* thread = GetCurrentThreadPointer(kernel); diff --git a/src/core/hle/kernel/k_thread.cpp b/src/core/hle/kernel/k_thread.cpp index 908811e2c..adb6ec581 100644 --- a/src/core/hle/kernel/k_thread.cpp +++ b/src/core/hle/kernel/k_thread.cpp @@ -909,7 +909,7 @@ Result KThread::SetActivity(Svc::ThreadActivity activity) { R_SUCCEED(); } -Result KThread::GetThreadContext3(std::vector& out) { +Result KThread::GetThreadContext3(Common::ScratchBuffer& out) { // Lock ourselves. KScopedLightLock lk{m_activity_pause_lock}; @@ -927,15 +927,13 @@ Result KThread::GetThreadContext3(std::vector& out) { // Mask away mode bits, interrupt bits, IL bit, and other reserved bits. auto context = GetContext64(); context.pstate &= 0xFF0FFE20; - - out.resize(sizeof(context)); + out.resize_destructive(sizeof(context)); std::memcpy(out.data(), std::addressof(context), sizeof(context)); } else { // Mask away mode bits, interrupt bits, IL bit, and other reserved bits. auto context = GetContext32(); context.cpsr &= 0xFF0FFE20; - - out.resize(sizeof(context)); + out.resize_destructive(sizeof(context)); std::memcpy(out.data(), std::addressof(context), sizeof(context)); } } diff --git a/src/core/hle/kernel/k_thread.h b/src/core/hle/kernel/k_thread.h index 37fe5db77..dd662b3f8 100644 --- a/src/core/hle/kernel/k_thread.h +++ b/src/core/hle/kernel/k_thread.h @@ -15,6 +15,7 @@ #include "common/intrusive_list.h" #include "common/intrusive_red_black_tree.h" +#include "common/scratch_buffer.h" #include "common/spin_lock.h" #include "core/arm/arm_interface.h" #include "core/hle/kernel/k_affinity_mask.h" @@ -567,7 +568,7 @@ public: void RemoveWaiter(KThread* thread); - Result GetThreadContext3(std::vector& out); + Result GetThreadContext3(Common::ScratchBuffer& out); KThread* RemoveUserWaiterByKey(bool* out_has_waiters, KProcessAddress key) { return this->RemoveWaiterByKey(out_has_waiters, key, false); diff --git a/src/core/hle/kernel/svc/svc_ipc.cpp b/src/core/hle/kernel/svc/svc_ipc.cpp index ea03068aa..60247df2e 100644 --- a/src/core/hle/kernel/svc/svc_ipc.cpp +++ b/src/core/hle/kernel/svc/svc_ipc.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/scope_exit.h" +#include "common/scratch_buffer.h" #include "core/core.h" #include "core/hle/kernel/k_client_session.h" #include "core/hle/kernel/k_process.h" @@ -45,11 +46,11 @@ Result ReplyAndReceive(Core::System& system, s32* out_index, uint64_t handles_ad handles_addr, static_cast(sizeof(Handle) * num_handles)), ResultInvalidPointer); - std::vector handles(num_handles); + std::array handles; GetCurrentMemory(kernel).ReadBlock(handles_addr, handles.data(), sizeof(Handle) * num_handles); // Convert handle list to object table. - std::vector objs(num_handles); + std::array objs; R_UNLESS(handle_table.GetMultipleObjects(objs.data(), handles.data(), num_handles), ResultInvalidHandle); @@ -80,7 +81,7 @@ Result ReplyAndReceive(Core::System& system, s32* out_index, uint64_t handles_ad // Wait for an object. s32 index; Result result = KSynchronizationObject::Wait(kernel, std::addressof(index), objs.data(), - static_cast(objs.size()), timeout_ns); + num_handles, timeout_ns); if (result == ResultTimedOut) { R_RETURN(result); } diff --git a/src/core/hle/kernel/svc/svc_synchronization.cpp b/src/core/hle/kernel/svc/svc_synchronization.cpp index 04d65f0bd..53df5bcd8 100644 --- a/src/core/hle/kernel/svc/svc_synchronization.cpp +++ b/src/core/hle/kernel/svc/svc_synchronization.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/scope_exit.h" +#include "common/scratch_buffer.h" #include "core/core.h" #include "core/hle/kernel/k_process.h" #include "core/hle/kernel/k_readable_event.h" @@ -54,7 +55,7 @@ static Result WaitSynchronization(Core::System& system, int32_t* out_index, cons // Get the synchronization context. auto& kernel = system.Kernel(); auto& handle_table = GetCurrentProcess(kernel).GetHandleTable(); - std::vector objs(num_handles); + std::array objs; // Copy user handles. if (num_handles > 0) { @@ -72,8 +73,8 @@ static Result WaitSynchronization(Core::System& system, int32_t* out_index, cons }); // Wait on the objects. - Result res = KSynchronizationObject::Wait(kernel, out_index, objs.data(), - static_cast(objs.size()), timeout_ns); + Result res = + KSynchronizationObject::Wait(kernel, out_index, objs.data(), num_handles, timeout_ns); R_SUCCEED_IF(res == ResultSessionClosed); R_RETURN(res); @@ -87,8 +88,7 @@ Result WaitSynchronization(Core::System& system, int32_t* out_index, u64 user_ha // Ensure number of handles is valid. R_UNLESS(0 <= num_handles && num_handles <= Svc::ArgumentHandleCountMax, ResultOutOfRange); - - std::vector handles(num_handles); + std::array handles; if (num_handles > 0) { GetCurrentMemory(system.Kernel()) .ReadBlock(user_handles, handles.data(), num_handles * sizeof(Handle)); diff --git a/src/core/hle/kernel/svc/svc_thread.cpp b/src/core/hle/kernel/svc/svc_thread.cpp index 37b54079c..36b94e6bf 100644 --- a/src/core/hle/kernel/svc/svc_thread.cpp +++ b/src/core/hle/kernel/svc/svc_thread.cpp @@ -174,7 +174,7 @@ Result GetThreadContext3(Core::System& system, u64 out_context, Handle thread_ha } // Get the thread context. - std::vector context; + static thread_local Common::ScratchBuffer context; R_TRY(thread->GetThreadContext3(context)); // Copy the thread context to user space. diff --git a/src/core/hle/service/audio/audin_u.cpp b/src/core/hle/service/audio/audin_u.cpp index f0640c64f..c8d574993 100644 --- a/src/core/hle/service/audio/audin_u.cpp +++ b/src/core/hle/service/audio/audin_u.cpp @@ -5,6 +5,7 @@ #include "audio_core/renderer/audio_device.h" #include "common/common_funcs.h" #include "common/logging/log.h" +#include "common/settings.h" #include "common/string_util.h" #include "core/core.h" #include "core/hle/kernel/k_event.h" @@ -123,19 +124,13 @@ private: void GetReleasedAudioInBuffer(HLERequestContext& ctx) { const auto write_buffer_size = ctx.GetWriteBufferNumElements(); - std::vector released_buffers(write_buffer_size); + tmp_buffer.resize_destructive(write_buffer_size); + tmp_buffer[0] = 0; - const auto count = impl->GetReleasedBuffers(released_buffers); + const auto count = impl->GetReleasedBuffers(tmp_buffer); - [[maybe_unused]] std::string tags{}; - for (u32 i = 0; i < count; i++) { - tags += fmt::format("{:08X}, ", released_buffers[i]); - } - [[maybe_unused]] auto sessionid{impl->GetSystem().GetSessionId()}; - LOG_TRACE(Service_Audio, "called. Session {} released {} buffers: {}", sessionid, count, - tags); + ctx.WriteBuffer(tmp_buffer); - ctx.WriteBuffer(released_buffers); IPC::ResponseBuilder rb{ctx, 3}; rb.Push(ResultSuccess); rb.Push(count); @@ -200,6 +195,7 @@ private: KernelHelpers::ServiceContext service_context; Kernel::KEvent* event; std::shared_ptr impl; + Common::ScratchBuffer tmp_buffer; }; AudInU::AudInU(Core::System& system_) diff --git a/src/core/hle/service/audio/audout_u.cpp b/src/core/hle/service/audio/audout_u.cpp index 3e62fa4fc..032c8c11f 100644 --- a/src/core/hle/service/audio/audout_u.cpp +++ b/src/core/hle/service/audio/audout_u.cpp @@ -123,19 +123,13 @@ private: void GetReleasedAudioOutBuffers(HLERequestContext& ctx) { const auto write_buffer_size = ctx.GetWriteBufferNumElements(); - std::vector released_buffers(write_buffer_size); + tmp_buffer.resize_destructive(write_buffer_size); + tmp_buffer[0] = 0; - const auto count = impl->GetReleasedBuffers(released_buffers); + const auto count = impl->GetReleasedBuffers(tmp_buffer); - [[maybe_unused]] std::string tags{}; - for (u32 i = 0; i < count; i++) { - tags += fmt::format("{:08X}, ", released_buffers[i]); - } - [[maybe_unused]] const auto sessionid{impl->GetSystem().GetSessionId()}; - LOG_TRACE(Service_Audio, "called. Session {} released {} buffers: {}", sessionid, count, - tags); + ctx.WriteBuffer(tmp_buffer); - ctx.WriteBuffer(released_buffers); IPC::ResponseBuilder rb{ctx, 3}; rb.Push(ResultSuccess); rb.Push(count); @@ -211,6 +205,7 @@ private: KernelHelpers::ServiceContext service_context; Kernel::KEvent* event; std::shared_ptr impl; + Common::ScratchBuffer tmp_buffer; }; AudOutU::AudOutU(Core::System& system_) diff --git a/src/core/hle/service/audio/audren_u.cpp b/src/core/hle/service/audio/audren_u.cpp index 7086d4750..12845c23a 100644 --- a/src/core/hle/service/audio/audren_u.cpp +++ b/src/core/hle/service/audio/audren_u.cpp @@ -116,28 +116,26 @@ private: // These buffers are written manually to avoid an issue with WriteBuffer throwing errors for // checking size 0. Performance size is 0 for most games. - std::vector output{}; - std::vector performance{}; auto is_buffer_b{ctx.BufferDescriptorB()[0].Size() != 0}; if (is_buffer_b) { const auto buffersB{ctx.BufferDescriptorB()}; - output.resize(buffersB[0].Size(), 0); - performance.resize(buffersB[1].Size(), 0); + tmp_output.resize_destructive(buffersB[0].Size()); + tmp_performance.resize_destructive(buffersB[1].Size()); } else { const auto buffersC{ctx.BufferDescriptorC()}; - output.resize(buffersC[0].Size(), 0); - performance.resize(buffersC[1].Size(), 0); + tmp_output.resize_destructive(buffersC[0].Size()); + tmp_performance.resize_destructive(buffersC[1].Size()); } - auto result = impl->RequestUpdate(input, performance, output); + auto result = impl->RequestUpdate(input, tmp_performance, tmp_output); if (result.IsSuccess()) { if (is_buffer_b) { - ctx.WriteBufferB(output.data(), output.size(), 0); - ctx.WriteBufferB(performance.data(), performance.size(), 1); + ctx.WriteBufferB(tmp_output.data(), tmp_output.size(), 0); + ctx.WriteBufferB(tmp_performance.data(), tmp_performance.size(), 1); } else { - ctx.WriteBufferC(output.data(), output.size(), 0); - ctx.WriteBufferC(performance.data(), performance.size(), 1); + ctx.WriteBufferC(tmp_output.data(), tmp_output.size(), 0); + ctx.WriteBufferC(tmp_performance.data(), tmp_performance.size(), 1); } } else { LOG_ERROR(Service_Audio, "RequestUpdate failed error 0x{:02X}!", result.description); @@ -235,6 +233,8 @@ private: Kernel::KEvent* rendered_event; Manager& manager; std::unique_ptr impl; + Common::ScratchBuffer tmp_output; + Common::ScratchBuffer tmp_performance; }; class IAudioDevice final : public ServiceFramework { diff --git a/src/core/hle/service/audio/audren_u.h b/src/core/hle/service/audio/audren_u.h index 24ce37e87..d8e9c8719 100644 --- a/src/core/hle/service/audio/audren_u.h +++ b/src/core/hle/service/audio/audren_u.h @@ -4,6 +4,7 @@ #pragma once #include "audio_core/audio_render_manager.h" +#include "common/scratch_buffer.h" #include "core/hle/service/kernel_helpers.h" #include "core/hle/service/service.h" diff --git a/src/core/hle/service/audio/hwopus.cpp b/src/core/hle/service/audio/hwopus.cpp index 451ac224a..c835f6cb7 100644 --- a/src/core/hle/service/audio/hwopus.cpp +++ b/src/core/hle/service/audio/hwopus.cpp @@ -68,13 +68,13 @@ private: ExtraBehavior extra_behavior) { u32 consumed = 0; u32 sample_count = 0; - std::vector samples(ctx.GetWriteBufferNumElements()); + tmp_samples.resize_destructive(ctx.GetWriteBufferNumElements()); if (extra_behavior == ExtraBehavior::ResetContext) { ResetDecoderContext(); } - if (!DecodeOpusData(consumed, sample_count, ctx.ReadBuffer(), samples, performance)) { + if (!DecodeOpusData(consumed, sample_count, ctx.ReadBuffer(), tmp_samples, performance)) { LOG_ERROR(Audio, "Failed to decode opus data"); IPC::ResponseBuilder rb{ctx, 2}; // TODO(ogniK): Use correct error code @@ -90,11 +90,11 @@ private: if (performance) { rb.Push(*performance); } - ctx.WriteBuffer(samples); + ctx.WriteBuffer(tmp_samples); } bool DecodeOpusData(u32& consumed, u32& sample_count, std::span input, - std::vector& output, u64* out_performance_time) const { + std::span output, u64* out_performance_time) const { const auto start_time = std::chrono::steady_clock::now(); const std::size_t raw_output_sz = output.size() * sizeof(opus_int16); if (sizeof(OpusPacketHeader) > input.size()) { @@ -154,6 +154,7 @@ private: OpusDecoderPtr decoder; u32 sample_rate; u32 channel_count; + Common::ScratchBuffer tmp_samples; }; class IHardwareOpusDecoderManager final : public ServiceFramework { diff --git a/src/core/hle/service/nvdrv/devices/nvdevice.h b/src/core/hle/service/nvdrv/devices/nvdevice.h index ab1f30f9e..a04538d5d 100644 --- a/src/core/hle/service/nvdrv/devices/nvdevice.h +++ b/src/core/hle/service/nvdrv/devices/nvdevice.h @@ -34,7 +34,7 @@ public: * @returns The result code of the ioctl. */ virtual NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) = 0; + std::span output) = 0; /** * Handles an ioctl2 request. @@ -45,7 +45,7 @@ public: * @returns The result code of the ioctl. */ virtual NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) = 0; + std::span inline_input, std::span output) = 0; /** * Handles an ioctl3 request. @@ -56,7 +56,7 @@ public: * @returns The result code of the ioctl. */ virtual NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) = 0; + std::span output, std::span inline_output) = 0; /** * Called once a device is opened diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp index 0fe242e9d..05a43d8dc 100644 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp @@ -18,19 +18,19 @@ nvdisp_disp0::nvdisp_disp0(Core::System& system_, NvCore::Container& core) nvdisp_disp0::~nvdisp_disp0() = default; NvResult nvdisp_disp0::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvdisp_disp0::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvdisp_disp0::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h index bcd0e3ed5..daee05fe8 100644 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h @@ -26,11 +26,11 @@ public: ~nvdisp_disp0() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp index 681bd0867..07e570a9f 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp @@ -28,7 +28,7 @@ nvhost_as_gpu::nvhost_as_gpu(Core::System& system_, Module& module_, NvCore::Con nvhost_as_gpu::~nvhost_as_gpu() = default; NvResult nvhost_as_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 'A': switch (command.cmd) { @@ -61,13 +61,13 @@ NvResult nvhost_as_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span i } NvResult nvhost_as_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_as_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { switch (command.group) { case 'A': switch (command.cmd) { @@ -87,7 +87,7 @@ NvResult nvhost_as_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span i void nvhost_as_gpu::OnOpen(DeviceFD fd) {} void nvhost_as_gpu::OnClose(DeviceFD fd) {} -NvResult nvhost_as_gpu::AllocAsEx(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::AllocAsEx(std::span input, std::span output) { IoctlAllocAsEx params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -141,7 +141,7 @@ NvResult nvhost_as_gpu::AllocAsEx(std::span input, std::vector& ou return NvResult::Success; } -NvResult nvhost_as_gpu::AllocateSpace(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::AllocateSpace(std::span input, std::span output) { IoctlAllocSpace params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -220,7 +220,7 @@ void nvhost_as_gpu::FreeMappingLocked(u64 offset) { mapping_map.erase(offset); } -NvResult nvhost_as_gpu::FreeSpace(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::FreeSpace(std::span input, std::span output) { IoctlFreeSpace params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -266,15 +266,14 @@ NvResult nvhost_as_gpu::FreeSpace(std::span input, std::vector& ou return NvResult::Success; } -NvResult nvhost_as_gpu::Remap(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::Remap(std::span input, std::span output) { const auto num_entries = input.size() / sizeof(IoctlRemapEntry); LOG_DEBUG(Service_NVDRV, "called, num_entries=0x{:X}", num_entries); - std::vector entries(num_entries); - std::memcpy(entries.data(), input.data(), input.size()); - std::scoped_lock lock(mutex); + entries.resize_destructive(num_entries); + std::memcpy(entries.data(), input.data(), input.size()); if (!vm.initialised) { return NvResult::BadValue; @@ -320,7 +319,7 @@ NvResult nvhost_as_gpu::Remap(std::span input, std::vector& output return NvResult::Success; } -NvResult nvhost_as_gpu::MapBufferEx(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::MapBufferEx(std::span input, std::span output) { IoctlMapBufferEx params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -424,7 +423,7 @@ NvResult nvhost_as_gpu::MapBufferEx(std::span input, std::vector& return NvResult::Success; } -NvResult nvhost_as_gpu::UnmapBuffer(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::UnmapBuffer(std::span input, std::span output) { IoctlUnmapBuffer params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -463,7 +462,7 @@ NvResult nvhost_as_gpu::UnmapBuffer(std::span input, std::vector& return NvResult::Success; } -NvResult nvhost_as_gpu::BindChannel(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::BindChannel(std::span input, std::span output) { IoctlBindChannel params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, fd={:X}", params.fd); @@ -492,7 +491,7 @@ void nvhost_as_gpu::GetVARegionsImpl(IoctlGetVaRegions& params) { }; } -NvResult nvhost_as_gpu::GetVARegions(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::GetVARegions(std::span input, std::span output) { IoctlGetVaRegions params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -511,8 +510,8 @@ NvResult nvhost_as_gpu::GetVARegions(std::span input, std::vector& return NvResult::Success; } -NvResult nvhost_as_gpu::GetVARegions(std::span input, std::vector& output, - std::vector& inline_output) { +NvResult nvhost_as_gpu::GetVARegions(std::span input, std::span output, + std::span inline_output) { IoctlGetVaRegions params{}; std::memcpy(¶ms, input.data(), input.size()); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h index 1aba8d579..2af3e1260 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h @@ -15,6 +15,7 @@ #include "common/address_space.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "common/scratch_buffer.h" #include "common/swap.h" #include "core/hle/service/nvdrv/core/nvmap.h" #include "core/hle/service/nvdrv/devices/nvdevice.h" @@ -48,11 +49,11 @@ public: ~nvhost_as_gpu() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -138,18 +139,18 @@ private: static_assert(sizeof(IoctlGetVaRegions) == 16 + sizeof(VaRegion) * 2, "IoctlGetVaRegions is incorrect size"); - NvResult AllocAsEx(std::span input, std::vector& output); - NvResult AllocateSpace(std::span input, std::vector& output); - NvResult Remap(std::span input, std::vector& output); - NvResult MapBufferEx(std::span input, std::vector& output); - NvResult UnmapBuffer(std::span input, std::vector& output); - NvResult FreeSpace(std::span input, std::vector& output); - NvResult BindChannel(std::span input, std::vector& output); + NvResult AllocAsEx(std::span input, std::span output); + NvResult AllocateSpace(std::span input, std::span output); + NvResult Remap(std::span input, std::span output); + NvResult MapBufferEx(std::span input, std::span output); + NvResult UnmapBuffer(std::span input, std::span output); + NvResult FreeSpace(std::span input, std::span output); + NvResult BindChannel(std::span input, std::span output); void GetVARegionsImpl(IoctlGetVaRegions& params); - NvResult GetVARegions(std::span input, std::vector& output); - NvResult GetVARegions(std::span input, std::vector& output, - std::vector& inline_output); + NvResult GetVARegions(std::span input, std::span output); + NvResult GetVARegions(std::span input, std::span output, + std::span inline_output); void FreeMappingLocked(u64 offset); @@ -212,6 +213,7 @@ private: bool initialised{}; } vm; std::shared_ptr gmmu; + Common::ScratchBuffer entries; // s32 channel{}; // u32 big_page_size{VM::DEFAULT_BIG_PAGE_SIZE}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp index e12025560..4d55554b4 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp @@ -35,7 +35,7 @@ nvhost_ctrl::~nvhost_ctrl() { } NvResult nvhost_ctrl::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x0: switch (command.cmd) { @@ -64,13 +64,13 @@ NvResult nvhost_ctrl::Ioctl1(DeviceFD fd, Ioctl command, std::span inp } NvResult nvhost_ctrl::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_ctrl::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_outpu) { + std::span output, std::span inline_outpu) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } @@ -79,7 +79,7 @@ void nvhost_ctrl::OnOpen(DeviceFD fd) {} void nvhost_ctrl::OnClose(DeviceFD fd) {} -NvResult nvhost_ctrl::NvOsGetConfigU32(std::span input, std::vector& output) { +NvResult nvhost_ctrl::NvOsGetConfigU32(std::span input, std::span output) { IocGetConfigParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); LOG_TRACE(Service_NVDRV, "called, setting={}!{}", params.domain_str.data(), @@ -87,7 +87,7 @@ NvResult nvhost_ctrl::NvOsGetConfigU32(std::span input, std::vector input, std::vector& output, +NvResult nvhost_ctrl::IocCtrlEventWait(std::span input, std::span output, bool is_allocation) { IocCtrlEventWaitParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); @@ -231,7 +231,7 @@ NvResult nvhost_ctrl::FreeEvent(u32 slot) { return NvResult::Success; } -NvResult nvhost_ctrl::IocCtrlEventRegister(std::span input, std::vector& output) { +NvResult nvhost_ctrl::IocCtrlEventRegister(std::span input, std::span output) { IocCtrlEventRegisterParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); const u32 event_id = params.user_event_id; @@ -252,7 +252,7 @@ NvResult nvhost_ctrl::IocCtrlEventRegister(std::span input, std::vecto return NvResult::Success; } -NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span input, std::vector& output) { +NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span input, std::span output) { IocCtrlEventUnregisterParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); const u32 event_id = params.user_event_id & 0x00FF; @@ -262,8 +262,7 @@ NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span input, std::vec return FreeEvent(event_id); } -NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span input, - std::vector& output) { +NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span input, std::span output) { IocCtrlEventUnregisterBatchParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); u64 event_mask = params.user_events; @@ -281,7 +280,7 @@ NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span input, return NvResult::Success; } -NvResult nvhost_ctrl::IocCtrlClearEventWait(std::span input, std::vector& output) { +NvResult nvhost_ctrl::IocCtrlClearEventWait(std::span input, std::span output) { IocCtrlEventClearParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h index dd2e7888a..2efed4862 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h @@ -26,11 +26,11 @@ public: ~nvhost_ctrl() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -186,13 +186,12 @@ private: static_assert(sizeof(IocCtrlEventUnregisterBatchParams) == 8, "IocCtrlEventKill is incorrect size"); - NvResult NvOsGetConfigU32(std::span input, std::vector& output); - NvResult IocCtrlEventWait(std::span input, std::vector& output, - bool is_allocation); - NvResult IocCtrlEventRegister(std::span input, std::vector& output); - NvResult IocCtrlEventUnregister(std::span input, std::vector& output); - NvResult IocCtrlEventUnregisterBatch(std::span input, std::vector& output); - NvResult IocCtrlClearEventWait(std::span input, std::vector& output); + NvResult NvOsGetConfigU32(std::span input, std::span output); + NvResult IocCtrlEventWait(std::span input, std::span output, bool is_allocation); + NvResult IocCtrlEventRegister(std::span input, std::span output); + NvResult IocCtrlEventUnregister(std::span input, std::span output); + NvResult IocCtrlEventUnregisterBatch(std::span input, std::span output); + NvResult IocCtrlClearEventWait(std::span input, std::span output); NvResult FreeEvent(u32 slot); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp index be3c083db..6081d92e9 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp @@ -22,7 +22,7 @@ nvhost_ctrl_gpu::~nvhost_ctrl_gpu() { } NvResult nvhost_ctrl_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 'G': switch (command.cmd) { @@ -54,13 +54,13 @@ NvResult nvhost_ctrl_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span } NvResult nvhost_ctrl_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_ctrl_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { switch (command.group) { case 'G': switch (command.cmd) { @@ -82,7 +82,7 @@ NvResult nvhost_ctrl_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span void nvhost_ctrl_gpu::OnOpen(DeviceFD fd) {} void nvhost_ctrl_gpu::OnClose(DeviceFD fd) {} -NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlCharacteristics params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -127,8 +127,8 @@ NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::vec return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::vector& output, - std::vector& inline_output) { +NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::span output, + std::span inline_output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlCharacteristics params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -175,7 +175,7 @@ NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::vec return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::span output) { IoctlGpuGetTpcMasksArgs params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size); @@ -186,8 +186,8 @@ NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::vector& output, - std::vector& inline_output) { +NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::span output, + std::span inline_output) { IoctlGpuGetTpcMasksArgs params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size); @@ -199,7 +199,7 @@ NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlActiveSlotMask params{}; @@ -212,7 +212,7 @@ NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span input, std::vect return NvResult::Success; } -NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlZcullGetCtxSize params{}; @@ -224,7 +224,7 @@ NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlNvgpuGpuZcullGetInfoArgs params{}; @@ -247,7 +247,7 @@ NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span input, std::vector input, std::vector& output) { +NvResult nvhost_ctrl_gpu::ZBCSetTable(std::span input, std::span output) { LOG_WARNING(Service_NVDRV, "(STUBBED) called"); IoctlZbcSetTable params{}; @@ -263,7 +263,7 @@ NvResult nvhost_ctrl_gpu::ZBCSetTable(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span input, std::span output) { LOG_WARNING(Service_NVDRV, "(STUBBED) called"); IoctlZbcQueryTable params{}; @@ -273,7 +273,7 @@ NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span input, std::vector input, std::vector& output) { +NvResult nvhost_ctrl_gpu::FlushL2(std::span input, std::span output) { LOG_WARNING(Service_NVDRV, "(STUBBED) called"); IoctlFlushL2 params{}; @@ -283,7 +283,7 @@ NvResult nvhost_ctrl_gpu::FlushL2(std::span input, std::vector& ou return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetGpuTime(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::GetGpuTime(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlGetGpuTime params{}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h index b9333d9d3..97995551c 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h @@ -22,11 +22,11 @@ public: ~nvhost_ctrl_gpu() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -151,21 +151,21 @@ private: }; static_assert(sizeof(IoctlGetGpuTime) == 0x10, "IoctlGetGpuTime is incorrect size"); - NvResult GetCharacteristics(std::span input, std::vector& output); - NvResult GetCharacteristics(std::span input, std::vector& output, - std::vector& inline_output); - - NvResult GetTPCMasks(std::span input, std::vector& output); - NvResult GetTPCMasks(std::span input, std::vector& output, - std::vector& inline_output); - - NvResult GetActiveSlotMask(std::span input, std::vector& output); - NvResult ZCullGetCtxSize(std::span input, std::vector& output); - NvResult ZCullGetInfo(std::span input, std::vector& output); - NvResult ZBCSetTable(std::span input, std::vector& output); - NvResult ZBCQueryTable(std::span input, std::vector& output); - NvResult FlushL2(std::span input, std::vector& output); - NvResult GetGpuTime(std::span input, std::vector& output); + NvResult GetCharacteristics(std::span input, std::span output); + NvResult GetCharacteristics(std::span input, std::span output, + std::span inline_output); + + NvResult GetTPCMasks(std::span input, std::span output); + NvResult GetTPCMasks(std::span input, std::span output, + std::span inline_output); + + NvResult GetActiveSlotMask(std::span input, std::span output); + NvResult ZCullGetCtxSize(std::span input, std::span output); + NvResult ZCullGetInfo(std::span input, std::span output); + NvResult ZBCSetTable(std::span input, std::span output); + NvResult ZBCQueryTable(std::span input, std::span output); + NvResult FlushL2(std::span input, std::span output); + NvResult GetGpuTime(std::span input, std::span output); EventInterface& events_interface; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 453a965dc..46a25fcab 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -47,7 +47,7 @@ nvhost_gpu::~nvhost_gpu() { } NvResult nvhost_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x0: switch (command.cmd) { @@ -99,7 +99,7 @@ NvResult nvhost_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span inpu }; NvResult nvhost_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { switch (command.group) { case 'H': switch (command.cmd) { @@ -113,7 +113,7 @@ NvResult nvhost_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span inpu } NvResult nvhost_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } @@ -121,7 +121,7 @@ NvResult nvhost_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span inpu void nvhost_gpu::OnOpen(DeviceFD fd) {} void nvhost_gpu::OnClose(DeviceFD fd) {} -NvResult nvhost_gpu::SetNVMAPfd(std::span input, std::vector& output) { +NvResult nvhost_gpu::SetNVMAPfd(std::span input, std::span output) { IoctlSetNvmapFD params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd); @@ -130,7 +130,7 @@ NvResult nvhost_gpu::SetNVMAPfd(std::span input, std::vector& outp return NvResult::Success; } -NvResult nvhost_gpu::SetClientData(std::span input, std::vector& output) { +NvResult nvhost_gpu::SetClientData(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlClientData params{}; @@ -139,7 +139,7 @@ NvResult nvhost_gpu::SetClientData(std::span input, std::vector& o return NvResult::Success; } -NvResult nvhost_gpu::GetClientData(std::span input, std::vector& output) { +NvResult nvhost_gpu::GetClientData(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlClientData params{}; @@ -149,7 +149,7 @@ NvResult nvhost_gpu::GetClientData(std::span input, std::vector& o return NvResult::Success; } -NvResult nvhost_gpu::ZCullBind(std::span input, std::vector& output) { +NvResult nvhost_gpu::ZCullBind(std::span input, std::span output) { std::memcpy(&zcull_params, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, gpu_va={:X}, mode={:X}", zcull_params.gpu_va, zcull_params.mode); @@ -158,7 +158,7 @@ NvResult nvhost_gpu::ZCullBind(std::span input, std::vector& outpu return NvResult::Success; } -NvResult nvhost_gpu::SetErrorNotifier(std::span input, std::vector& output) { +NvResult nvhost_gpu::SetErrorNotifier(std::span input, std::span output) { IoctlSetErrorNotifier params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_WARNING(Service_NVDRV, "(STUBBED) called, offset={:X}, size={:X}, mem={:X}", params.offset, @@ -168,14 +168,14 @@ NvResult nvhost_gpu::SetErrorNotifier(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_gpu::SetChannelPriority(std::span input, std::vector& output) { +NvResult nvhost_gpu::SetChannelPriority(std::span input, std::span output) { std::memcpy(&channel_priority, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "(STUBBED) called, priority={:X}", channel_priority); return NvResult::Success; } -NvResult nvhost_gpu::AllocGPFIFOEx2(std::span input, std::vector& output) { +NvResult nvhost_gpu::AllocGPFIFOEx2(std::span input, std::span output) { IoctlAllocGpfifoEx2 params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_WARNING(Service_NVDRV, @@ -197,7 +197,7 @@ NvResult nvhost_gpu::AllocGPFIFOEx2(std::span input, std::vector& return NvResult::Success; } -NvResult nvhost_gpu::AllocateObjectContext(std::span input, std::vector& output) { +NvResult nvhost_gpu::AllocateObjectContext(std::span input, std::span output) { IoctlAllocObjCtx params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_WARNING(Service_NVDRV, "(STUBBED) called, class_num={:X}, flags={:X}", params.class_num, @@ -208,7 +208,8 @@ NvResult nvhost_gpu::AllocateObjectContext(std::span input, std::vecto return NvResult::Success; } -static std::vector BuildWaitCommandList(NvFence fence) { +static boost::container::small_vector BuildWaitCommandList( + NvFence fence) { return { Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1, Tegra::SubmissionMode::Increasing), @@ -219,35 +220,35 @@ static std::vector BuildWaitCommandList(NvFence fence) { }; } -static std::vector BuildIncrementCommandList(NvFence fence) { - std::vector result{ +static boost::container::small_vector BuildIncrementCommandList( + NvFence fence) { + boost::container::small_vector result{ Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1, Tegra::SubmissionMode::Increasing), {}}; for (u32 count = 0; count < 2; ++count) { - result.emplace_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1, - Tegra::SubmissionMode::Increasing)); - result.emplace_back( + result.push_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1, + Tegra::SubmissionMode::Increasing)); + result.push_back( BuildFenceAction(Tegra::Engines::Puller::FenceOperation::Increment, fence.id)); } return result; } -static std::vector BuildIncrementWithWfiCommandList(NvFence fence) { - std::vector result{ +static boost::container::small_vector BuildIncrementWithWfiCommandList( + NvFence fence) { + boost::container::small_vector result{ Tegra::BuildCommandHeader(Tegra::BufferMethods::WaitForIdle, 1, Tegra::SubmissionMode::Increasing), {}}; - const std::vector increment{BuildIncrementCommandList(fence)}; - - result.insert(result.end(), increment.begin(), increment.end()); - + auto increment_list{BuildIncrementCommandList(fence)}; + result.insert(result.end(), increment_list.begin(), increment_list.end()); return result; } -NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector& output, +NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::span output, Tegra::CommandList&& entries) { LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, params.num_entries, params.flags.raw); @@ -293,7 +294,7 @@ NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector return NvResult::Success; } -NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::vector& output, +NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::span output, bool kickoff) { if (input.size() < sizeof(IoctlSubmitGpfifo)) { UNIMPLEMENTED(); @@ -315,7 +316,7 @@ NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::vector } NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::span input_inline, - std::vector& output) { + std::span output) { if (input.size() < sizeof(IoctlSubmitGpfifo)) { UNIMPLEMENTED(); return NvResult::InvalidSize; @@ -327,7 +328,7 @@ NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::span input, std::vector& output) { +NvResult nvhost_gpu::GetWaitbase(std::span input, std::span output) { IoctlGetWaitbase params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlGetWaitbase)); LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown); @@ -337,7 +338,7 @@ NvResult nvhost_gpu::GetWaitbase(std::span input, std::vector& out return NvResult::Success; } -NvResult nvhost_gpu::ChannelSetTimeout(std::span input, std::vector& output) { +NvResult nvhost_gpu::ChannelSetTimeout(std::span input, std::span output) { IoctlChannelSetTimeout params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlChannelSetTimeout)); LOG_INFO(Service_NVDRV, "called, timeout=0x{:X}", params.timeout); @@ -345,7 +346,7 @@ NvResult nvhost_gpu::ChannelSetTimeout(std::span input, std::vector input, std::vector& output) { +NvResult nvhost_gpu::ChannelSetTimeslice(std::span input, std::span output) { IoctlSetTimeslice params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlSetTimeslice)); LOG_INFO(Service_NVDRV, "called, timeslice=0x{:X}", params.timeslice); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h index 3ca58202d..529c20526 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h @@ -41,11 +41,11 @@ public: ~nvhost_gpu() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -186,23 +186,23 @@ private: u32_le channel_priority{}; u32_le channel_timeslice{}; - NvResult SetNVMAPfd(std::span input, std::vector& output); - NvResult SetClientData(std::span input, std::vector& output); - NvResult GetClientData(std::span input, std::vector& output); - NvResult ZCullBind(std::span input, std::vector& output); - NvResult SetErrorNotifier(std::span input, std::vector& output); - NvResult SetChannelPriority(std::span input, std::vector& output); - NvResult AllocGPFIFOEx2(std::span input, std::vector& output); - NvResult AllocateObjectContext(std::span input, std::vector& output); - NvResult SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector& output, + NvResult SetNVMAPfd(std::span input, std::span output); + NvResult SetClientData(std::span input, std::span output); + NvResult GetClientData(std::span input, std::span output); + NvResult ZCullBind(std::span input, std::span output); + NvResult SetErrorNotifier(std::span input, std::span output); + NvResult SetChannelPriority(std::span input, std::span output); + NvResult AllocGPFIFOEx2(std::span input, std::span output); + NvResult AllocateObjectContext(std::span input, std::span output); + NvResult SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::span output, Tegra::CommandList&& entries); - NvResult SubmitGPFIFOBase(std::span input, std::vector& output, + NvResult SubmitGPFIFOBase(std::span input, std::span output, bool kickoff = false); NvResult SubmitGPFIFOBase(std::span input, std::span input_inline, - std::vector& output); - NvResult GetWaitbase(std::span input, std::vector& output); - NvResult ChannelSetTimeout(std::span input, std::vector& output); - NvResult ChannelSetTimeslice(std::span input, std::vector& output); + std::span output); + NvResult GetWaitbase(std::span input, std::span output); + NvResult ChannelSetTimeout(std::span input, std::span output); + NvResult ChannelSetTimeslice(std::span input, std::span output); EventInterface& events_interface; NvCore::Container& core; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index dc45169ad..a174442a6 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp @@ -16,7 +16,7 @@ nvhost_nvdec::nvhost_nvdec(Core::System& system_, NvCore::Container& core_) nvhost_nvdec::~nvhost_nvdec() = default; NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x0: switch (command.cmd) { @@ -56,13 +56,13 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span in } NvResult nvhost_nvdec::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_nvdec::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h index 0d615bbcb..ad2233c49 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h @@ -14,11 +14,11 @@ public: ~nvhost_nvdec() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp index 1ab51f10b..61649aa4a 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp @@ -36,7 +36,7 @@ std::size_t SliceVectors(std::span input, std::vector& dst, std::si // Writes the data in src to an offset into the dst vector. The offset is specified in bytes // Returns the number of bytes written into dst. template -std::size_t WriteVectors(std::vector& dst, const std::vector& src, std::size_t offset) { +std::size_t WriteVectors(std::span dst, const std::vector& src, std::size_t offset) { if (src.empty()) { return 0; } @@ -72,8 +72,7 @@ NvResult nvhost_nvdec_common::SetNVMAPfd(std::span input) { return NvResult::Success; } -NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span input, - std::vector& output) { +NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span input, std::span output) { IoctlSubmit params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlSubmit)); LOG_DEBUG(Service_NVDRV, "called NVDEC Submit, cmd_buffer_count={}", params.cmd_buffer_count); @@ -121,7 +120,7 @@ NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span input, return NvResult::Success; } -NvResult nvhost_nvdec_common::GetSyncpoint(std::span input, std::vector& output) { +NvResult nvhost_nvdec_common::GetSyncpoint(std::span input, std::span output) { IoctlGetSyncpoint params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlGetSyncpoint)); LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", params.param); @@ -133,7 +132,7 @@ NvResult nvhost_nvdec_common::GetSyncpoint(std::span input, std::vecto return NvResult::Success; } -NvResult nvhost_nvdec_common::GetWaitbase(std::span input, std::vector& output) { +NvResult nvhost_nvdec_common::GetWaitbase(std::span input, std::span output) { IoctlGetWaitbase params{}; LOG_CRITICAL(Service_NVDRV, "called WAITBASE"); std::memcpy(¶ms, input.data(), sizeof(IoctlGetWaitbase)); @@ -142,7 +141,7 @@ NvResult nvhost_nvdec_common::GetWaitbase(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_nvdec_common::MapBuffer(std::span input, std::vector& output) { +NvResult nvhost_nvdec_common::MapBuffer(std::span input, std::span output) { IoctlMapBuffer params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlMapBuffer)); std::vector cmd_buffer_handles(params.num_entries); @@ -159,7 +158,7 @@ NvResult nvhost_nvdec_common::MapBuffer(std::span input, std::vector input, std::vector& output) { +NvResult nvhost_nvdec_common::UnmapBuffer(std::span input, std::span output) { IoctlMapBuffer params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlMapBuffer)); std::vector cmd_buffer_handles(params.num_entries); @@ -173,7 +172,7 @@ NvResult nvhost_nvdec_common::UnmapBuffer(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_nvdec_common::SetSubmitTimeout(std::span input, std::vector& output) { +NvResult nvhost_nvdec_common::SetSubmitTimeout(std::span input, std::span output) { std::memcpy(&submit_timeout, input.data(), input.size()); LOG_WARNING(Service_NVDRV, "(STUBBED) called"); return NvResult::Success; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h index 5af26a26f..9bb573bfe 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h @@ -108,12 +108,12 @@ protected: /// Ioctl command implementations NvResult SetNVMAPfd(std::span input); - NvResult Submit(DeviceFD fd, std::span input, std::vector& output); - NvResult GetSyncpoint(std::span input, std::vector& output); - NvResult GetWaitbase(std::span input, std::vector& output); - NvResult MapBuffer(std::span input, std::vector& output); - NvResult UnmapBuffer(std::span input, std::vector& output); - NvResult SetSubmitTimeout(std::span input, std::vector& output); + NvResult Submit(DeviceFD fd, std::span input, std::span output); + NvResult GetSyncpoint(std::span input, std::span output); + NvResult GetWaitbase(std::span input, std::span output); + NvResult MapBuffer(std::span input, std::span output); + NvResult UnmapBuffer(std::span input, std::span output); + NvResult SetSubmitTimeout(std::span input, std::span output); Kernel::KEvent* QueryEvent(u32 event_id) override; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp index 39f30e7c8..a05c8cdae 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp @@ -13,7 +13,7 @@ nvhost_nvjpg::nvhost_nvjpg(Core::System& system_) : nvdevice{system_} {} nvhost_nvjpg::~nvhost_nvjpg() = default; NvResult nvhost_nvjpg::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 'H': switch (command.cmd) { @@ -32,13 +32,13 @@ NvResult nvhost_nvjpg::Ioctl1(DeviceFD fd, Ioctl command, std::span in } NvResult nvhost_nvjpg::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_nvjpg::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } @@ -46,7 +46,7 @@ NvResult nvhost_nvjpg::Ioctl3(DeviceFD fd, Ioctl command, std::span in void nvhost_nvjpg::OnOpen(DeviceFD fd) {} void nvhost_nvjpg::OnClose(DeviceFD fd) {} -NvResult nvhost_nvjpg::SetNVMAPfd(std::span input, std::vector& output) { +NvResult nvhost_nvjpg::SetNVMAPfd(std::span input, std::span output) { IoctlSetNvmapFD params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h index 41b57e872..5623e0d47 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h @@ -16,11 +16,11 @@ public: ~nvhost_nvjpg() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -33,7 +33,7 @@ private: s32_le nvmap_fd{}; - NvResult SetNVMAPfd(std::span input, std::vector& output); + NvResult SetNVMAPfd(std::span input, std::span output); }; } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index b0ea402a7..c0b8684c3 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp @@ -16,7 +16,7 @@ nvhost_vic::nvhost_vic(Core::System& system_, NvCore::Container& core_) nvhost_vic::~nvhost_vic() = default; NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x0: switch (command.cmd) { @@ -56,13 +56,13 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span inpu } NvResult nvhost_vic::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.h b/src/core/hle/service/nvdrv/devices/nvhost_vic.h index b5e350a83..cadbcb0a5 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h @@ -13,11 +13,11 @@ public: ~nvhost_vic(); NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; diff --git a/src/core/hle/service/nvdrv/devices/nvmap.cpp b/src/core/hle/service/nvdrv/devices/nvmap.cpp index 07417f045..e7f7e273b 100644 --- a/src/core/hle/service/nvdrv/devices/nvmap.cpp +++ b/src/core/hle/service/nvdrv/devices/nvmap.cpp @@ -26,7 +26,7 @@ nvmap::nvmap(Core::System& system_, NvCore::Container& container_) nvmap::~nvmap() = default; NvResult nvmap::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x1: switch (command.cmd) { @@ -55,13 +55,13 @@ NvResult nvmap::Ioctl1(DeviceFD fd, Ioctl command, std::span input, } NvResult nvmap::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } -NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { +NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } @@ -69,7 +69,7 @@ NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span input, void nvmap::OnOpen(DeviceFD fd) {} void nvmap::OnClose(DeviceFD fd) {} -NvResult nvmap::IocCreate(std::span input, std::vector& output) { +NvResult nvmap::IocCreate(std::span input, std::span output) { IocCreateParams params; std::memcpy(¶ms, input.data(), sizeof(params)); LOG_DEBUG(Service_NVDRV, "called, size=0x{:08X}", params.size); @@ -89,7 +89,7 @@ NvResult nvmap::IocCreate(std::span input, std::vector& output) { return NvResult::Success; } -NvResult nvmap::IocAlloc(std::span input, std::vector& output) { +NvResult nvmap::IocAlloc(std::span input, std::span output) { IocAllocParams params; std::memcpy(¶ms, input.data(), sizeof(params)); LOG_DEBUG(Service_NVDRV, "called, addr={:X}", params.address); @@ -137,7 +137,7 @@ NvResult nvmap::IocAlloc(std::span input, std::vector& output) { return result; } -NvResult nvmap::IocGetId(std::span input, std::vector& output) { +NvResult nvmap::IocGetId(std::span input, std::span output) { IocGetIdParams params; std::memcpy(¶ms, input.data(), sizeof(params)); @@ -161,7 +161,7 @@ NvResult nvmap::IocGetId(std::span input, std::vector& output) { return NvResult::Success; } -NvResult nvmap::IocFromId(std::span input, std::vector& output) { +NvResult nvmap::IocFromId(std::span input, std::span output) { IocFromIdParams params; std::memcpy(¶ms, input.data(), sizeof(params)); @@ -192,7 +192,7 @@ NvResult nvmap::IocFromId(std::span input, std::vector& output) { return NvResult::Success; } -NvResult nvmap::IocParam(std::span input, std::vector& output) { +NvResult nvmap::IocParam(std::span input, std::span output) { enum class ParamTypes { Size = 1, Alignment = 2, Base = 3, Heap = 4, Kind = 5, Compr = 6 }; IocParamParams params; @@ -241,7 +241,7 @@ NvResult nvmap::IocParam(std::span input, std::vector& output) { return NvResult::Success; } -NvResult nvmap::IocFree(std::span input, std::vector& output) { +NvResult nvmap::IocFree(std::span input, std::span output) { IocFreeParams params; std::memcpy(¶ms, input.data(), sizeof(params)); diff --git a/src/core/hle/service/nvdrv/devices/nvmap.h b/src/core/hle/service/nvdrv/devices/nvmap.h index 82bd3b118..40c65b430 100644 --- a/src/core/hle/service/nvdrv/devices/nvmap.h +++ b/src/core/hle/service/nvdrv/devices/nvmap.h @@ -27,11 +27,11 @@ public: nvmap& operator=(const nvmap&) = delete; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -106,12 +106,12 @@ private: }; static_assert(sizeof(IocGetIdParams) == 8, "IocGetIdParams has wrong size"); - NvResult IocCreate(std::span input, std::vector& output); - NvResult IocAlloc(std::span input, std::vector& output); - NvResult IocGetId(std::span input, std::vector& output); - NvResult IocFromId(std::span input, std::vector& output); - NvResult IocParam(std::span input, std::vector& output); - NvResult IocFree(std::span input, std::vector& output); + NvResult IocCreate(std::span input, std::span output); + NvResult IocAlloc(std::span input, std::span output); + NvResult IocGetId(std::span input, std::span output); + NvResult IocFromId(std::span input, std::span output); + NvResult IocParam(std::span input, std::span output); + NvResult IocFree(std::span input, std::span output); NvCore::Container& container; NvCore::NvMap& file; diff --git a/src/core/hle/service/nvdrv/nvdrv.cpp b/src/core/hle/service/nvdrv/nvdrv.cpp index 3d774eec4..9e46ee8dd 100644 --- a/src/core/hle/service/nvdrv/nvdrv.cpp +++ b/src/core/hle/service/nvdrv/nvdrv.cpp @@ -130,7 +130,7 @@ DeviceFD Module::Open(const std::string& device_name) { } NvResult Module::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { if (fd < 0) { LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd); return NvResult::InvalidState; @@ -147,7 +147,7 @@ NvResult Module::Ioctl1(DeviceFD fd, Ioctl command, std::span input, } NvResult Module::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { if (fd < 0) { LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd); return NvResult::InvalidState; @@ -163,8 +163,8 @@ NvResult Module::Ioctl2(DeviceFD fd, Ioctl command, std::span input, return itr->second->Ioctl2(fd, command, input, inline_input, output); } -NvResult Module::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { +NvResult Module::Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) { if (fd < 0) { LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd); return NvResult::InvalidState; diff --git a/src/core/hle/service/nvdrv/nvdrv.h b/src/core/hle/service/nvdrv/nvdrv.h index 668be742b..d8622b3ca 100644 --- a/src/core/hle/service/nvdrv/nvdrv.h +++ b/src/core/hle/service/nvdrv/nvdrv.h @@ -80,13 +80,13 @@ public: DeviceFD Open(const std::string& device_name); /// Sends an ioctl command to the specified file descriptor. - NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, std::vector& output); + NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, std::span output); NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output); + std::span inline_input, std::span output); - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output); + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output); /// Closes a device file descriptor and returns operation success. NvResult Close(DeviceFD fd); diff --git a/src/core/hle/service/nvdrv/nvdrv_interface.cpp b/src/core/hle/service/nvdrv/nvdrv_interface.cpp index d010a1e03..348207e25 100644 --- a/src/core/hle/service/nvdrv/nvdrv_interface.cpp +++ b/src/core/hle/service/nvdrv/nvdrv_interface.cpp @@ -63,12 +63,12 @@ void NVDRV::Ioctl1(HLERequestContext& ctx) { } // Check device - std::vector output_buffer(ctx.GetWriteBufferSize(0)); + tmp_output.resize_destructive(ctx.GetWriteBufferSize(0)); const auto input_buffer = ctx.ReadBuffer(0); - const auto nv_result = nvdrv->Ioctl1(fd, command, input_buffer, output_buffer); + const auto nv_result = nvdrv->Ioctl1(fd, command, input_buffer, tmp_output); if (command.is_out != 0) { - ctx.WriteBuffer(output_buffer); + ctx.WriteBuffer(tmp_output); } IPC::ResponseBuilder rb{ctx, 3}; @@ -90,12 +90,12 @@ void NVDRV::Ioctl2(HLERequestContext& ctx) { const auto input_buffer = ctx.ReadBuffer(0); const auto input_inlined_buffer = ctx.ReadBuffer(1); - std::vector output_buffer(ctx.GetWriteBufferSize(0)); + tmp_output.resize_destructive(ctx.GetWriteBufferSize(0)); const auto nv_result = - nvdrv->Ioctl2(fd, command, input_buffer, input_inlined_buffer, output_buffer); + nvdrv->Ioctl2(fd, command, input_buffer, input_inlined_buffer, tmp_output); if (command.is_out != 0) { - ctx.WriteBuffer(output_buffer); + ctx.WriteBuffer(tmp_output); } IPC::ResponseBuilder rb{ctx, 3}; @@ -116,14 +116,12 @@ void NVDRV::Ioctl3(HLERequestContext& ctx) { } const auto input_buffer = ctx.ReadBuffer(0); - std::vector output_buffer(ctx.GetWriteBufferSize(0)); - std::vector output_buffer_inline(ctx.GetWriteBufferSize(1)); - - const auto nv_result = - nvdrv->Ioctl3(fd, command, input_buffer, output_buffer, output_buffer_inline); + tmp_output.resize_destructive(ctx.GetWriteBufferSize(0)); + tmp_output_inline.resize_destructive(ctx.GetWriteBufferSize(1)); + const auto nv_result = nvdrv->Ioctl3(fd, command, input_buffer, tmp_output, tmp_output_inline); if (command.is_out != 0) { - ctx.WriteBuffer(output_buffer, 0); - ctx.WriteBuffer(output_buffer_inline, 1); + ctx.WriteBuffer(tmp_output, 0); + ctx.WriteBuffer(tmp_output_inline, 1); } IPC::ResponseBuilder rb{ctx, 3}; diff --git a/src/core/hle/service/nvdrv/nvdrv_interface.h b/src/core/hle/service/nvdrv/nvdrv_interface.h index 881ea1a6b..4b593ff90 100644 --- a/src/core/hle/service/nvdrv/nvdrv_interface.h +++ b/src/core/hle/service/nvdrv/nvdrv_interface.h @@ -4,6 +4,7 @@ #pragma once #include +#include "common/scratch_buffer.h" #include "core/hle/service/nvdrv/nvdrv.h" #include "core/hle/service/service.h" @@ -33,6 +34,8 @@ private: u64 pid{}; bool is_initialized{}; + Common::ScratchBuffer tmp_output; + Common::ScratchBuffer tmp_output_inline; }; } // namespace Service::Nvidia diff --git a/src/core/hle/service/nvnflinger/parcel.h b/src/core/hle/service/nvnflinger/parcel.h index fb56d75d7..23ba315a0 100644 --- a/src/core/hle/service/nvnflinger/parcel.h +++ b/src/core/hle/service/nvnflinger/parcel.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "common/alignment.h" #include "common/assert.h" @@ -167,7 +168,7 @@ public: private: template requires(std::is_trivially_copyable_v) - void WriteImpl(const T& val, std::vector& buffer) { + void WriteImpl(const T& val, boost::container::small_vector& buffer) { const size_t aligned_size = Common::AlignUp(sizeof(T), 4); const size_t old_size = buffer.size(); buffer.resize(old_size + aligned_size); @@ -176,8 +177,8 @@ private: } private: - std::vector m_data_buffer; - std::vector m_object_buffer; + boost::container::small_vector m_data_buffer; + boost::container::small_vector m_object_buffer; }; } // namespace Service::android diff --git a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp index c3c2281bb..9ff4028c2 100644 --- a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp +++ b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp @@ -479,7 +479,7 @@ void EmitContext::DefineGenericOutput(size_t index, u32 invocations) { const u32 remainder{4 - element}; const TransformFeedbackVarying* xfb_varying{}; const size_t xfb_varying_index{base_index + element}; - if (xfb_varying_index < runtime_info.xfb_varyings.size()) { + if (xfb_varying_index < runtime_info.xfb_count) { xfb_varying = &runtime_info.xfb_varyings[xfb_varying_index]; xfb_varying = xfb_varying->components > 0 ? xfb_varying : nullptr; } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 0f86a8004..34592a01f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -387,7 +387,7 @@ void SetupSignedNanCapabilities(const Profile& profile, const IR::Program& progr } void SetupTransformFeedbackCapabilities(EmitContext& ctx, Id main_func) { - if (ctx.runtime_info.xfb_varyings.empty()) { + if (ctx.runtime_info.xfb_count == 0) { return; } ctx.AddCapability(spv::Capability::TransformFeedback); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index fd15f47ea..bec5db173 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -160,7 +160,7 @@ void DefineGenericOutput(EmitContext& ctx, size_t index, std::optional invo const u32 remainder{4 - element}; const TransformFeedbackVarying* xfb_varying{}; const size_t xfb_varying_index{base_attr_index + element}; - if (xfb_varying_index < ctx.runtime_info.xfb_varyings.size()) { + if (xfb_varying_index < ctx.runtime_info.xfb_count) { xfb_varying = &ctx.runtime_info.xfb_varyings[xfb_varying_index]; xfb_varying = xfb_varying->components > 0 ? xfb_varying : nullptr; } diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 3b63c249f..619c0b138 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -84,7 +84,8 @@ struct RuntimeInfo { bool glasm_use_storage_buffers{}; /// Transform feedback state for each varying - std::vector xfb_varyings; + std::array xfb_varyings{}; + u32 xfb_count{0}; }; } // namespace Shader diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 45977d578..58a45ab67 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -207,7 +207,7 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am if (has_new_downloads) { memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } - tmp_buffer.resize(amount); + tmp_buffer.resize_destructive(amount); cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount); return true; @@ -1279,7 +1279,7 @@ template typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(VAddr cpu_addr, u32 wanted_size) { static constexpr int STREAM_LEAP_THRESHOLD = 16; - std::vector overlap_ids; + boost::container::small_vector overlap_ids; VAddr begin = cpu_addr; VAddr end = cpu_addr + wanted_size; int stream_score = 0; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 63a120f7a..fe6068cfe 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -229,7 +229,7 @@ class BufferCache : public VideoCommon::ChannelSetupCaches; struct OverlapResult { - std::vector ids; + boost::container::small_vector ids; VAddr begin; VAddr end; bool has_stream_leap = false; @@ -582,7 +582,7 @@ private: BufferId inline_buffer_id; std::array> CACHING_PAGEBITS)> page_table; - std::vector tmp_buffer; + Common::ScratchBuffer tmp_buffer; }; } // namespace VideoCommon diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 83112dfce..7d660af47 100644 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -63,7 +63,6 @@ struct ChCommand { }; using ChCommandHeaderList = std::vector; -using ChCommandList = std::vector; struct ThiRegisters { u32_le increment_syncpt{}; diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 1cdb690ed..8a2784cdc 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "common/bit_field.h" @@ -102,11 +103,12 @@ inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, Sub struct CommandList final { CommandList() = default; explicit CommandList(std::size_t size) : command_lists(size) {} - explicit CommandList(std::vector&& prefetch_command_list_) + explicit CommandList( + boost::container::small_vector&& prefetch_command_list_) : prefetch_command_list{std::move(prefetch_command_list_)} {} - std::vector command_lists; - std::vector prefetch_command_list; + boost::container::small_vector command_lists; + boost::container::small_vector prefetch_command_list; }; /** diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index ebe5536de..bc1eb41e7 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -108,9 +108,11 @@ void MaxwellDMA::Launch() { if (regs.launch_dma.remap_enable != 0 && is_const_a_dst) { ASSERT(regs.remap_const.component_size_minus_one == 3); accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value); - std::vector tmp_buffer(regs.line_length_in, regs.remap_consta_value); + read_buffer.resize_destructive(regs.line_length_in * sizeof(u32)); + std::span span(reinterpret_cast(read_buffer.data()), regs.line_length_in); + std::ranges::fill(span, regs.remap_consta_value); memory_manager.WriteBlockUnsafe(regs.offset_out, - reinterpret_cast(tmp_buffer.data()), + reinterpret_cast(read_buffer.data()), regs.line_length_in * sizeof(u32)); } else { memory_manager.FlushCaching(); @@ -126,32 +128,32 @@ void MaxwellDMA::Launch() { UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); - std::vector tmp_buffer(16); + read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { memory_manager.ReadBlockUnsafe( convert_linear_2_blocklinear_addr(regs.offset_in + offset), - tmp_buffer.data(), tmp_buffer.size()); - memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(), - tmp_buffer.size()); + read_buffer.data(), read_buffer.size()); + memory_manager.WriteBlockCached(regs.offset_out + offset, read_buffer.data(), + read_buffer.size()); } } else if (is_src_pitch && !is_dst_pitch) { UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); - std::vector tmp_buffer(16); + read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { - memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(), - tmp_buffer.size()); + memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), + read_buffer.size()); memory_manager.WriteBlockCached( convert_linear_2_blocklinear_addr(regs.offset_out + offset), - tmp_buffer.data(), tmp_buffer.size()); + read_buffer.data(), read_buffer.size()); } } else { if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { - std::vector tmp_buffer(regs.line_length_in); - memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), + read_buffer.resize_destructive(regs.line_length_in); + memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), regs.line_length_in); - memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(), + memory_manager.WriteBlockCached(regs.offset_out, read_buffer.data(), regs.line_length_in); } } @@ -171,7 +173,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() { src_operand.address = regs.offset_in; DMA::BufferOperand dst_operand; - dst_operand.pitch = regs.pitch_out; + u32 abs_pitch_out = std::abs(static_cast(regs.pitch_out)); + dst_operand.pitch = abs_pitch_out; dst_operand.width = regs.line_length_in; dst_operand.height = regs.line_count; dst_operand.address = regs.offset_out; @@ -218,7 +221,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); - const size_t dst_size = static_cast(regs.pitch_out) * regs.line_count; + const size_t dst_size = static_cast(abs_pitch_out) * regs.line_count; read_buffer.resize_destructive(src_size); write_buffer.resize_destructive(dst_size); @@ -227,7 +230,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, - regs.pitch_out); + abs_pitch_out); memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index 6ce179167..ce827eb6c 100644 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -4,6 +4,7 @@ #include #include +#include "common/scratch_buffer.h" #include "common/settings.h" #include "video_core/host1x/codecs/h264.h" #include "video_core/host1x/host1x.h" @@ -188,7 +189,8 @@ void H264BitWriter::WriteBit(bool state) { } void H264BitWriter::WriteScalingList(std::span list, s32 start, s32 count) { - std::vector scan(count); + static Common::ScratchBuffer scan{}; + scan.resize_destructive(count); if (count == 16) { std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); } else { diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index b2f7e160a..45141e488 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -587,7 +587,7 @@ void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size, void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size, VideoCommon::CacheType which) { - std::vector tmp_buffer(size); + tmp_buffer.resize_destructive(size); ReadBlock(gpu_src_addr, tmp_buffer.data(), size, which); // The output block must be flushed in case it has data modified from the GPU. @@ -670,9 +670,9 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons return result; } -std::vector> MemoryManager::GetSubmappedRange( - GPUVAddr gpu_addr, std::size_t size) const { - std::vector> result{}; +boost::container::small_vector, 32> +MemoryManager::GetSubmappedRange(GPUVAddr gpu_addr, std::size_t size) const { + boost::container::small_vector, 32> result{}; GetSubmappedRangeImpl(gpu_addr, size, result); return result; } @@ -680,8 +680,9 @@ std::vector> MemoryManager::GetSubmappedRange( template void MemoryManager::GetSubmappedRangeImpl( GPUVAddr gpu_addr, std::size_t size, - std::vector, std::size_t>>& - result) const { + boost::container::small_vector< + std::pair, std::size_t>, 32>& result) + const { std::optional, std::size_t>> last_segment{}; std::optional old_page_addr{}; diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 794535122..4202c26ff 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -8,10 +8,12 @@ #include #include #include +#include #include "common/common_types.h" #include "common/multi_level_page_table.h" #include "common/range_map.h" +#include "common/scratch_buffer.h" #include "common/virtual_buffer.h" #include "video_core/cache_types.h" #include "video_core/pte_kind.h" @@ -107,8 +109,8 @@ public: * if the region is continuous, a single pair will be returned. If it's unmapped, an empty * vector will be returned; */ - std::vector> GetSubmappedRange(GPUVAddr gpu_addr, - std::size_t size) const; + boost::container::small_vector, 32> GetSubmappedRange( + GPUVAddr gpu_addr, std::size_t size) const; GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, PTEKind kind = PTEKind::INVALID, bool is_big_pages = true); @@ -165,7 +167,8 @@ private: template void GetSubmappedRangeImpl( GPUVAddr gpu_addr, std::size_t size, - std::vector, std::size_t>>& + boost::container::small_vector< + std::pair, std::size_t>, 32>& result) const; Core::System& system; @@ -215,8 +218,8 @@ private: Common::VirtualBuffer big_page_table_cpu; std::vector big_page_continuous; - std::vector> page_stash{}; - std::vector> page_stash2{}; + boost::container::small_vector, 32> page_stash{}; + boost::container::small_vector, 32> page_stash2{}; mutable std::mutex guard; @@ -226,6 +229,8 @@ private: std::unique_ptr accumulator; static std::atomic unique_identifier_generator; + + Common::ScratchBuffer tmp_buffer; }; } // namespace Tegra diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 3f077311e..0329ed820 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -85,7 +85,9 @@ Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key, case Shader::Stage::VertexB: case Shader::Stage::Geometry: if (!use_assembly_shaders && key.xfb_enabled != 0) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state); + auto [varyings, count] = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } break; case Shader::Stage::TessellationEval: diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index e30fcb1ed..f47301ad5 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -361,7 +361,7 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, }; // Measuring a popular game, this number never exceeds the specified size once data is warmed up - boost::container::small_vector vk_copies(copies.size()); + boost::container::small_vector vk_copies(copies.size()); std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([src_buffer, dst_buffer, vk_copies, barrier](vk::CommandBuffer cmdbuf) { diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index a2cfb2105..9f316113c 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -167,7 +167,10 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span program info.fixed_state_point_size = point_size; } if (key.state.xfb_enabled) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + auto [varyings, count] = + VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } info.convert_depth_mode = gl_ndc; } @@ -214,7 +217,10 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span program info.fixed_state_point_size = point_size; } if (key.state.xfb_enabled != 0) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + auto [varyings, count] = + VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } info.convert_depth_mode = gl_ndc; break; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index f025f618b..f3cef09dd 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -330,9 +330,9 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { }; } -[[maybe_unused]] [[nodiscard]] std::vector TransformBufferCopies( - std::span copies, size_t buffer_offset) { - std::vector result(copies.size()); +[[maybe_unused]] [[nodiscard]] boost::container::small_vector +TransformBufferCopies(std::span copies, size_t buffer_offset) { + boost::container::small_vector result(copies.size()); std::ranges::transform( copies, result.begin(), [buffer_offset](const VideoCommon::BufferCopy& copy) { return VkBufferCopy{ @@ -344,7 +344,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { return result; } -[[nodiscard]] std::vector TransformBufferImageCopies( +[[nodiscard]] boost::container::small_vector TransformBufferImageCopies( std::span copies, size_t buffer_offset, VkImageAspectFlags aspect_mask) { struct Maker { VkBufferImageCopy operator()(const BufferImageCopy& copy) const { @@ -377,14 +377,14 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { VkImageAspectFlags aspect_mask; }; if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - std::vector result(copies.size() * 2); + boost::container::small_vector result(copies.size() * 2); std::ranges::transform(copies, result.begin(), Maker{buffer_offset, VK_IMAGE_ASPECT_DEPTH_BIT}); std::ranges::transform(copies, result.begin() + copies.size(), Maker{buffer_offset, VK_IMAGE_ASPECT_STENCIL_BIT}); return result; } else { - std::vector result(copies.size()); + boost::container::small_vector result(copies.size()); std::ranges::transform(copies, result.begin(), Maker{buffer_offset, aspect_mask}); return result; } @@ -867,8 +867,8 @@ void TextureCacheRuntime::BarrierFeedbackLoop() { void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src, std::span copies) { - std::vector vk_in_copies(copies.size()); - std::vector vk_out_copies(copies.size()); + boost::container::small_vector vk_in_copies(copies.size()); + boost::container::small_vector vk_out_copies(copies.size()); const VkImageAspectFlags src_aspect_mask = src.AspectMask(); const VkImageAspectFlags dst_aspect_mask = dst.AspectMask(); @@ -1157,7 +1157,7 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im void TextureCacheRuntime::CopyImage(Image& dst, Image& src, std::span copies) { - std::vector vk_copies(copies.size()); + boost::container::small_vector vk_copies(copies.size()); const VkImageAspectFlags aspect_mask = dst.AspectMask(); ASSERT(aspect_mask == src.AspectMask()); @@ -1332,7 +1332,7 @@ void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset, ScaleDown(true); } scheduler->RequestOutsideRenderPassOperationContext(); - std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); + auto vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); const VkBuffer src_buffer = buffer; const VkImage vk_image = *original_image; const VkImageAspectFlags vk_aspect_mask = aspect_mask; @@ -1367,8 +1367,9 @@ void Image::DownloadMemory(std::span buffers_span, std::span buffers_vector{}; - boost::container::small_vector, 1> vk_copies; + boost::container::small_vector buffers_vector{}; + boost::container::small_vector, 8> + vk_copies; for (size_t index = 0; index < buffers_span.size(); index++) { buffers_vector.emplace_back(buffers_span[index]); vk_copies.emplace_back( @@ -1858,7 +1859,7 @@ Framebuffer::~Framebuffer() = default; void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime, std::span color_buffers, ImageView* depth_buffer, bool is_rescaled) { - std::vector attachments; + boost::container::small_vector attachments; RenderPassKey renderpass_key{}; s32 num_layers = 1; diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index c5213875b..4db948b6d 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -151,11 +151,9 @@ void ShaderCache::RemovePendingShaders() { marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()), marked_for_removal.end()); - std::vector removed_shaders; - removed_shaders.reserve(marked_for_removal.size()); + boost::container::small_vector removed_shaders; std::scoped_lock lock{lookup_mutex}; - for (Entry* const entry : marked_for_removal) { removed_shaders.push_back(entry->data); diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 1b8a17ee8..55d49d017 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "common/common_funcs.h" #include "common/common_types.h" @@ -108,8 +109,8 @@ struct ImageBase { std::vector image_view_infos; std::vector image_view_ids; - std::vector slice_offsets; - std::vector slice_subresources; + boost::container::small_vector slice_offsets; + boost::container::small_vector slice_subresources; std::vector aliased_images; std::vector overlapping_images; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d58bb69ff..d3f03a995 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -526,7 +526,7 @@ void TextureCache

::WriteMemory(VAddr cpu_addr, size_t size) { template void TextureCache

::DownloadMemory(VAddr cpu_addr, size_t size) { - std::vector images; + boost::container::small_vector images; ForEachImageInRegion(cpu_addr, size, [&images](ImageId image_id, ImageBase& image) { if (!image.IsSafeDownload()) { return; @@ -579,7 +579,7 @@ std::optional TextureCache

::GetFlushArea(V template void TextureCache

::UnmapMemory(VAddr cpu_addr, size_t size) { - std::vector deleted_images; + boost::container::small_vector deleted_images; ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { Image& image = slot_images[id]; @@ -593,7 +593,7 @@ void TextureCache

::UnmapMemory(VAddr cpu_addr, size_t size) { template void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) { - std::vector deleted_images; + boost::container::small_vector deleted_images; ForEachImageInRegionGPU(as_id, gpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { @@ -1101,7 +1101,7 @@ ImageId TextureCache

::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, const bool native_bgr = runtime.HasNativeBgr(); const bool flexible_formats = True(options & RelaxedOptions::Format); ImageId image_id{}; - boost::container::small_vector image_ids; + boost::container::small_vector image_ids; const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { if (True(existing_image.flags & ImageFlagBits::Remapped)) { return false; @@ -1622,7 +1622,7 @@ ImageId TextureCache

::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) } } ImageId image_id{}; - boost::container::small_vector image_ids; + boost::container::small_vector image_ids; const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { if (True(existing_image.flags & ImageFlagBits::Remapped)) { return false; @@ -1942,7 +1942,7 @@ void TextureCache

::RegisterImage(ImageId image_id) { image.map_view_id = map_id; return; } - std::vector sparse_maps{}; + boost::container::small_vector sparse_maps; ForEachSparseSegment( image, [this, image_id, &sparse_maps](GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) { auto map_id = slot_map_views.insert(gpu_addr, cpu_addr, size, image_id); @@ -2217,7 +2217,7 @@ void TextureCache

::MarkModification(ImageBase& image) noexcept { template void TextureCache

::SynchronizeAliases(ImageId image_id) { - boost::container::small_vector aliased_images; + boost::container::small_vector aliased_images; Image& image = slot_images[image_id]; bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled); bool any_modified = True(image.flags & ImageFlagBits::GpuModified); diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 44232b961..e9ec91265 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -56,7 +56,7 @@ struct ImageViewInOut { struct AsyncDecodeContext { ImageId image_id; Common::ScratchBuffer decoded_data; - std::vector copies; + boost::container::small_vector copies; std::mutex mutex; std::atomic_bool complete; }; @@ -429,7 +429,7 @@ private: std::unordered_map, Common::IdentityHash> page_table; std::unordered_map, Common::IdentityHash> sparse_page_table; - std::unordered_map> sparse_views; + std::unordered_map> sparse_views; VAddr virtual_invalid_space{}; diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 95a5b47d8..f781cb7a0 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -329,13 +329,13 @@ template [[nodiscard]] std::optional ResolveOverlapRightAddress3D( const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) { - const std::vector slice_offsets = CalculateSliceOffsets(new_info); + const auto slice_offsets = CalculateSliceOffsets(new_info); const u32 diff = static_cast(overlap.gpu_addr - gpu_addr); const auto it = std::ranges::find(slice_offsets, diff); if (it == slice_offsets.end()) { return std::nullopt; } - const std::vector subresources = CalculateSliceSubresources(new_info); + const auto subresources = CalculateSliceSubresources(new_info); const SubresourceBase base = subresources[std::distance(slice_offsets.begin(), it)]; const ImageInfo& info = overlap.info; if (!IsBlockLinearSizeCompatible(new_info, info, base.level, 0, strict_size)) { @@ -655,9 +655,9 @@ LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept { return sizes; } -std::vector CalculateSliceOffsets(const ImageInfo& info) { +boost::container::small_vector CalculateSliceOffsets(const ImageInfo& info) { ASSERT(info.type == ImageType::e3D); - std::vector offsets; + boost::container::small_vector offsets; offsets.reserve(NumSlices(info)); const LevelInfo level_info = MakeLevelInfo(info); @@ -679,9 +679,10 @@ std::vector CalculateSliceOffsets(const ImageInfo& info) { return offsets; } -std::vector CalculateSliceSubresources(const ImageInfo& info) { +boost::container::small_vector CalculateSliceSubresources( + const ImageInfo& info) { ASSERT(info.type == ImageType::e3D); - std::vector subresources; + boost::container::small_vector subresources; subresources.reserve(NumSlices(info)); for (s32 level = 0; level < info.resources.levels; ++level) { const s32 depth = AdjustMipSize(info.size.depth, level); @@ -723,8 +724,10 @@ ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept { } } -std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src, - SubresourceBase base, u32 up_scale, u32 down_shift) { +boost::container::small_vector MakeShrinkImageCopies(const ImageInfo& dst, + const ImageInfo& src, + SubresourceBase base, + u32 up_scale, u32 down_shift) { ASSERT(dst.resources.levels >= src.resources.levels); const bool is_dst_3d = dst.type == ImageType::e3D; @@ -733,7 +736,7 @@ std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn ASSERT(src.resources.levels == 1); } const bool both_2d{src.type == ImageType::e2D && dst.type == ImageType::e2D}; - std::vector copies; + boost::container::small_vector copies; copies.reserve(src.resources.levels); for (s32 level = 0; level < src.resources.levels; ++level) { ImageCopy& copy = copies.emplace_back(); @@ -770,9 +773,10 @@ std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn return copies; } -std::vector MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale, - u32 down_shift) { - std::vector copies; +boost::container::small_vector MakeReinterpretImageCopies(const ImageInfo& src, + u32 up_scale, + u32 down_shift) { + boost::container::small_vector copies; copies.reserve(src.resources.levels); const bool is_3d = src.type == ImageType::e3D; for (s32 level = 0; level < src.resources.levels; ++level) { @@ -824,9 +828,11 @@ bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config return gpu_memory.GpuToCpuAddress(address, guest_size_bytes).has_value(); } -std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, - const ImageInfo& info, std::span input, - std::span output) { +boost::container::small_vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, + GPUVAddr gpu_addr, + const ImageInfo& info, + std::span input, + std::span output) { const size_t guest_size_bytes = input.size_bytes(); const u32 bpp_log2 = BytesPerBlockLog2(info.format); const Extent3D size = info.size; @@ -861,7 +867,7 @@ std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, GP info.tile_width_spacing); size_t guest_offset = 0; u32 host_offset = 0; - std::vector copies(num_levels); + boost::container::small_vector copies(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); @@ -978,7 +984,7 @@ void ConvertImage(std::span input, const ImageInfo& info, std::span FullDownloadCopies(const ImageInfo& info) { +boost::container::small_vector FullDownloadCopies(const ImageInfo& info) { const Extent3D size = info.size; const u32 bytes_per_block = BytesPerBlock(info.format); if (info.type == ImageType::Linear) { @@ -1006,7 +1012,7 @@ std::vector FullDownloadCopies(const ImageInfo& info) { u32 host_offset = 0; - std::vector copies(num_levels); + boost::container::small_vector copies(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); @@ -1042,10 +1048,10 @@ Extent3D MipBlockSize(const ImageInfo& info, u32 level) { return AdjustMipBlockSize(num_tiles, level_info.block, level); } -std::vector FullUploadSwizzles(const ImageInfo& info) { +boost::container::small_vector FullUploadSwizzles(const ImageInfo& info) { const Extent2D tile_size = DefaultBlockSize(info.format); if (info.type == ImageType::Linear) { - return std::vector{SwizzleParameters{ + return {SwizzleParameters{ .num_tiles = AdjustTileSize(info.size, tile_size), .block = {}, .buffer_offset = 0, @@ -1057,7 +1063,7 @@ std::vector FullUploadSwizzles(const ImageInfo& info) { const s32 num_levels = info.resources.levels; u32 guest_offset = 0; - std::vector params(num_levels); + boost::container::small_vector params(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index 84aa6880d..ab45a43c4 100644 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -5,6 +5,7 @@ #include #include +#include #include "common/common_types.h" #include "common/scratch_buffer.h" @@ -40,9 +41,10 @@ struct OverlapResult { [[nodiscard]] LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept; -[[nodiscard]] std::vector CalculateSliceOffsets(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector CalculateSliceOffsets(const ImageInfo& info); -[[nodiscard]] std::vector CalculateSliceSubresources(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector CalculateSliceSubresources( + const ImageInfo& info); [[nodiscard]] u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level); @@ -51,21 +53,18 @@ struct OverlapResult { [[nodiscard]] ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept; -[[nodiscard]] std::vector MakeShrinkImageCopies(const ImageInfo& dst, - const ImageInfo& src, - SubresourceBase base, u32 up_scale = 1, - u32 down_shift = 0); +[[nodiscard]] boost::container::small_vector MakeShrinkImageCopies( + const ImageInfo& dst, const ImageInfo& src, SubresourceBase base, u32 up_scale = 1, + u32 down_shift = 0); -[[nodiscard]] std::vector MakeReinterpretImageCopies(const ImageInfo& src, - u32 up_scale = 1, - u32 down_shift = 0); +[[nodiscard]] boost::container::small_vector MakeReinterpretImageCopies( + const ImageInfo& src, u32 up_scale = 1, u32 down_shift = 0); [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); -[[nodiscard]] std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, - GPUVAddr gpu_addr, const ImageInfo& info, - std::span input, - std::span output); +[[nodiscard]] boost::container::small_vector UnswizzleImage( + Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, + std::span input, std::span output); [[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageBase& image, std::span output); @@ -73,13 +72,15 @@ struct OverlapResult { void ConvertImage(std::span input, const ImageInfo& info, std::span output, std::span copies); -[[nodiscard]] std::vector FullDownloadCopies(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector FullDownloadCopies( + const ImageInfo& info); [[nodiscard]] Extent3D MipSize(Extent3D size, u32 level); [[nodiscard]] Extent3D MipBlockSize(const ImageInfo& info, u32 level); -[[nodiscard]] std::vector FullUploadSwizzles(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector FullUploadSwizzles( + const ImageInfo& info); void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, std::span copies, std::span memory, diff --git a/src/video_core/transform_feedback.cpp b/src/video_core/transform_feedback.cpp index 155599316..1f353d2df 100644 --- a/src/video_core/transform_feedback.cpp +++ b/src/video_core/transform_feedback.cpp @@ -13,7 +13,7 @@ namespace VideoCommon { -std::vector MakeTransformFeedbackVaryings( +std::pair, u32> MakeTransformFeedbackVaryings( const TransformFeedbackState& state) { static constexpr std::array VECTORS{ 28U, // gl_Position @@ -62,7 +62,8 @@ std::vector MakeTransformFeedbackVaryings( 216U, // gl_TexCoord[6] 220U, // gl_TexCoord[7] }; - std::vector xfb(256); + std::array xfb{}; + u32 count{0}; for (size_t buffer = 0; buffer < state.layouts.size(); ++buffer) { const auto& locations = state.varyings[buffer]; const auto& layout = state.layouts[buffer]; @@ -103,11 +104,12 @@ std::vector MakeTransformFeedbackVaryings( } } xfb[attribute] = varying; + count = std::max(count, attribute); highest = std::max(highest, (base_offset + varying.components) * 4); } UNIMPLEMENTED_IF(highest != layout.stride); } - return xfb; + return {xfb, count + 1}; } } // namespace VideoCommon diff --git a/src/video_core/transform_feedback.h b/src/video_core/transform_feedback.h index d13eb16c3..401b1352a 100644 --- a/src/video_core/transform_feedback.h +++ b/src/video_core/transform_feedback.h @@ -24,7 +24,7 @@ struct TransformFeedbackState { varyings; }; -std::vector MakeTransformFeedbackVaryings( +std::pair, u32> MakeTransformFeedbackVaryings( const TransformFeedbackState& state); } // namespace VideoCommon diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index fa9cde75b..b11abe311 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -316,6 +316,7 @@ NvidiaArchitecture GetNvidiaArchitecture(vk::PhysicalDevice physical, std::vector ExtensionListForVulkan( const std::set>& extensions) { std::vector output; + output.reserve(extensions.size()); for (const auto& extension : extensions) { output.push_back(extension.c_str()); } -- cgit v1.2.3 From 1586f1c0b174bec6b1db7de48b46ff75e29f3bb2 Mon Sep 17 00:00:00 2001 From: Liam Date: Tue, 20 Jun 2023 11:41:38 -0400 Subject: general: remove atomic signal and wait --- src/common/thread.h | 2 +- src/core/hle/service/nvnflinger/nvnflinger.cpp | 14 ++++---------- src/core/hle/service/nvnflinger/nvnflinger.h | 3 ++- src/core/hle/service/server_manager.cpp | 7 ++----- src/core/hle/service/server_manager.h | 4 ++-- .../renderer_vulkan/vk_master_semaphore.cpp | 22 +++++++++------------- .../renderer_vulkan/vk_master_semaphore.h | 1 + src/yuzu/bootmanager.cpp | 6 ++---- src/yuzu/bootmanager.h | 7 +++---- 9 files changed, 26 insertions(+), 40 deletions(-) (limited to 'src/video_core') diff --git a/src/common/thread.h b/src/common/thread.h index 8ae169b4e..c6976fb6c 100644 --- a/src/common/thread.h +++ b/src/common/thread.h @@ -55,7 +55,7 @@ public: is_set = false; } - [[nodiscard]] bool IsSet() { + [[nodiscard]] bool IsSet() const { return is_set; } diff --git a/src/core/hle/service/nvnflinger/nvnflinger.cpp b/src/core/hle/service/nvnflinger/nvnflinger.cpp index b41c6240c..5f55cd31e 100644 --- a/src/core/hle/service/nvnflinger/nvnflinger.cpp +++ b/src/core/hle/service/nvnflinger/nvnflinger.cpp @@ -43,14 +43,10 @@ void Nvnflinger::SplitVSync(std::stop_token stop_token) { Common::SetCurrentThreadPriority(Common::ThreadPriority::High); while (!stop_token.stop_requested()) { - vsync_signal.wait(false); - vsync_signal.store(false); - - guard->lock(); + vsync_signal.Wait(); + const auto lock_guard = Lock(); Compose(); - - guard->unlock(); } } @@ -69,9 +65,8 @@ Nvnflinger::Nvnflinger(Core::System& system_, HosBinderDriverServer& hos_binder_ "ScreenComposition", [this](std::uintptr_t, s64 time, std::chrono::nanoseconds ns_late) -> std::optional { - vsync_signal.store(true); { const auto lock_guard = Lock(); } - vsync_signal.notify_one(); + vsync_signal.Set(); return std::chrono::nanoseconds(GetNextTicks()); }); @@ -97,8 +92,7 @@ Nvnflinger::~Nvnflinger() { if (system.IsMulticore()) { system.CoreTiming().UnscheduleEvent(multi_composition_event, {}); vsync_thread.request_stop(); - vsync_signal.store(true); - vsync_signal.notify_all(); + vsync_signal.Set(); } else { system.CoreTiming().UnscheduleEvent(single_composition_event, {}); } diff --git a/src/core/hle/service/nvnflinger/nvnflinger.h b/src/core/hle/service/nvnflinger/nvnflinger.h index a043cceb2..ef236303a 100644 --- a/src/core/hle/service/nvnflinger/nvnflinger.h +++ b/src/core/hle/service/nvnflinger/nvnflinger.h @@ -12,6 +12,7 @@ #include "common/common_types.h" #include "common/polyfill_thread.h" +#include "common/thread.h" #include "core/hle/result.h" #include "core/hle/service/kernel_helpers.h" @@ -143,7 +144,7 @@ private: Core::System& system; - std::atomic vsync_signal; + Common::Event vsync_signal; std::jthread vsync_thread; diff --git a/src/core/hle/service/server_manager.cpp b/src/core/hle/service/server_manager.cpp index 156bc27d8..d1e99b184 100644 --- a/src/core/hle/service/server_manager.cpp +++ b/src/core/hle/service/server_manager.cpp @@ -44,7 +44,7 @@ ServerManager::~ServerManager() { m_event->Signal(); // Wait for processing to stop. - m_stopped.wait(false); + m_stopped.Wait(); m_threads.clear(); // Clean up ports. @@ -182,10 +182,7 @@ void ServerManager::StartAdditionalHostThreads(const char* name, size_t num_thre } Result ServerManager::LoopProcess() { - SCOPE_EXIT({ - m_stopped.store(true); - m_stopped.notify_all(); - }); + SCOPE_EXIT({ m_stopped.Set(); }); R_RETURN(this->LoopProcessImpl()); } diff --git a/src/core/hle/service/server_manager.h b/src/core/hle/service/server_manager.h index fdb8af2ff..58b0a0832 100644 --- a/src/core/hle/service/server_manager.h +++ b/src/core/hle/service/server_manager.h @@ -3,7 +3,6 @@ #pragma once -#include #include #include #include @@ -12,6 +11,7 @@ #include #include "common/polyfill_thread.h" +#include "common/thread.h" #include "core/hle/result.h" #include "core/hle/service/mutex.h" @@ -82,7 +82,7 @@ private: std::list m_deferrals{}; // Host state tracking - std::atomic m_stopped{}; + Common::Event m_stopped{}; std::vector m_threads{}; std::stop_source m_stop_source{}; }; diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index 5eeda08d2..6b288b994 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -75,15 +75,9 @@ void MasterSemaphore::Refresh() { void MasterSemaphore::Wait(u64 tick) { if (!semaphore) { - // If we don't support timeline semaphores, use an atomic wait - while (true) { - u64 current_value = gpu_tick.load(std::memory_order_relaxed); - if (current_value >= tick) { - return; - } - gpu_tick.wait(current_value); - } - + // If we don't support timeline semaphores, wait for the value normally + std::unique_lock lk{free_mutex}; + free_cv.wait(lk, [&] { return gpu_tick.load(std::memory_order_relaxed) >= tick; }); return; } @@ -198,11 +192,13 @@ void MasterSemaphore::WaitThread(std::stop_token token) { fence.Wait(); fence.Reset(); - gpu_tick.store(host_tick); - gpu_tick.notify_all(); - std::scoped_lock lock{free_mutex}; - free_queue.push_front(std::move(fence)); + { + std::scoped_lock lock{free_mutex}; + free_queue.push_front(std::move(fence)); + gpu_tick.store(host_tick); + } + free_cv.notify_one(); } } diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 1e7c90215..3f599d7bd 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -72,6 +72,7 @@ private: std::atomic current_tick{1}; ///< Current logical tick. std::mutex wait_mutex; std::mutex free_mutex; + std::condition_variable free_cv; std::condition_variable_any wait_cv; std::queue wait_queue; ///< Queue for the fences to be waited on by the wait thread. std::deque free_queue; ///< Holds available fences for submission. diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp index cc6b6a25a..bdd1497b5 100644 --- a/src/yuzu/bootmanager.cpp +++ b/src/yuzu/bootmanager.cpp @@ -105,14 +105,12 @@ void EmuThread::run() { std::unique_lock lk{m_should_run_mutex}; if (m_should_run) { m_system.Run(); - m_is_running.store(true); - m_is_running.notify_all(); + m_stopped.Reset(); Common::CondvarWait(m_should_run_cv, lk, stop_token, [&] { return !m_should_run; }); } else { m_system.Pause(); - m_is_running.store(false); - m_is_running.notify_all(); + m_stopped.Set(); EmulationPaused(lk); Common::CondvarWait(m_should_run_cv, lk, stop_token, [&] { return m_should_run; }); diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h index b7b9d4141..87b23df12 100644 --- a/src/yuzu/bootmanager.h +++ b/src/yuzu/bootmanager.h @@ -3,7 +3,6 @@ #pragma once -#include #include #include #include @@ -88,7 +87,7 @@ public: // Wait until paused, if pausing. if (!should_run) { - m_is_running.wait(true); + m_stopped.Wait(); } } @@ -97,7 +96,7 @@ public: * @return True if the emulation thread is running, otherwise false */ bool IsRunning() const { - return m_is_running.load() || m_should_run; + return m_should_run; } /** @@ -118,7 +117,7 @@ private: std::stop_source m_stop_source; std::mutex m_should_run_mutex; std::condition_variable_any m_should_run_cv; - std::atomic m_is_running{false}; + Common::Event m_stopped; bool m_should_run{true}; signals: -- cgit v1.2.3 From 75fb29e08e3891ecfc5d96d54603cda806ebd426 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Thu, 22 Jun 2023 20:03:12 +0300 Subject: vulkan_common: Remove required flags * Allows VMA to fallback to system RAM instead of crashing --- src/video_core/vulkan_common/vulkan_memory_allocator.cpp | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index 20d36680c..70db41343 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -59,20 +59,6 @@ struct Range { return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } -[[nodiscard]] VkMemoryPropertyFlags MemoryUsageRequiredVmaFlags(MemoryUsage usage) { - switch (usage) { - case MemoryUsage::DeviceLocal: - return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - case MemoryUsage::Upload: - case MemoryUsage::Stream: - return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - case MemoryUsage::Download: - return VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - } - ASSERT_MSG(false, "Invalid memory usage={}", usage); - return {}; -} - [[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferedVmaFlags(MemoryUsage usage) { return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT : VkMemoryPropertyFlagBits{}; @@ -259,7 +245,7 @@ vk::Buffer MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsa .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT | MemoryUsageVmaFlags(usage), .usage = MemoryUsageVma(usage), - .requiredFlags = MemoryUsageRequiredVmaFlags(usage), + .requiredFlags = 0, .preferredFlags = MemoryUsagePreferedVmaFlags(usage), .memoryTypeBits = 0, .pool = VK_NULL_HANDLE, -- cgit v1.2.3 From 82107b33a2251eb4f55ab2006a8fc0cb47cc39e8 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 25 Jun 2023 18:43:23 -0400 Subject: OpenGL: Add Local Memory warmup shader --- src/video_core/host_shaders/CMakeLists.txt | 1 + .../host_shaders/opengl_lmem_warmup.comp | 47 ++++++++++++++++++++++ src/video_core/renderer_opengl/gl_rasterizer.cpp | 2 + .../renderer_opengl/gl_shader_manager.cpp | 10 ++++- src/video_core/renderer_opengl/gl_shader_manager.h | 3 ++ 5 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 src/video_core/host_shaders/opengl_lmem_warmup.comp (limited to 'src/video_core') diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 2442c3c29..e61d9af80 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -33,6 +33,7 @@ set(SHADER_FILES opengl_fidelityfx_fsr.frag opengl_fidelityfx_fsr_easu.frag opengl_fidelityfx_fsr_rcas.frag + opengl_lmem_warmup.comp opengl_present.frag opengl_present.vert opengl_present_scaleforce.frag diff --git a/src/video_core/host_shaders/opengl_lmem_warmup.comp b/src/video_core/host_shaders/opengl_lmem_warmup.comp new file mode 100644 index 000000000..518268477 --- /dev/null +++ b/src/video_core/host_shaders/opengl_lmem_warmup.comp @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +// This shader is a workaround for a quirk in NVIDIA OpenGL drivers +// Shaders using local memory see a great performance benefit if a shader that was dispatched +// before it had more local memory allocated. +// This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that +// subsequent shaders see the performance boost. + +// NOTE: This shader does no actual meaningful work and returns immediately, +// it is simply a means to have the driver expect a shader using lots of local memory. + +#version 450 + +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +layout(location = 0) uniform uint uniform_data; + +layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image; + +#define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler +#define NUM_LMEM_CONSTANTS 1 +#define ARRAY_SIZE MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS + +uint lmem_0[ARRAY_SIZE]; +const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0)); + +void main() { + const uint global_id = gl_GlobalInvocationID.x; + if (global_id <= 128) { + // Since the shader is called with a dispatch of 1x1x1 + // This should always be the case, and this shader will not actually execute + return; + } + for (uint t = 0; t < uniform_data; t++) { + const uint offset = (t * uniform_data); + lmem_0[offset] = t; + } + const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x); + const uint value = lmem_0[offset]; + const uint const_value = constant_values[offset / 4][offset % 4]; + const uvec4 color = uvec4(value + const_value); + + // A "side-effect" is needed so the variables don't get optimized out, + // but this should never execute so there should be no clobbering of previously bound state. + imageStore(dest_image, ivec3(gl_GlobalInvocationID), color); +} diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index fc711c44a..d03288516 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -222,6 +222,7 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) { gpu.TickWork(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + program_manager.LocalMemoryWarmup(); pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); @@ -371,6 +372,7 @@ void RasterizerOpenGL::DispatchCompute() { if (!pipeline) { return; } + program_manager.LocalMemoryWarmup(); pipeline->SetEngine(kepler_compute, gpu_memory); pipeline->Configure(); const auto& qmd{kepler_compute->launch_description}; diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 98841ae65..2f6ba6823 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -3,7 +3,9 @@ #include +#include "video_core/host_shaders/opengl_lmem_warmup_comp.h" #include "video_core/renderer_opengl/gl_shader_manager.h" +#include "video_core/renderer_opengl/gl_shader_util.h" namespace OpenGL { @@ -12,7 +14,8 @@ static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{ GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, }; -ProgramManager::ProgramManager(const Device& device) { +ProgramManager::ProgramManager(const Device& device) + : lmem_warmup_program(CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER)) { glCreateProgramPipelines(1, &pipeline.handle); if (device.UseAssemblyShaders()) { glEnable(GL_COMPUTE_PROGRAM_NV); @@ -98,6 +101,11 @@ void ProgramManager::BindAssemblyPrograms(std::span current_programs{}; GLuint current_assembly_compute_program = 0; + OGLProgram lmem_warmup_program; }; } // namespace OpenGL -- cgit v1.2.3 From 405eae3734dd6bfb259df0afceecf4de1f1262ce Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 25 Jun 2023 18:59:33 -0400 Subject: shaders: Track local memory usage --- src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp | 4 ++++ src/shader_recompiler/shader_info.h | 1 + src/video_core/renderer_opengl/gl_compute_pipeline.cpp | 1 + src/video_core/renderer_opengl/gl_compute_pipeline.h | 5 +++++ src/video_core/renderer_opengl/gl_graphics_pipeline.cpp | 1 + src/video_core/renderer_opengl/gl_graphics_pipeline.h | 5 +++++ src/video_core/renderer_opengl/gl_rasterizer.cpp | 8 ++++++-- 7 files changed, 23 insertions(+), 2 deletions(-) (limited to 'src/video_core') diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp index 5a4195217..70292686f 100644 --- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp +++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp @@ -424,6 +424,10 @@ void VisitUsages(Info& info, IR::Inst& inst) { info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2; info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4; break; + case IR::Opcode::LoadLocal: + case IR::Opcode::WriteLocal: + info.uses_local_memory = true; + break; default: break; } diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h index d308db942..b4b4afd37 100644 --- a/src/shader_recompiler/shader_info.h +++ b/src/shader_recompiler/shader_info.h @@ -172,6 +172,7 @@ struct Info { bool stores_indexed_attributes{}; bool stores_global_memory{}; + bool uses_local_memory{}; bool uses_fp16{}; bool uses_fp64{}; diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 3151c0db8..f9ca55c36 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -63,6 +63,7 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac writes_global_memory = !use_storage_buffers && std::ranges::any_of(info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); + uses_local_memory = info.uses_local_memory; if (force_context_flush) { std::scoped_lock lock{built_mutex}; built_fence.Create(); diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.h b/src/video_core/renderer_opengl/gl_compute_pipeline.h index 9bcc72b59..c26b4fa5e 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.h +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.h @@ -59,6 +59,10 @@ public: return writes_global_memory; } + [[nodiscard]] bool UsesLocalMemory() const noexcept { + return uses_local_memory; + } + void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, Tegra::MemoryManager* gpu_memory_) { kepler_compute = kepler_compute_; @@ -84,6 +88,7 @@ private: bool use_storage_buffers{}; bool writes_global_memory{}; + bool uses_local_memory{}; std::mutex built_mutex; std::condition_variable built_condvar; diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index c58f760b8..23a48c6fe 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c writes_global_memory |= std::ranges::any_of( info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); + uses_local_memory |= info.uses_local_memory; } ASSERT(num_textures <= MAX_TEXTURES); ASSERT(num_images <= MAX_IMAGES); diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h index 7bab3be0a..7b3d7eae8 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h @@ -98,6 +98,10 @@ public: return writes_global_memory; } + [[nodiscard]] bool UsesLocalMemory() const noexcept { + return uses_local_memory; + } + [[nodiscard]] bool IsBuilt() noexcept; template @@ -146,6 +150,7 @@ private: bool use_storage_buffers{}; bool writes_global_memory{}; + bool uses_local_memory{}; static constexpr std::size_t XFB_ENTRY_STRIDE = 3; GLsizei num_xfb_attribs{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index d03288516..edf527f2d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -222,7 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) { gpu.TickWork(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; - program_manager.LocalMemoryWarmup(); + if (pipeline->UsesLocalMemory()) { + program_manager.LocalMemoryWarmup(); + } pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); @@ -372,7 +374,9 @@ void RasterizerOpenGL::DispatchCompute() { if (!pipeline) { return; } - program_manager.LocalMemoryWarmup(); + if (pipeline->UsesLocalMemory()) { + program_manager.LocalMemoryWarmup(); + } pipeline->SetEngine(kepler_compute, gpu_memory); pipeline->Configure(); const auto& qmd{kepler_compute->launch_description}; -- cgit v1.2.3 From 4f160633d369b702a45ace9b6ff133312761c5f8 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 25 Jun 2023 19:06:51 -0400 Subject: OpenGL: Limit lmem warmup to NVIDIA :frog: --- src/video_core/renderer_opengl/gl_device.cpp | 1 + src/video_core/renderer_opengl/gl_device.h | 5 +++++ src/video_core/renderer_opengl/gl_shader_manager.cpp | 13 +++++++++---- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 03d234f2f..33e63c17d 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -194,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { has_bool_ref_bug = true; } } + has_lmem_perf_bug = is_nvidia; strict_context_required = emu_window.StrictContextRequired(); // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index ad27264e5..a5a6bbbba 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -192,6 +192,10 @@ public: return supports_conditional_barriers; } + bool HasLmemPerfBug() const { + return has_lmem_perf_bug; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); @@ -238,6 +242,7 @@ private: bool can_report_memory{}; bool strict_context_required{}; bool supports_conditional_barriers{}; + bool has_lmem_perf_bug{}; std::string vendor_name; }; diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 2f6ba6823..03d4b9d06 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -14,12 +14,15 @@ static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{ GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, }; -ProgramManager::ProgramManager(const Device& device) - : lmem_warmup_program(CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER)) { +ProgramManager::ProgramManager(const Device& device) { glCreateProgramPipelines(1, &pipeline.handle); if (device.UseAssemblyShaders()) { glEnable(GL_COMPUTE_PROGRAM_NV); } + if (device.HasLmemPerfBug()) { + lmem_warmup_program = + CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER); + } } void ProgramManager::BindComputeProgram(GLuint program) { @@ -102,8 +105,10 @@ void ProgramManager::BindAssemblyPrograms(std::span Date: Wed, 21 Jun 2023 05:48:04 +0100 Subject: Use safe reads in DMA engine --- src/video_core/engines/maxwell_dma.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index bc1eb41e7..a290d6ea7 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -130,7 +130,7 @@ void MaxwellDMA::Launch() { UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { - memory_manager.ReadBlockUnsafe( + memory_manager.ReadBlock( convert_linear_2_blocklinear_addr(regs.offset_in + offset), read_buffer.data(), read_buffer.size()); memory_manager.WriteBlockCached(regs.offset_out + offset, read_buffer.data(), @@ -142,8 +142,8 @@ void MaxwellDMA::Launch() { UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { - memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), - read_buffer.size()); + memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), + read_buffer.size()); memory_manager.WriteBlockCached( convert_linear_2_blocklinear_addr(regs.offset_out + offset), read_buffer.data(), read_buffer.size()); @@ -151,8 +151,9 @@ void MaxwellDMA::Launch() { } else { if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { read_buffer.resize_destructive(regs.line_length_in); - memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), - regs.line_length_in); + memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), + regs.line_length_in, + VideoCommon::CacheType::NoBufferCache); memory_manager.WriteBlockCached(regs.offset_out, read_buffer.data(), regs.line_length_in); } -- cgit v1.2.3 From b6c6dcc5760ebaf08460c176c42d1c4729e2eb21 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Sun, 25 Jun 2023 15:08:38 +0300 Subject: externals: Use cmake subdirectory --- .gitmodules | 4 ++-- externals/CMakeLists.txt | 2 +- externals/vma/VulkanMemoryAllocator | 1 + externals/vma/vma | 1 - externals/vma/vma.cpp | 1 + src/video_core/vulkan_common/vulkan_device.cpp | 2 -- src/video_core/vulkan_common/vulkan_memory_allocator.cpp | 2 -- src/video_core/vulkan_common/vulkan_wrapper.cpp | 2 -- 8 files changed, 5 insertions(+), 10 deletions(-) create mode 160000 externals/vma/VulkanMemoryAllocator delete mode 160000 externals/vma/vma (limited to 'src/video_core') diff --git a/.gitmodules b/.gitmodules index cc0e97a85..5a8169b44 100644 --- a/.gitmodules +++ b/.gitmodules @@ -55,6 +55,6 @@ [submodule "tzdb_to_nx"] path = externals/nx_tzdb/tzdb_to_nx url = https://github.com/lat9nq/tzdb_to_nx.git -[submodule "externals/vma/vma"] - path = externals/vma/vma +[submodule "VulkanMemoryAllocator"] + path = externals/vma/VulkanMemoryAllocator url = https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index ca4ebe4b9..0184289eb 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -145,7 +145,7 @@ add_subdirectory(nx_tzdb) # VMA add_library(vma vma/vma.cpp) -target_include_directories(vma PUBLIC ./vma/vma/include) +target_include_directories(vma PUBLIC ./vma/VulkanMemoryAllocator/include) target_link_libraries(vma PRIVATE Vulkan::Headers) if (NOT TARGET LLVM::Demangle) diff --git a/externals/vma/VulkanMemoryAllocator b/externals/vma/VulkanMemoryAllocator new file mode 160000 index 000000000..0aa3989b8 --- /dev/null +++ b/externals/vma/VulkanMemoryAllocator @@ -0,0 +1 @@ +Subproject commit 0aa3989b8f382f185fdf646cc83a1d16fa31d6ab diff --git a/externals/vma/vma b/externals/vma/vma deleted file mode 160000 index 0aa3989b8..000000000 --- a/externals/vma/vma +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0aa3989b8f382f185fdf646cc83a1d16fa31d6ab diff --git a/externals/vma/vma.cpp b/externals/vma/vma.cpp index ff1acc320..1fe2cf52b 100644 --- a/externals/vma/vma.cpp +++ b/externals/vma/vma.cpp @@ -4,4 +4,5 @@ #define VMA_IMPLEMENTATION #define VMA_STATIC_VULKAN_FUNCTIONS 0 #define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 + #include \ No newline at end of file diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 94dd1aa14..31226084f 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -22,8 +22,6 @@ #include #endif -#define VMA_STATIC_VULKAN_FUNCTIONS 0 -#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 #include namespace Vulkan { diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index 70db41343..a2ef0efa4 100644 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -15,8 +15,6 @@ #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" -#define VMA_STATIC_VULKAN_FUNCTIONS 0 -#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 #include namespace Vulkan { diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index c01a9478e..28fcb21a0 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -12,8 +12,6 @@ #include "video_core/vulkan_common/vulkan_wrapper.h" -#define VMA_STATIC_VULKAN_FUNCTIONS 0 -#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 #include namespace Vulkan::vk { -- cgit v1.2.3 From 72e7f5b4dd08e9ea46ee049d8f2564a8808273d4 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Sat, 3 Jun 2023 09:21:52 +0300 Subject: renderer_vulkan: Add suport for debug report callback --- src/video_core/renderer_vulkan/renderer_vulkan.cpp | 18 ++++++- src/video_core/renderer_vulkan/renderer_vulkan.h | 5 +- .../vulkan_common/vulkan_debug_callback.cpp | 40 ++++++++++++--- .../vulkan_common/vulkan_debug_callback.h | 4 +- src/video_core/vulkan_common/vulkan_device.cpp | 2 +- src/video_core/vulkan_common/vulkan_instance.cpp | 58 ++++++++++++---------- src/video_core/vulkan_common/vulkan_wrapper.cpp | 14 ++++++ src/video_core/vulkan_common/vulkan_wrapper.h | 9 ++++ 8 files changed, 113 insertions(+), 37 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index ddf28ca28..454bb66a4 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -12,6 +12,7 @@ #include #include "common/logging/log.h" +#include "common/polyfill_ranges.h" #include "common/scope_exit.h" #include "common/settings.h" #include "common/telemetry.h" @@ -65,6 +66,21 @@ std::string BuildCommaSeparatedExtensions( return fmt::format("{}", fmt::join(available_extensions, ",")); } +DebugCallback MakeDebugCallback(const vk::Instance& instance, const vk::InstanceDispatch& dld) { + if (!Settings::values.renderer_debug) { + return DebugCallback{}; + } + const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); + const auto it = std::ranges::find_if(*properties, [](const auto& prop) { + return std::strcmp(VK_EXT_DEBUG_UTILS_EXTENSION_NAME, prop.extensionName) == 0; + }); + if (it != properties->end()) { + return CreateDebugUtilsCallback(instance); + } else { + return CreateDebugReportCallback(instance); + } +} + } // Anonymous namespace Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, @@ -87,7 +103,7 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, cpu_memory(cpu_memory_), gpu(gpu_), library(OpenLibrary(context.get())), instance(CreateInstance(*library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, Settings::values.renderer_debug.GetValue())), - debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), + debug_callback(MakeDebugCallback(instance, dld)), surface(CreateSurface(instance, render_window.GetWindowInfo())), device(CreateDevice(instance, dld, *surface)), memory_allocator(device), state_tracker(), scheduler(device, state_tracker), diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index b2e8cbd1b..ca22c0baa 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -5,6 +5,7 @@ #include #include +#include #include "common/dynamic_library.h" #include "video_core/renderer_base.h" @@ -33,6 +34,8 @@ class GPU; namespace Vulkan { +using DebugCallback = std::variant; + Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, VkSurfaceKHR surface); @@ -71,7 +74,7 @@ private: vk::InstanceDispatch dld; vk::Instance instance; - vk::DebugUtilsMessenger debug_callback; + DebugCallback debug_callback; vk::SurfaceKHR surface; ScreenInfo screen_info; diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp index 9de484c29..67e8065a4 100644 --- a/src/video_core/vulkan_common/vulkan_debug_callback.cpp +++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp @@ -7,10 +7,10 @@ namespace Vulkan { namespace { -VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, - VkDebugUtilsMessageTypeFlagsEXT type, - const VkDebugUtilsMessengerCallbackDataEXT* data, - [[maybe_unused]] void* user_data) { +VkBool32 DebugUtilCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, + VkDebugUtilsMessageTypeFlagsEXT type, + const VkDebugUtilsMessengerCallbackDataEXT* data, + [[maybe_unused]] void* user_data) { // Skip logging known false-positive validation errors switch (static_cast(data->messageIdNumber)) { #ifdef ANDROID @@ -62,9 +62,26 @@ VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, } return VK_FALSE; } + +VkBool32 DebugReportCallback(VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, + uint64_t object, size_t location, int32_t messageCode, + const char* pLayerPrefix, const char* pMessage, void* pUserData) { + const VkDebugReportFlagBitsEXT severity = static_cast(flags); + const std::string_view message{pMessage}; + if (severity & VK_DEBUG_REPORT_ERROR_BIT_EXT) { + LOG_CRITICAL(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_WARNING_BIT_EXT) { + LOG_WARNING(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { + LOG_INFO(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { + LOG_DEBUG(Render_Vulkan, "{}", message); + } + return VK_FALSE; +} } // Anonymous namespace -vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance) { +vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance) { return instance.CreateDebugUtilsMessenger(VkDebugUtilsMessengerCreateInfoEXT{ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, .pNext = nullptr, @@ -76,7 +93,18 @@ vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance) { .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, - .pfnUserCallback = Callback, + .pfnUserCallback = DebugUtilCallback, + .pUserData = nullptr, + }); +} + +vk::DebugReportCallback CreateDebugReportCallback(const vk::Instance& instance) { + return instance.CreateDebugReportCallback({ + .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + .pNext = nullptr, + .flags = VK_DEBUG_REPORT_DEBUG_BIT_EXT | VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT, + .pfnCallback = DebugReportCallback, .pUserData = nullptr, }); } diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.h b/src/video_core/vulkan_common/vulkan_debug_callback.h index 71b1f69ec..a8af7b406 100644 --- a/src/video_core/vulkan_common/vulkan_debug_callback.h +++ b/src/video_core/vulkan_common/vulkan_debug_callback.h @@ -7,6 +7,8 @@ namespace Vulkan { -vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance); +vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance); + +vk::DebugReportCallback CreateDebugReportCallback(const vk::Instance& instance); } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index e4ca65b58..9743a82f5 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -349,7 +349,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR const bool is_s8gen2 = device_id == 0x43050a01; const bool is_arm = driver_id == VK_DRIVER_ID_ARM_PROPRIETARY; - if ((is_mvk || is_qualcomm || is_turnip) && !is_suitable) { + if ((is_mvk || is_qualcomm || is_turnip || is_arm) && !is_suitable) { LOG_WARNING(Render_Vulkan, "Unsuitable driver, continuing anyway"); } else if (!is_suitable) { throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER); diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp index b6d83e446..7624a9b32 100644 --- a/src/video_core/vulkan_common/vulkan_instance.cpp +++ b/src/video_core/vulkan_common/vulkan_instance.cpp @@ -31,10 +31,34 @@ namespace Vulkan { namespace { + +[[nodiscard]] bool AreExtensionsSupported(const vk::InstanceDispatch& dld, + std::span extensions) { + const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); + if (!properties) { + LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); + return false; + } + for (const char* extension : extensions) { + const auto it = std::ranges::find_if(*properties, [extension](const auto& prop) { + return std::strcmp(extension, prop.extensionName) == 0; + }); + if (it == properties->end()) { + LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); + return false; + } + } + return true; +} + [[nodiscard]] std::vector RequiredExtensions( - Core::Frontend::WindowSystemType window_type, bool enable_validation) { + const vk::InstanceDispatch& dld, Core::Frontend::WindowSystemType window_type, + bool enable_validation) { std::vector extensions; extensions.reserve(6); +#ifdef __APPLE__ + extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); +#endif switch (window_type) { case Core::Frontend::WindowSystemType::Headless: break; @@ -66,35 +90,14 @@ namespace { extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); } if (enable_validation) { - extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + const bool debug_utils = + AreExtensionsSupported(dld, std::array{VK_EXT_DEBUG_UTILS_EXTENSION_NAME}); + extensions.push_back(debug_utils ? VK_EXT_DEBUG_UTILS_EXTENSION_NAME + : VK_EXT_DEBUG_REPORT_EXTENSION_NAME); } - extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); - -#ifdef __APPLE__ - extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); -#endif return extensions; } -[[nodiscard]] bool AreExtensionsSupported(const vk::InstanceDispatch& dld, - std::span extensions) { - const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); - if (!properties) { - LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); - return false; - } - for (const char* extension : extensions) { - const auto it = std::ranges::find_if(*properties, [extension](const auto& prop) { - return std::strcmp(extension, prop.extensionName) == 0; - }); - if (it == properties->end()) { - LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); - return false; - } - } - return true; -} - [[nodiscard]] std::vector Layers(bool enable_validation) { std::vector layers; if (enable_validation) { @@ -138,7 +141,8 @@ vk::Instance CreateInstance(const Common::DynamicLibrary& library, vk::InstanceD LOG_ERROR(Render_Vulkan, "Failed to load Vulkan function pointers"); throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); } - const std::vector extensions = RequiredExtensions(window_type, enable_validation); + const std::vector extensions = + RequiredExtensions(dld, window_type, enable_validation); if (!AreExtensionsSupported(dld, extensions)) { throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 28fcb21a0..2fa29793a 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -259,7 +259,9 @@ bool Load(VkInstance instance, InstanceDispatch& dld) noexcept { // These functions may fail to load depending on the enabled extensions. // Don't return a failure on these. X(vkCreateDebugUtilsMessengerEXT); + X(vkCreateDebugReportCallbackEXT); X(vkDestroyDebugUtilsMessengerEXT); + X(vkDestroyDebugReportCallbackEXT); X(vkDestroySurfaceKHR); X(vkGetPhysicalDeviceFeatures2); X(vkGetPhysicalDeviceProperties2); @@ -481,6 +483,11 @@ void Destroy(VkInstance instance, VkDebugUtilsMessengerEXT handle, dld.vkDestroyDebugUtilsMessengerEXT(instance, handle, nullptr); } +void Destroy(VkInstance instance, VkDebugReportCallbackEXT handle, + const InstanceDispatch& dld) noexcept { + dld.vkDestroyDebugReportCallbackEXT(instance, handle, nullptr); +} + void Destroy(VkInstance instance, VkSurfaceKHR handle, const InstanceDispatch& dld) noexcept { dld.vkDestroySurfaceKHR(instance, handle, nullptr); } @@ -549,6 +556,13 @@ DebugUtilsMessenger Instance::CreateDebugUtilsMessenger( return DebugUtilsMessenger(object, handle, *dld); } +DebugReportCallback Instance::CreateDebugReportCallback( + const VkDebugReportCallbackCreateInfoEXT& create_info) const { + VkDebugReportCallbackEXT object; + Check(dld->vkCreateDebugReportCallbackEXT(handle, &create_info, nullptr, &object)); + return DebugReportCallback(object, handle, *dld); +} + void Image::SetObjectNameEXT(const char* name) const { SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE, name); } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 44fce47a5..b5e70fcd4 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -164,8 +164,10 @@ struct InstanceDispatch { PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties{}; PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT{}; + PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT{}; PFN_vkCreateDevice vkCreateDevice{}; PFN_vkDestroyDebugUtilsMessengerEXT vkDestroyDebugUtilsMessengerEXT{}; + PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT{}; PFN_vkDestroyDevice vkDestroyDevice{}; PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR{}; PFN_vkEnumerateDeviceExtensionProperties vkEnumerateDeviceExtensionProperties{}; @@ -366,6 +368,7 @@ void Destroy(VkDevice, VkSwapchainKHR, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkSemaphore, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkShaderModule, const DeviceDispatch&) noexcept; void Destroy(VkInstance, VkDebugUtilsMessengerEXT, const InstanceDispatch&) noexcept; +void Destroy(VkInstance, VkDebugReportCallbackEXT, const InstanceDispatch&) noexcept; void Destroy(VkInstance, VkSurfaceKHR, const InstanceDispatch&) noexcept; VkResult Free(VkDevice, VkDescriptorPool, Span, const DeviceDispatch&) noexcept; @@ -581,6 +584,7 @@ private: }; using DebugUtilsMessenger = Handle; +using DebugReportCallback = Handle; using DescriptorSetLayout = Handle; using DescriptorUpdateTemplate = Handle; using Pipeline = Handle; @@ -613,6 +617,11 @@ public: DebugUtilsMessenger CreateDebugUtilsMessenger( const VkDebugUtilsMessengerCreateInfoEXT& create_info) const; + /// Creates a debug report callback. + /// @throw Exception on creation failure. + DebugReportCallback CreateDebugReportCallback( + const VkDebugReportCallbackCreateInfoEXT& create_info) const; + /// Returns dispatch table. const InstanceDispatch& Dispatch() const noexcept { return *dld; -- cgit v1.2.3 From a9b44d37e101c646b00ca73dffa06edcb7627dcd Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 18:58:21 +0300 Subject: renderer_vulkan: Don't add transform feedback flag if unsupported --- src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 3 ++- .../renderer_vulkan/vk_staging_buffer_pool.cpp | 16 ++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 660f7c9ff..b72f95235 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -590,7 +590,8 @@ void BufferCacheRuntime::ReserveNullBuffer() { .pNext = nullptr, .flags = 0, .size = 4, - .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 62b251a9b..ce92f66ab 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -38,18 +38,20 @@ size_t Region(size_t iterator) noexcept { StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, Scheduler& scheduler_) : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { - const VkBufferCreateInfo stream_ci = { + VkBufferCreateInfo stream_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, .size = STREAM_BUFFER_SIZE, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT, + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; + if (device.IsExtTransformFeedbackSupported()) { + stream_ci.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; + } stream_buffer = memory_allocator.CreateBuffer(stream_ci, MemoryUsage::Stream); if (device.HasDebuggingToolAttached()) { stream_buffer.SetObjectNameEXT("Stream Buffer"); @@ -164,19 +166,21 @@ std::optional StagingBufferPool::TryGetReservedBuffer(size_t s StagingBufferRef StagingBufferPool::CreateStagingBuffer(size_t size, MemoryUsage usage, bool deferred) { const u32 log2 = Common::Log2Ceil64(size); - const VkBufferCreateInfo buffer_ci = { + VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, .size = 1ULL << log2, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT, + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; + if (device.IsExtTransformFeedbackSupported()) { + buffer_ci.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; + } vk::Buffer buffer = memory_allocator.CreateBuffer(buffer_ci, usage); if (device.HasDebuggingToolAttached()) { ++buffer_index; -- cgit v1.2.3 From c339af37a73de144fbbad706e43aefe278640cd7 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 19:03:16 +0300 Subject: renderer_vulkan: Respect viewport limit --- src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp | 5 +++-- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 16 ++++++++++++---- src/video_core/vulkan_common/vulkan_device.h | 4 ++++ 3 files changed, 19 insertions(+), 6 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index c1595642e..ad35cacac 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -652,13 +652,14 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) { .pNext = nullptr, .negativeOneToOne = key.state.ndc_minus_one_to_one.Value() != 0 ? VK_TRUE : VK_FALSE, }; + const u32 num_viewports = std::min(device.GetMaxViewports(), Maxwell::NumViewports); VkPipelineViewportStateCreateInfo viewport_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .viewportCount = Maxwell::NumViewports, + .viewportCount = num_viewports, .pViewports = nullptr, - .scissorCount = Maxwell::NumViewports, + .scissorCount = num_viewports, .pScissors = nullptr, }; if (device.IsNvViewportSwizzleSupported()) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 84e3a30cc..268b955fb 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -925,7 +925,7 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg } const bool is_rescaling{texture_cache.IsRescaling()}; const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f; - const std::array viewports{ + const std::array viewport_list{ GetViewportState(device, regs, 0, scale), GetViewportState(device, regs, 1, scale), GetViewportState(device, regs, 2, scale), GetViewportState(device, regs, 3, scale), GetViewportState(device, regs, 4, scale), GetViewportState(device, regs, 5, scale), @@ -935,7 +935,11 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg GetViewportState(device, regs, 12, scale), GetViewportState(device, regs, 13, scale), GetViewportState(device, regs, 14, scale), GetViewportState(device, regs, 15, scale), }; - scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); + scheduler.Record([this, viewport_list](vk::CommandBuffer cmdbuf) { + const u32 num_viewports = std::min(device.GetMaxViewports(), Maxwell::NumViewports); + const vk::Span viewports(viewport_list.data(), num_viewports); + cmdbuf.SetViewport(0, viewports); + }); } void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs) { @@ -948,7 +952,7 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs up_scale = Settings::values.resolution_info.up_scale; down_shift = Settings::values.resolution_info.down_shift; } - const std::array scissors{ + const std::array scissor_list{ GetScissorState(regs, 0, up_scale, down_shift), GetScissorState(regs, 1, up_scale, down_shift), GetScissorState(regs, 2, up_scale, down_shift), @@ -966,7 +970,11 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs GetScissorState(regs, 14, up_scale, down_shift), GetScissorState(regs, 15, up_scale, down_shift), }; - scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); + scheduler.Record([this, scissor_list](vk::CommandBuffer cmdbuf) { + const u32 num_scissors = std::min(device.GetMaxViewports(), Maxwell::NumViewports); + const vk::Span scissors(scissor_list.data(), num_scissors); + cmdbuf.SetScissor(0, scissors); + }); } void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs) { diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index b84af3dfb..8a05a4fab 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -588,6 +588,10 @@ public: return properties.properties.limits.maxVertexInputBindings; } + u32 GetMaxViewports() const { + return properties.properties.limits.maxViewports; + } + bool SupportsConditionalBarriers() const { return supports_conditional_barriers; } -- cgit v1.2.3 From 1522b956583aab463bd1576652d1794fe989437d Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 19:04:22 +0300 Subject: renderer_vulkan: Bump minimum SPIRV version * 1.3 is guaranteed on all 1.1 drivers --- src/video_core/vulkan_common/vulkan_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/video_core') diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 8a05a4fab..13ca24ef5 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -518,7 +518,7 @@ public: if (extensions.spirv_1_4) { return 0x00010400U; } - return 0x00010000U; + return 0x00010300U; } /// Returns true when a known debugging tool is attached. -- cgit v1.2.3 From 220a42896d350399f1f2d77432aa571ade3c9cdd Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 19:07:05 +0300 Subject: renderer_vulkan: Don't assume debug tool with debug renderer * Causes crashes because mali drivers don't support debug utils --- src/video_core/vulkan_common/vulkan_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/video_core') diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 13ca24ef5..7be631122 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -523,7 +523,7 @@ public: /// Returns true when a known debugging tool is attached. bool HasDebuggingToolAttached() const { - return has_renderdoc || has_nsight_graphics || Settings::values.renderer_debug.GetValue(); + return has_renderdoc || has_nsight_graphics; } /// @returns True if compute pipelines can cause crashing. -- cgit v1.2.3 From b8c96cee5f2eb0bd5ba9ef46746daec78ee3bb44 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 19:27:36 +0300 Subject: renderer_vulkan: Add more feature checking --- src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 7 ++++--- src/video_core/vulkan_common/vulkan_device.cpp | 4 ++++ src/video_core/vulkan_common/vulkan_device.h | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 9f316113c..d600c4e61 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -309,7 +309,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .support_int16 = device.IsShaderInt16Supported(), .support_int64 = device.IsShaderInt64Supported(), .support_vertex_instance_id = false, - .support_float_controls = true, + .support_float_controls = device.IsKhrShaderFloatControlsSupported(), .support_separate_denorm_behavior = float_control.denormBehaviorIndependence == VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, .support_separate_rounding_mode = @@ -325,12 +325,13 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .support_fp64_signed_zero_nan_preserve = float_control.shaderSignedZeroInfNanPreserveFloat64 != VK_FALSE, .support_explicit_workgroup_layout = device.IsKhrWorkgroupMemoryExplicitLayoutSupported(), - .support_vote = true, + .support_vote = device.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_VOTE_BIT), .support_viewport_index_layer_non_geometry = device.IsExtShaderViewportIndexLayerSupported(), .support_viewport_mask = device.IsNvViewportArray2Supported(), .support_typeless_image_loads = device.IsFormatlessImageLoadSupported(), - .support_demote_to_helper_invocation = true, + .support_demote_to_helper_invocation = + device.IsExtShaderDemoteToHelperInvocationSupported(), .support_int64_atomics = device.IsExtShaderAtomicInt64Supported(), .support_derivative_control = true, .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(), diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 9743a82f5..70436cf1c 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -905,6 +905,10 @@ bool Device::GetSuitability(bool requires_swapchain) { properties.driver.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES; SetNext(next, properties.driver); + // Retrieve subgroup properties. + properties.subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + SetNext(next, properties.subgroup_properties); + // Retrieve relevant extension properties. if (extensions.shader_float_controls) { properties.float_controls.sType = diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 7be631122..e05d04db3 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -323,6 +323,11 @@ public: return properties.subgroup_size_control.requiredSubgroupSizeStages & stage; } + /// Returns true if the device supports the provided subgroup feature. + bool IsSubgroupFeatureSupported(VkSubgroupFeatureFlagBits feature) const { + return properties.subgroup_properties.supportedOperations & feature; + } + /// Returns the maximum number of push descriptors. u32 MaxPushDescriptors() const { return properties.push_descriptor.maxPushDescriptors; @@ -388,6 +393,11 @@ public: return extensions.swapchain_mutable_format; } + /// Returns true if VK_KHR_shader_float_controls is enabled. + bool IsKhrShaderFloatControlsSupported() const { + return extensions.shader_float_controls; + } + /// Returns true if the device supports VK_KHR_workgroup_memory_explicit_layout. bool IsKhrWorkgroupMemoryExplicitLayoutSupported() const { return extensions.workgroup_memory_explicit_layout; @@ -487,6 +497,11 @@ public: return extensions.shader_stencil_export; } + /// Returns true if the device supports VK_EXT_shader_demote_to_helper_invocation + bool IsExtShaderDemoteToHelperInvocationSupported() const { + return extensions.shader_demote_to_helper_invocation; + } + /// Returns true if the device supports VK_EXT_conservative_rasterization. bool IsExtConservativeRasterizationSupported() const { return extensions.conservative_rasterization; @@ -684,6 +699,7 @@ private: struct Properties { VkPhysicalDeviceDriverProperties driver{}; + VkPhysicalDeviceSubgroupProperties subgroup_properties{}; VkPhysicalDeviceFloatControlsProperties float_controls{}; VkPhysicalDevicePushDescriptorPropertiesKHR push_descriptor{}; VkPhysicalDeviceSubgroupSizeControlProperties subgroup_size_control{}; -- cgit v1.2.3 From eac46ad7ceca5e35b396a8b80bfc38dc6ef1a4fe Mon Sep 17 00:00:00 2001 From: GPUCode Date: Tue, 6 Jun 2023 23:10:06 +0300 Subject: video_core: Add BCn decoding support --- externals/CMakeLists.txt | 3 + externals/bc_decoder/bc_decoder.cpp | 1522 ++++++++++++++++++++ externals/bc_decoder/bc_decoder.h | 43 + src/video_core/CMakeLists.txt | 6 +- src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 20 + src/video_core/renderer_vulkan/vk_rasterizer.cpp | 7 + .../renderer_vulkan/vk_texture_cache.cpp | 4 + src/video_core/surface.cpp | 22 + src/video_core/surface.h | 2 + src/video_core/texture_cache/decode_bc.cpp | 129 ++ src/video_core/texture_cache/decode_bc.h | 19 + src/video_core/texture_cache/decode_bc4.cpp | 96 -- src/video_core/texture_cache/decode_bc4.h | 15 - src/video_core/texture_cache/util.cpp | 24 +- src/video_core/textures/bcn.cpp | 1 - src/video_core/textures/bcn.h | 9 +- src/video_core/vulkan_common/vulkan_device.h | 15 +- 17 files changed, 1803 insertions(+), 134 deletions(-) create mode 100644 externals/bc_decoder/bc_decoder.cpp create mode 100644 externals/bc_decoder/bc_decoder.h create mode 100644 src/video_core/texture_cache/decode_bc.cpp create mode 100644 src/video_core/texture_cache/decode_bc.h delete mode 100644 src/video_core/texture_cache/decode_bc4.cpp delete mode 100644 src/video_core/texture_cache/decode_bc4.h (limited to 'src/video_core') diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 0184289eb..4ff588851 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -157,6 +157,9 @@ endif() add_library(stb stb/stb_dxt.cpp) target_include_directories(stb PUBLIC ./stb) +add_library(bc_decoder bc_decoder/bc_decoder.cpp) +target_include_directories(bc_decoder PUBLIC ./bc_decoder) + if (ANDROID) if (ARCHITECTURE_arm64) add_subdirectory(libadrenotools) diff --git a/externals/bc_decoder/bc_decoder.cpp b/externals/bc_decoder/bc_decoder.cpp new file mode 100644 index 000000000..536c44f34 --- /dev/null +++ b/externals/bc_decoder/bc_decoder.cpp @@ -0,0 +1,1522 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) +// Copyright 2019 The SwiftShader Authors. All Rights Reserved. + +// This BCn Decoder is directly derivative of Swiftshader's BCn Decoder found at: https://github.com/google/swiftshader/blob/d070309f7d154d6764cbd514b1a5c8bfcef61d06/src/Device/BC_Decoder.cpp +// This file does not follow the Skyline code conventions but has certain Skyline specific code +// There are a lot of implicit and narrowing conversions in this file due to this (Warnings are disabled as a result) + +#include +#include +#include +#include + +namespace { + constexpr int BlockWidth = 4; + constexpr int BlockHeight = 4; + + struct BC_color { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, bool hasAlphaChannel, bool hasSeparateAlpha) const { + Color c[4]; + c[0].extract565(c0); + c[1].extract565(c1); + if (hasSeparateAlpha || (c0 > c1)) { + c[2] = ((c[0] * 2) + c[1]) / 3; + c[3] = ((c[1] * 2) + c[0]) / 3; + } else { + c[2] = (c[0] + c[1]) >> 1; + if (hasAlphaChannel) { + c[3].clearAlpha(); + } + } + + for (int j = 0; j < BlockHeight && (y + j) < dstH; j++) { + size_t dstOffset = j * dstPitch; + size_t idxOffset = j * BlockHeight; + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, idxOffset++, dstOffset += dstBpp) { + *reinterpret_cast(dst + dstOffset) = c[getIdx(idxOffset)].pack8888(); + } + } + } + + private: + struct Color { + Color() { + c[0] = c[1] = c[2] = 0; + c[3] = 0xFF000000; + } + + void extract565(const unsigned int c565) { + c[0] = ((c565 & 0x0000001F) << 3) | ((c565 & 0x0000001C) >> 2); + c[1] = ((c565 & 0x000007E0) >> 3) | ((c565 & 0x00000600) >> 9); + c[2] = ((c565 & 0x0000F800) >> 8) | ((c565 & 0x0000E000) >> 13); + } + + unsigned int pack8888() const { + return ((c[0] & 0xFF) << 16) | ((c[1] & 0xFF) << 8) | (c[2] & 0xFF) | c[3]; + } + + void clearAlpha() { + c[3] = 0; + } + + Color operator*(int factor) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] * factor; + } + return res; + } + + Color operator/(int factor) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] / factor; + } + return res; + } + + Color operator>>(int shift) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] >> shift; + } + return res; + } + + Color operator+(Color const &obj) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] + obj.c[i]; + } + return res; + } + + private: + int c[4]; + }; + + size_t getIdx(int i) const { + size_t offset = i << 1; // 2 bytes per index + return (idx & (0x3 << offset)) >> offset; + } + + unsigned short c0; + unsigned short c1; + unsigned int idx; + }; + static_assert(sizeof(BC_color) == 8, "BC_color must be 8 bytes"); + + struct BC_channel { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, size_t channel, bool isSigned) const { + int c[8] = {0}; + + if (isSigned) { + c[0] = static_cast(data & 0xFF); + c[1] = static_cast((data & 0xFF00) >> 8); + } else { + c[0] = static_cast(data & 0xFF); + c[1] = static_cast((data & 0xFF00) >> 8); + } + + if (c[0] > c[1]) { + for (int i = 2; i < 8; ++i) { + c[i] = ((8 - i) * c[0] + (i - 1) * c[1]) / 7; + } + } else { + for (int i = 2; i < 6; ++i) { + c[i] = ((6 - i) * c[0] + (i - 1) * c[1]) / 5; + } + c[6] = isSigned ? -128 : 0; + c[7] = isSigned ? 127 : 255; + } + + for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++) { + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++) { + dst[channel + (i * dstBpp) + (j * dstPitch)] = static_cast(c[getIdx((j * BlockHeight) + i)]); + } + } + } + + private: + uint8_t getIdx(int i) const { + int offset = i * 3 + 16; + return static_cast((data & (0x7ull << offset)) >> offset); + } + + uint64_t data; + }; + static_assert(sizeof(BC_channel) == 8, "BC_channel must be 8 bytes"); + + struct BC_alpha { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp) const { + dst += 3; // Write only to alpha (channel 3) + for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++, dst += dstPitch) { + uint8_t *dstRow = dst; + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, dstRow += dstBpp) { + *dstRow = getAlpha(j * BlockHeight + i); + } + } + } + + private: + uint8_t getAlpha(int i) const { + int offset = i << 2; + int alpha = (data & (0xFull << offset)) >> offset; + return static_cast(alpha | (alpha << 4)); + } + + uint64_t data; + }; + static_assert(sizeof(BC_alpha) == 8, "BC_alpha must be 8 bytes"); + + namespace BC6H { + static constexpr int MaxPartitions = 64; + + // @fmt:off + + static constexpr uint8_t PartitionTable2[MaxPartitions][16] = { + { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, + { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, + { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, + { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, + { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, + { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, + { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, + { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, + { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, + { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, + { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, + { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, + { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, + { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, + { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, + { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, + { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, + { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, + { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, + { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, + { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, + { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, + { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 }, + }; + + static constexpr uint8_t AnchorTable2[MaxPartitions] = { + 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, + 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, + 0xf, 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0xf, + 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0x2, 0x2, + 0xf, 0xf, 0x6, 0x8, 0x2, 0x8, 0xf, 0xf, + 0x2, 0x8, 0x2, 0x2, 0x2, 0xf, 0xf, 0x6, + 0x6, 0x2, 0x6, 0x8, 0xf, 0xf, 0x2, 0x2, + 0xf, 0xf, 0xf, 0xf, 0xf, 0x2, 0x2, 0xf, + }; + + // @fmt:on + + // 1.0f in half-precision floating point format + static constexpr uint16_t halfFloat1 = 0x3C00; + union Color { + struct RGBA { + uint16_t r = 0; + uint16_t g = 0; + uint16_t b = 0; + uint16_t a = halfFloat1; + + RGBA(uint16_t r, uint16_t g, uint16_t b) + : r(r), g(g), b(b) { + } + + RGBA &operator=(const RGBA &other) { + this->r = other.r; + this->g = other.g; + this->b = other.b; + this->a = halfFloat1; + + return *this; + } + }; + + Color(uint16_t r, uint16_t g, uint16_t b) + : rgba(r, g, b) { + } + + Color(int r, int g, int b) + : rgba((uint16_t) r, (uint16_t) g, (uint16_t) b) { + } + + Color() {} + + Color(const Color &other) { + this->rgba = other.rgba; + } + + Color &operator=(const Color &other) { + this->rgba = other.rgba; + + return *this; + } + + RGBA rgba; + uint16_t channel[4]; + }; + static_assert(sizeof(Color) == 8, "BC6h::Color must be 8 bytes long"); + + inline int32_t extendSign(int32_t val, size_t size) { + // Suppose we have a 2-bit integer being stored in 4 bit variable: + // x = 0b00AB + // + // In order to sign extend x, we need to turn the 0s into A's: + // x_extend = 0bAAAB + // + // We can do that by flipping A in x then subtracting 0b0010 from x. + // Suppose A is 1: + // x = 0b001B + // x_flip = 0b000B + // x_minus = 0b111B + // Since A is flipped to 0, subtracting the mask sets it and all the bits above it to 1. + // And if A is 0: + // x = 0b000B + // x_flip = 0b001B + // x_minus = 0b000B + // We unset the bit we flipped, and touch no other bit + uint16_t mask = 1u << (size - 1); + return (val ^ mask) - mask; + } + + static int constexpr RGBfChannels = 3; + struct RGBf { + uint16_t channel[RGBfChannels]; + size_t size[RGBfChannels]; + bool isSigned; + + RGBf() { + static_assert(RGBfChannels == 3, "RGBf must have exactly 3 channels"); + static_assert(sizeof(channel) / sizeof(channel[0]) == RGBfChannels, "RGBf must have exactly 3 channels"); + static_assert(sizeof(channel) / sizeof(channel[0]) == sizeof(size) / sizeof(size[0]), "RGBf requires equally sized arrays for channels and channel sizes"); + + for (int i = 0; i < RGBfChannels; i++) { + channel[i] = 0; + size[i] = 0; + } + + isSigned = false; + } + + void extendSign() { + for (int i = 0; i < RGBfChannels; i++) { + channel[i] = BC6H::extendSign(channel[i], size[i]); + } + } + + // Assuming this is the delta, take the base-endpoint and transform this into + // a proper endpoint. + // + // The final computed endpoint is truncated to the base-endpoint's size; + void resolveDelta(RGBf base) { + for (int i = 0; i < RGBfChannels; i++) { + size[i] = base.size[i]; + channel[i] = (base.channel[i] + channel[i]) & ((1 << base.size[i]) - 1); + } + + // Per the spec: + // "For signed formats, the results of the delta calculation must be sign + // extended as well." + if (isSigned) { + extendSign(); + } + } + + void unquantize() { + if (isSigned) { + unquantizeSigned(); + } else { + unquantizeUnsigned(); + } + } + + void unquantizeUnsigned() { + for (int i = 0; i < RGBfChannels; i++) { + if (size[i] >= 15 || channel[i] == 0) { + continue; + } else if (channel[i] == ((1u << size[i]) - 1)) { + channel[i] = 0xFFFFu; + } else { + // Need 32 bits to avoid overflow + uint32_t tmp = channel[i]; + channel[i] = (uint16_t) (((tmp << 16) + 0x8000) >> size[i]); + } + size[i] = 16; + } + } + + void unquantizeSigned() { + for (int i = 0; i < RGBfChannels; i++) { + if (size[i] >= 16 || channel[i] == 0) { + continue; + } + + int16_t value = (int16_t)channel[i]; + int32_t result = value; + bool signBit = value < 0; + if (signBit) { + value = -value; + } + + if (value >= ((1 << (size[i] - 1)) - 1)) { + result = 0x7FFF; + } else { + // Need 32 bits to avoid overflow + int32_t tmp = value; + result = (((tmp << 15) + 0x4000) >> (size[i] - 1)); + } + + if (signBit) { + result = -result; + } + + channel[i] = (uint16_t) result; + size[i] = 16; + } + } + }; + + struct Data { + uint64_t low64; + uint64_t high64; + + Data() = default; + + Data(uint64_t low64, uint64_t high64) + : low64(low64), high64(high64) { + } + + // Consumes the lowest N bits from from low64 and high64 where N is: + // abs(MSB - LSB) + // MSB and LSB come from the block description of the BC6h spec and specify + // the location of the bits in the returned bitstring. + // + // If MSB < LSB, then the bits are reversed. Otherwise, the bitstring is read and + // shifted without further modification. + // + uint32_t consumeBits(uint32_t MSB, uint32_t LSB) { + bool reversed = MSB < LSB; + if (reversed) { + std::swap(MSB, LSB); + } + assert(MSB - LSB + 1 < sizeof(uint32_t) * 8); + + uint32_t numBits = MSB - LSB + 1; + uint32_t mask = (1 << numBits) - 1; + // Read the low N bits + uint32_t bits = (low64 & mask); + + low64 >>= numBits; + // Put the low N bits of high64 into the high 64-N bits of low64 + low64 |= (high64 & mask) << (sizeof(high64) * 8 - numBits); + high64 >>= numBits; + + if (reversed) { + uint32_t tmp = 0; + for (uint32_t numSwaps = 0; numSwaps < numBits; numSwaps++) { + tmp <<= 1; + tmp |= (bits & 1); + bits >>= 1; + } + + bits = tmp; + } + + return bits << LSB; + } + }; + + struct IndexInfo { + uint64_t value; + int numBits; + }; + +// Interpolates between two endpoints, then does a final unquantization step + Color interpolate(RGBf e0, RGBf e1, const IndexInfo &index, bool isSigned) { + static constexpr uint32_t weights3[] = {0, 9, 18, 27, 37, 46, 55, 64}; + static constexpr uint32_t weights4[] = {0, 4, 9, 13, 17, 21, 26, 30, + 34, 38, 43, 47, 51, 55, 60, 64}; + static constexpr uint32_t const *weightsN[] = { + nullptr, nullptr, nullptr, weights3, weights4 + }; + auto weights = weightsN[index.numBits]; + assert(weights != nullptr); + Color color; + uint32_t e0Weight = 64 - weights[index.value]; + uint32_t e1Weight = weights[index.value]; + + for (int i = 0; i < RGBfChannels; i++) { + int32_t e0Channel = e0.channel[i]; + int32_t e1Channel = e1.channel[i]; + + if (isSigned) { + e0Channel = extendSign(e0Channel, 16); + e1Channel = extendSign(e1Channel, 16); + } + + int32_t e0Value = e0Channel * e0Weight; + int32_t e1Value = e1Channel * e1Weight; + + uint32_t tmp = ((e0Value + e1Value + 32) >> 6); + + // Need to unquantize value to limit it to the legal range of half-precision + // floats. We do this by scaling by 31/32 or 31/64 depending on if the value + // is signed or unsigned. + if (isSigned) { + tmp = ((tmp & 0x80000000) != 0) ? (((~tmp + 1) * 31) >> 5) | 0x8000 : (tmp * 31) >> 5; + // Don't return -0.0f, just normalize it to 0.0f. + if (tmp == 0x8000) + tmp = 0; + } else { + tmp = (tmp * 31) >> 6; + } + + color.channel[i] = (uint16_t) tmp; + } + + return color; + } + + enum DataType { + // Endpoints + EP0 = 0, + EP1 = 1, + EP2 = 2, + EP3 = 3, + Mode, + Partition, + End, + }; + + enum Channel { + R = 0, + G = 1, + B = 2, + None, + }; + + struct DeltaBits { + size_t channel[3]; + + constexpr DeltaBits() + : channel{0, 0, 0} { + } + + constexpr DeltaBits(size_t r, size_t g, size_t b) + : channel{r, g, b} { + } + }; + + struct ModeDesc { + int number; + bool hasDelta; + int partitionCount; + int endpointBits; + DeltaBits deltaBits; + + constexpr ModeDesc() + : number(-1), hasDelta(false), partitionCount(0), endpointBits(0) { + } + + constexpr ModeDesc(int number, bool hasDelta, int partitionCount, int endpointBits, DeltaBits deltaBits) + : number(number), hasDelta(hasDelta), partitionCount(partitionCount), endpointBits(endpointBits), deltaBits(deltaBits) { + } + }; + + struct BlockDesc { + DataType type; + Channel channel; + int MSB; + int LSB; + ModeDesc modeDesc; + + constexpr BlockDesc() + : type(End), channel(None), MSB(0), LSB(0), modeDesc() { + } + + constexpr BlockDesc(const DataType type, Channel channel, int MSB, int LSB, ModeDesc modeDesc) + : type(type), channel(channel), MSB(MSB), LSB(LSB), modeDesc(modeDesc) { + } + + constexpr BlockDesc(DataType type, Channel channel, int MSB, int LSB) + : type(type), channel(channel), MSB(MSB), LSB(LSB), modeDesc() { + } + }; + +// Turns a legal mode into an index into the BlockDesc table. +// Illegal or reserved modes return -1. + static int modeToIndex(uint8_t mode) { + if (mode <= 3) { + return mode; + } else if ((mode & 0x2) != 0) { + if (mode <= 18) { +// Turns 6 into 4, 7 into 5, 10 into 6, etc. + return (mode / 2) + 1 + (mode & 0x1); + } else if (mode == 22 || mode == 26 || mode == 30) { +// Turns 22 into 11, 26 into 12, etc. + return mode / 4 + 6; + } + } + + return -1; + } + +// Returns a description of the bitfields for each mode from the LSB +// to the MSB before the index data starts. +// +// The numbers come from the BC6h block description. Each BlockDesc in the +// {Type, Channel, MSB, LSB} +// * Type describes which endpoint this is, or if this is a mode, a partition +// number, or the end of the block description. +// * Channel describes one of the 3 color channels within an endpoint +// * MSB and LSB specificy: +// * The size of the bitfield being read +// * The position of the bitfield within the variable it is being read to +// * If the bitfield is stored in reverse bit order +// If MSB < LSB then the bitfield is stored in reverse order. The size of +// the bitfield is abs(MSB-LSB+1). And the position of the bitfield within +// the variable is min(LSB, MSB). +// +// Invalid or reserved modes return an empty list. + static constexpr int NumBlocks = 14; +// The largest number of descriptions within a block. + static constexpr int MaxBlockDescIndex = 26; + static constexpr BlockDesc blockDescs[NumBlocks][MaxBlockDescIndex] = { +// @fmt:off +// Mode 0, Index 0 +{ +{ Mode, None, 1, 0, { 0, true, 2, 10, { 5, 5, 5 } } }, +{ EP2, G, 4, 4 }, { EP2, B, 4, 4 }, { EP3, B, 4, 4 }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 4, 0 }, { EP3, B, 1, 1 }, { EP2, B, 3, 0 }, +{ EP2, R, 4, 0 }, { EP3, B, 2, 2 }, { EP3, R, 4, 0 }, +{ EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 1, Index 1 +{ +{ Mode, None, 1, 0, { 1, true, 2, 7, { 6, 6, 6 } } }, +{ EP2, G, 5, 5 }, { EP3, G, 5, 4 }, { EP0, R, 6, 0 }, +{ EP3, B, 1, 0 }, { EP2, B, 4, 4 }, { EP0, G, 6, 0 }, +{ EP2, B, 5, 5 }, { EP3, B, 2, 2 }, { EP2, G, 4, 4 }, +{ EP0, B, 6, 0 }, { EP3, B, 3, 3 }, { EP3, B, 5, 5 }, +{ EP3, B, 4, 4 }, { EP1, R, 5, 0 }, { EP2, G, 3, 0 }, +{ EP1, G, 5, 0 }, { EP3, G, 3, 0 }, { EP1, B, 5, 0 }, +{ EP2, B, 3, 0 }, { EP2, R, 5, 0 }, { EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 2, Index 2 +{ +{ Mode, None, 4, 0, { 2, true, 2, 11, { 5, 4, 4 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 4, 0 }, { EP0, R, 10, 10 }, { EP2, G, 3, 0 }, +{ EP1, G, 3, 0 }, { EP0, G, 10, 10 }, { EP3, B, 0, 0 }, +{ EP3, G, 3, 0 }, { EP1, B, 3, 0 }, { EP0, B, 10, 10 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 3, Index 3 +{ +{ Mode, None, 4, 0, { 3, false, 1, 10, { 0, 0, 0 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 9, 0 }, { EP1, G, 9, 0 }, { EP1, B, 9, 0 }, +{ End, None, 0, 0}, +}, +// Mode 6, Index 4 +{ +{ Mode, None, 4, 0, { 6, true, 2, 11, { 4, 5, 4 } } }, // 1 1 +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 10 }, { EP3, G, 4, 4 }, +{ EP2, G, 3, 0 }, { EP1, G, 4, 0 }, { EP0, G, 10, 10 }, +{ EP3, G, 3, 0 }, { EP1, B, 3, 0 }, { EP0, B, 10, 10 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 3, 0 }, +{ EP3, B, 0, 0 }, { EP3, B, 2, 2 }, { EP3, R, 3, 0 }, // 18 19 +{ EP2, G, 4, 4 }, { EP3, B, 3, 3 }, // 2 21 +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 7, Index 5 +{ +{ Mode, None, 4, 0, { 7, true, 1, 11, { 9, 9, 9 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 8, 0 }, { EP0, R, 10, 10 }, { EP1, G, 8, 0 }, +{ EP0, G, 10, 10 }, { EP1, B, 8, 0 }, { EP0, B, 10, 10 }, +{ End, None, 0, 0}, +}, +// Mode 10, Index 6 +{ +{ Mode, None, 4, 0, { 10, true, 2, 11, { 4, 4, 5 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 10 }, { EP2, B, 4, 4 }, +{ EP2, G, 3, 0 }, { EP1, G, 3, 0 }, { EP0, G, 10, 10 }, +{ EP3, B, 0, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP0, B, 10, 10 }, { EP2, B, 3, 0 }, { EP2, R, 3, 0 }, +{ EP3, B, 1, 1 }, { EP3, B, 2, 2 }, { EP3, R, 3, 0 }, +{ EP3, B, 4, 4 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 11, Index 7 +{ +{ Mode, None, 4, 0, { 11, true, 1, 12, { 8, 8, 8 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 7, 0 }, { EP0, R, 10, 11 }, { EP1, G, 7, 0 }, +{ EP0, G, 10, 11 }, { EP1, B, 7, 0 }, { EP0, B, 10, 11 }, +{ End, None, 0, 0}, +}, +// Mode 14, Index 8 +{ +{ Mode, None, 4, 0, { 14, true, 2, 9, { 5, 5, 5 } } }, +{ EP0, R, 8, 0 }, { EP2, B, 4, 4 }, { EP0, G, 8, 0 }, +{ EP2, G, 4, 4 }, { EP0, B, 8, 0 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 4, 0 }, { EP3, B, 1, 1 }, { EP2, B, 3, 0 }, +{ EP2, R, 4, 0 }, { EP3, B, 2, 2 }, { EP3, R, 4, 0 }, +{ EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 15, Index 9 +{ +{ Mode, None, 4, 0, { 15, true, 1, 16, { 4, 4, 4 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 15 }, { EP1, G, 3, 0 }, +{ EP0, G, 10, 15 }, { EP1, B, 3, 0 }, { EP0, B, 10, 15 }, +{ End, None, 0, 0}, +}, +// Mode 18, Index 10 +{ +{ Mode, None, 4, 0, { 18, true, 2, 8, { 6, 5, 5 } } }, +{ EP0, R, 7, 0 }, { EP3, G, 4, 4 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP3, B, 2, 2 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, B, 3, 3 }, { EP3, B, 4, 4 }, +{ EP1, R, 5, 0 }, { EP2, G, 3, 0 }, { EP1, G, 4, 0 }, +{ EP3, B, 0, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 5, 0 }, +{ EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 22, Index 11 +{ +{ Mode, None, 4, 0, { 22, true, 2, 8, { 5, 6, 5 } } }, +{ EP0, R, 7, 0 }, { EP3, B, 0, 0 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP2, G, 5, 5 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, G, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 5, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 26, Index 12 +{ +{ Mode, None, 4, 0, { 26, true, 2, 8, { 5, 5, 6 } } }, +{ EP0, R, 7, 0 }, { EP3, B, 1, 1 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP2, B, 5, 5 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, B, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 5, 0 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 30, Index 13 +{ +{ Mode, None, 4, 0, { 30, false, 2, 6, { 0, 0, 0 } } }, +{ EP0, R, 5, 0 }, { EP3, G, 4, 4 }, { EP3, B, 0, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 4, 4 }, { EP0, G, 5, 0 }, +{ EP2, G, 5, 5 }, { EP2, B, 5, 5 }, { EP3, B, 2, 2 }, +{ EP2, G, 4, 4 }, { EP0, B, 5, 0 }, { EP3, G, 5, 5 }, +{ EP3, B, 3, 3 }, { EP3, B, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 5, 0 }, { EP2, G, 3, 0 }, { EP1, G, 5, 0 }, +{ EP3, G, 3, 0 }, { EP1, B, 5, 0 }, { EP2, B, 3, 0 }, +{ EP2, R, 5, 0 }, { EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +} +// @fmt:on + }; + + struct Block { + uint64_t low64; + uint64_t high64; + + void decode(uint8_t *dst, size_t dstX, size_t dstY, size_t dstWidth, size_t dstHeight, size_t dstPitch, size_t dstBpp, bool isSigned) const { + uint8_t mode = 0; + Data data(low64, high64); + assert(dstBpp == sizeof(Color)); + + if ((data.low64 & 0x2) == 0) { + mode = data.consumeBits(1, 0); + } else { + mode = data.consumeBits(4, 0); + } + + int blockIndex = modeToIndex(mode); + // Handle illegal or reserved mode + if (blockIndex == -1) { + for (int y = 0; y < 4 && y + dstY < dstHeight; y++) { + for (int x = 0; x < 4 && x + dstX < dstWidth; x++) { + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + out->rgba = {0, 0, 0}; + } + } + return; + } + const BlockDesc *blockDesc = blockDescs[blockIndex]; + + RGBf e[4]; + e[0].isSigned = e[1].isSigned = e[2].isSigned = e[3].isSigned = isSigned; + + int partition = 0; + ModeDesc modeDesc; + for (int index = 0; blockDesc[index].type != End; index++) { + const BlockDesc desc = blockDesc[index]; + + switch (desc.type) { + case Mode: + modeDesc = desc.modeDesc; + assert(modeDesc.number == mode); + + e[0].size[0] = e[0].size[1] = e[0].size[2] = modeDesc.endpointBits; + for (int i = 0; i < RGBfChannels; i++) { + if (modeDesc.hasDelta) { + e[1].size[i] = e[2].size[i] = e[3].size[i] = modeDesc.deltaBits.channel[i]; + } else { + e[1].size[i] = e[2].size[i] = e[3].size[i] = modeDesc.endpointBits; + } + } + break; + case Partition: + partition |= data.consumeBits(desc.MSB, desc.LSB); + break; + case EP0: + case EP1: + case EP2: + case EP3: + e[desc.type].channel[desc.channel] |= data.consumeBits(desc.MSB, desc.LSB); + break; + default: + assert(false); + return; + } + } + + // Sign extension + if (isSigned) { + for (int ep = 0; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].extendSign(); + } + } else if (modeDesc.hasDelta) { + // Don't sign-extend the base endpoint in an unsigned format. + for (int ep = 1; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].extendSign(); + } + } + + // Turn the deltas into endpoints + if (modeDesc.hasDelta) { + for (int ep = 1; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].resolveDelta(e[0]); + } + } + + for (int ep = 0; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].unquantize(); + } + + // Get the indices, calculate final colors, and output + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + int pixelNum = x + y * 4; + IndexInfo idx; + bool isAnchor = false; + int firstEndpoint = 0; + // Bc6H can have either 1 or 2 petitions depending on the mode. + // The number of petitions affects the number of indices with implicit + // leading 0 bits and the number of bits per index. + if (modeDesc.partitionCount == 1) { + idx.numBits = 4; + // There's an implicit leading 0 bit for the first idx + isAnchor = (pixelNum == 0); + } else { + idx.numBits = 3; + // There are 2 indices with implicit leading 0-bits. + isAnchor = ((pixelNum == 0) || (pixelNum == AnchorTable2[partition])); + firstEndpoint = PartitionTable2[partition][pixelNum] * 2; + } + + idx.value = data.consumeBits(idx.numBits - isAnchor - 1, 0); + + // Don't exit the loop early, we need to consume these index bits regardless if + // we actually output them or not. + if ((y + dstY >= dstHeight) || (x + dstX >= dstWidth)) { + continue; + } + + Color color = interpolate(e[firstEndpoint], e[firstEndpoint + 1], idx, isSigned); + auto out = reinterpret_cast(dst + dstBpp * x + dstPitch * y); + *out = color; + } + } + } + }; + + } // namespace BC6H + + namespace BC7 { +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_bptc.txt +// https://docs.microsoft.com/en-us/windows/win32/direct3d11/bc7-format + + struct Bitfield { + int offset; + int count; + + constexpr Bitfield Then(const int bits) { return {offset + count, bits}; } + + constexpr bool operator==(const Bitfield &rhs) { + return offset == rhs.offset && count == rhs.count; + } + }; + + struct Mode { + const int IDX; // Mode index + const int NS; // Number of subsets in each partition + const int PB; // Partition bits + const int RB; // Rotation bits + const int ISB; // Index selection bits + const int CB; // Color bits + const int AB; // Alpha bits + const int EPB; // Endpoint P-bits + const int SPB; // Shared P-bits + const int IB; // Primary index bits per element + const int IBC; // Primary index bits total + const int IB2; // Secondary index bits per element + + constexpr int NumColors() const { return NS * 2; } + + constexpr Bitfield Partition() const { return {IDX + 1, PB}; } + + constexpr Bitfield Rotation() const { return Partition().Then(RB); } + + constexpr Bitfield IndexSelection() const { return Rotation().Then(ISB); } + + constexpr Bitfield Red(int idx) const { + return IndexSelection().Then(CB * idx).Then(CB); + } + + constexpr Bitfield Green(int idx) const { + return Red(NumColors() - 1).Then(CB * idx).Then(CB); + } + + constexpr Bitfield Blue(int idx) const { + return Green(NumColors() - 1).Then(CB * idx).Then(CB); + } + + constexpr Bitfield Alpha(int idx) const { + return Blue(NumColors() - 1).Then(AB * idx).Then(AB); + } + + constexpr Bitfield EndpointPBit(int idx) const { + return Alpha(NumColors() - 1).Then(EPB * idx).Then(EPB); + } + + constexpr Bitfield SharedPBit0() const { + return EndpointPBit(NumColors() - 1).Then(SPB); + } + + constexpr Bitfield SharedPBit1() const { + return SharedPBit0().Then(SPB); + } + + constexpr Bitfield PrimaryIndex(int offset, int count) const { + return SharedPBit1().Then(offset).Then(count); + } + + constexpr Bitfield SecondaryIndex(int offset, int count) const { + return SharedPBit1().Then(IBC + offset).Then(count); + } + }; + + static constexpr Mode Modes[] = { + // IDX NS PB RB ISB CB AB EPB SPB IB IBC, IB2 + /**/ {0x0, 0x3, 0x4, 0x0, 0x0, 0x4, 0x0, 0x1, 0x0, 0x3, 0x2d, 0x0}, +/**/ {0x1, 0x2, 0x6, 0x0, 0x0, 0x6, 0x0, 0x0, 0x1, 0x3, 0x2e, 0x0}, +/**/ {0x2, 0x3, 0x6, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x2, 0x1d, 0x0}, +/**/ {0x3, 0x2, 0x6, 0x0, 0x0, 0x7, 0x0, 0x1, 0x0, 0x2, 0x1e, 0x0}, +/**/ {0x4, 0x1, 0x0, 0x2, 0x1, 0x5, 0x6, 0x0, 0x0, 0x2, 0x1f, 0x3}, +/**/ {0x5, 0x1, 0x0, 0x2, 0x0, 0x7, 0x8, 0x0, 0x0, 0x2, 0x1f, 0x2}, +/**/ {0x6, 0x1, 0x0, 0x0, 0x0, 0x7, 0x7, 0x1, 0x0, 0x4, 0x3f, 0x0}, +/**/ {0x7, 0x2, 0x6, 0x0, 0x0, 0x5, 0x5, 0x1, 0x0, 0x2, 0x1e, 0x0}, +/**/ {-1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x00, 0x0}, + }; + + static constexpr int MaxPartitions = 64; + static constexpr int MaxSubsets = 3; + + static constexpr uint8_t PartitionTable2[MaxPartitions][16] = { + {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, + {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1}, + {0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1}, + {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1}, + {0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0}, + {0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0}, + {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0}, + {0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1}, + {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0}, + {0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0}, + {0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0}, + {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}, + {0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0}, + {0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}, + {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0}, + {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0}, + {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}, + {0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1}, + {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1}, + {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0}, + {0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0}, + {0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0}, + {0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0}, + {0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1}, + {0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1}, + {0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0}, + {0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}, + {0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0}, + {0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0}, + {0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1}, + {0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0}, + {0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1}, + {0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1}, + {0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1}, + {0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}, + {0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0}, + {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1}, + }; + + static constexpr uint8_t PartitionTable3[MaxPartitions][16] = { + {0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2}, + {0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1}, + {0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2}, + {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2}, + {0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2}, + {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2}, + {0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2}, + {0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2}, + {0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2}, + {0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2}, + {0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0}, + {0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2}, + {0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1}, + {0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2}, + {0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2}, + {0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0}, + {0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0}, + {0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2}, + {0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0}, + {0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1}, + {0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2}, + {0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1}, + {0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2}, + {0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2}, + {0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0}, + {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0}, + {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, + {0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0}, + {0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1}, + {0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1}, + {0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1}, + {0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2}, + {0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1}, + {0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1}, + {0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1}, + {0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2}, + {0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1}, + {0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2}, + {0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2}, + {0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2}, + {0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2}, + {0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2}, + {0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1}, + {0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2}, + {0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0}, + }; + + static constexpr uint8_t AnchorTable2[MaxPartitions] = { +// @fmt:off +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0xf, +0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0x2, 0x2, +0xf, 0xf, 0x6, 0x8, 0x2, 0x8, 0xf, 0xf, +0x2, 0x8, 0x2, 0x2, 0x2, 0xf, 0xf, 0x6, +0x6, 0x2, 0x6, 0x8, 0xf, 0xf, 0x2, 0x2, +0xf, 0xf, 0xf, 0xf, 0xf, 0x2, 0x2, 0xf, +// @fmt:on + }; + + static constexpr uint8_t AnchorTable3a[MaxPartitions] = { +// @fmt:off +0x3, 0x3, 0xf, 0xf, 0x8, 0x3, 0xf, 0xf, +0x8, 0x8, 0x6, 0x6, 0x6, 0x5, 0x3, 0x3, +0x3, 0x3, 0x8, 0xf, 0x3, 0x3, 0x6, 0xa, +0x5, 0x8, 0x8, 0x6, 0x8, 0x5, 0xf, 0xf, +0x8, 0xf, 0x3, 0x5, 0x6, 0xa, 0x8, 0xf, +0xf, 0x3, 0xf, 0x5, 0xf, 0xf, 0xf, 0xf, +0x3, 0xf, 0x5, 0x5, 0x5, 0x8, 0x5, 0xa, +0x5, 0xa, 0x8, 0xd, 0xf, 0xc, 0x3, 0x3, +// @fmt:on + }; + + static constexpr uint8_t AnchorTable3b[MaxPartitions] = { +// @fmt:off +0xf, 0x8, 0x8, 0x3, 0xf, 0xf, 0x3, 0x8, +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x8, +0xf, 0x8, 0xf, 0x3, 0xf, 0x8, 0xf, 0x8, +0x3, 0xf, 0x6, 0xa, 0xf, 0xf, 0xa, 0x8, +0xf, 0x3, 0xf, 0xa, 0xa, 0x8, 0x9, 0xa, +0x6, 0xf, 0x8, 0xf, 0x3, 0x6, 0x6, 0x8, +0xf, 0x3, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0xf, 0xf, 0xf, 0x3, 0xf, 0xf, 0x8, +// @fmt:on + }; + + struct Color { + struct RGB { + RGB() = default; + + RGB(uint8_t r, uint8_t g, uint8_t b) + : b(b), g(g), r(r) {} + + RGB(int r, int g, int b) + : b(static_cast(b)), g(static_cast(g)), r(static_cast(r)) {} + + RGB operator<<(int shift) const { return {r << shift, g << shift, b << shift}; } + + RGB operator>>(int shift) const { return {r >> shift, g >> shift, b >> shift}; } + + RGB operator|(int bits) const { return {r | bits, g | bits, b | bits}; } + + RGB operator|(const RGB &rhs) const { return {r | rhs.r, g | rhs.g, b | rhs.b}; } + + RGB operator+(const RGB &rhs) const { return {r + rhs.r, g + rhs.g, b + rhs.b}; } + + uint8_t b; + uint8_t g; + uint8_t r; + }; + + RGB rgb; + uint8_t a; + }; + + static_assert(sizeof(Color) == 4, "Color size must be 4 bytes"); + + struct Block { + constexpr uint64_t Get(const Bitfield &bf) const { + uint64_t mask = (1ULL << bf.count) - 1; + if (bf.offset + bf.count <= 64) { + return (low >> bf.offset) & mask; + } + if (bf.offset >= 64) { + return (high >> (bf.offset - 64)) & mask; + } + return ((low >> bf.offset) | (high << (64 - bf.offset))) & mask; + } + + const Mode &mode() const { + if ((low & 0b00000001) != 0) { + return Modes[0]; + } + if ((low & 0b00000010) != 0) { + return Modes[1]; + } + if ((low & 0b00000100) != 0) { + return Modes[2]; + } + if ((low & 0b00001000) != 0) { + return Modes[3]; + } + if ((low & 0b00010000) != 0) { + return Modes[4]; + } + if ((low & 0b00100000) != 0) { + return Modes[5]; + } + if ((low & 0b01000000) != 0) { + return Modes[6]; + } + if ((low & 0b10000000) != 0) { + return Modes[7]; + } + return Modes[8]; // Invalid mode + } + + struct IndexInfo { + uint64_t value; + int numBits; + }; + + uint8_t interpolate(uint8_t e0, uint8_t e1, const IndexInfo &index) const { + static constexpr uint16_t weights2[] = {0, 21, 43, 64}; + static constexpr uint16_t weights3[] = {0, 9, 18, 27, 37, 46, 55, 64}; + static constexpr uint16_t weights4[] = {0, 4, 9, 13, 17, 21, 26, 30, + 34, 38, 43, 47, 51, 55, 60, 64}; + static constexpr uint16_t const *weightsN[] = { + nullptr, nullptr, weights2, weights3, weights4 + }; + auto weights = weightsN[index.numBits]; + assert(weights != nullptr); + return (uint8_t) (((64 - weights[index.value]) * uint16_t(e0) + weights[index.value] * uint16_t(e1) + 32) >> 6); + } + + void decode(uint8_t *dst, size_t dstX, size_t dstY, size_t dstWidth, size_t dstHeight, size_t dstPitch) const { + auto const &mode = this->mode(); + + if (mode.IDX < 0) // Invalid mode: + { + for (size_t y = 0; y < 4 && y + dstY < dstHeight; y++) { + for (size_t x = 0; x < 4 && x + dstX < dstWidth; x++) { + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + out->rgb = {0, 0, 0}; + out->a = 0; + } + } + return; + } + + using Endpoint = std::array; + std::array subsets; + + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + subset[0].rgb.r = Get(mode.Red(i * 2 + 0)); + subset[0].rgb.g = Get(mode.Green(i * 2 + 0)); + subset[0].rgb.b = Get(mode.Blue(i * 2 + 0)); + subset[0].a = (mode.AB > 0) ? Get(mode.Alpha(i * 2 + 0)) : 255; + + subset[1].rgb.r = Get(mode.Red(i * 2 + 1)); + subset[1].rgb.g = Get(mode.Green(i * 2 + 1)); + subset[1].rgb.b = Get(mode.Blue(i * 2 + 1)); + subset[1].a = (mode.AB > 0) ? Get(mode.Alpha(i * 2 + 1)) : 255; + } + + if (mode.SPB > 0) { + auto pbit0 = Get(mode.SharedPBit0()); + auto pbit1 = Get(mode.SharedPBit1()); + subsets[0][0].rgb = (subsets[0][0].rgb << 1) | pbit0; + subsets[0][1].rgb = (subsets[0][1].rgb << 1) | pbit0; + subsets[1][0].rgb = (subsets[1][0].rgb << 1) | pbit1; + subsets[1][1].rgb = (subsets[1][1].rgb << 1) | pbit1; + } + + if (mode.EPB > 0) { + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + auto pbit0 = Get(mode.EndpointPBit(i * 2 + 0)); + auto pbit1 = Get(mode.EndpointPBit(i * 2 + 1)); + subset[0].rgb = (subset[0].rgb << 1) | pbit0; + subset[1].rgb = (subset[1].rgb << 1) | pbit1; + if (mode.AB > 0) { + subset[0].a = (subset[0].a << 1) | pbit0; + subset[1].a = (subset[1].a << 1) | pbit1; + } + } + } + + auto const colorBits = mode.CB + mode.SPB + mode.EPB; + auto const alphaBits = mode.AB + mode.SPB + mode.EPB; + + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + subset[0].rgb = subset[0].rgb << (8 - colorBits); + subset[1].rgb = subset[1].rgb << (8 - colorBits); + subset[0].rgb = subset[0].rgb | (subset[0].rgb >> colorBits); + subset[1].rgb = subset[1].rgb | (subset[1].rgb >> colorBits); + + if (mode.AB > 0) { + subset[0].a = subset[0].a << (8 - alphaBits); + subset[1].a = subset[1].a << (8 - alphaBits); + subset[0].a = subset[0].a | (subset[0].a >> alphaBits); + subset[1].a = subset[1].a | (subset[1].a >> alphaBits); + } + } + + int colorIndexBitOffset = 0; + int alphaIndexBitOffset = 0; + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + auto texelIdx = y * 4 + x; + auto partitionIdx = Get(mode.Partition()); + assert(partitionIdx < MaxPartitions); + auto subsetIdx = subsetIndex(mode, partitionIdx, texelIdx); + assert(subsetIdx < MaxSubsets); + auto const &subset = subsets[subsetIdx]; + + auto anchorIdx = anchorIndex(mode, partitionIdx, subsetIdx); + auto isAnchor = anchorIdx == texelIdx; + auto colorIdx = colorIndex(mode, isAnchor, colorIndexBitOffset); + auto alphaIdx = alphaIndex(mode, isAnchor, alphaIndexBitOffset); + + if (y + dstY >= dstHeight || x + dstX >= dstWidth) { + // Don't be tempted to skip early at the loops: + // The calls to colorIndex() and alphaIndex() adjust bit + // offsets that need to be carefully tracked. + continue; + } + + Color output; + // Note: We flip r and b channels past this point as the texture storage is BGR while the output is RGB + output.rgb.r = interpolate(subset[0].rgb.b, subset[1].rgb.b, colorIdx); + output.rgb.g = interpolate(subset[0].rgb.g, subset[1].rgb.g, colorIdx); + output.rgb.b = interpolate(subset[0].rgb.r, subset[1].rgb.r, colorIdx); + output.a = interpolate(subset[0].a, subset[1].a, alphaIdx); + + switch (Get(mode.Rotation())) { + default: + break; + case 1: + std::swap(output.a, output.rgb.b); + break; + case 2: + std::swap(output.a, output.rgb.g); + break; + case 3: + std::swap(output.a, output.rgb.r); + break; + } + + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + *out = output; + } + } + } + + int subsetIndex(const Mode &mode, int partitionIdx, int texelIndex) const { + switch (mode.NS) { + default: + return 0; + case 2: + return PartitionTable2[partitionIdx][texelIndex]; + case 3: + return PartitionTable3[partitionIdx][texelIndex]; + } + } + + int anchorIndex(const Mode &mode, int partitionIdx, int subsetIdx) const { + // ARB_texture_compression_bptc states: + // "In partition zero, the anchor index is always index zero. + // In other partitions, the anchor index is specified by tables + // Table.A2 and Table.A3."" + // Note: This is really confusing - I believe they meant subset instead + // of partition here. + switch (subsetIdx) { + default: + return 0; + case 1: + return mode.NS == 2 ? AnchorTable2[partitionIdx] : AnchorTable3a[partitionIdx]; + case 2: + return AnchorTable3b[partitionIdx]; + } + } + + IndexInfo colorIndex(const Mode &mode, bool isAnchor, + int &indexBitOffset) const { + // ARB_texture_compression_bptc states: + // "The index value for interpolating color comes from the secondary + // index for the texel if the format has an index selection bit and its + // value is one and from the primary index otherwise."" + auto idx = Get(mode.IndexSelection()); + assert(idx <= 1); + bool secondary = idx == 1; + auto numBits = secondary ? mode.IB2 : mode.IB; + auto numReadBits = numBits - (isAnchor ? 1 : 0); + auto index = + Get(secondary ? mode.SecondaryIndex(indexBitOffset, numReadBits) + : mode.PrimaryIndex(indexBitOffset, numReadBits)); + indexBitOffset += numReadBits; + return {index, numBits}; + } + + IndexInfo alphaIndex(const Mode &mode, bool isAnchor, + int &indexBitOffset) const { + // ARB_texture_compression_bptc states: + // "The alpha index comes from the secondary index if the block has a + // secondary index and the block either doesn't have an index selection + // bit or that bit is zero and the primary index otherwise." + auto idx = Get(mode.IndexSelection()); + assert(idx <= 1); + bool secondary = (mode.IB2 != 0) && (idx == 0); + auto numBits = secondary ? mode.IB2 : mode.IB; + auto numReadBits = numBits - (isAnchor ? 1 : 0); + auto index = + Get(secondary ? mode.SecondaryIndex(indexBitOffset, numReadBits) + : mode.PrimaryIndex(indexBitOffset, numReadBits)); + indexBitOffset += numReadBits; + return {index, numBits}; + } + + // Assumes little-endian + uint64_t low; + uint64_t high; + }; + + } // namespace BC7 +} // anonymous namespace + +namespace bcn { + constexpr size_t R8Bpp{1}; //!< The amount of bytes per pixel in R8 + constexpr size_t R8g8Bpp{2}; //!< The amount of bytes per pixel in R8G8 + constexpr size_t R8g8b8a8Bpp{4}; //!< The amount of bytes per pixel in R8G8B8A8 + constexpr size_t R16g16b16a16Bpp{8}; //!< The amount of bytes per pixel in R16G16B16 + + void DecodeBc1(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *color{reinterpret_cast(src)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, true, false); + } + + void DecodeBc2(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *alpha{reinterpret_cast(src)}; + const auto *color{reinterpret_cast(src + 8)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, false, true); + alpha->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp); + } + + void DecodeBc3(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *alpha{reinterpret_cast(src)}; + const auto *color{reinterpret_cast(src + 8)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, false, true); + alpha->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, 3, false); + } + + void DecodeBc4(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *red{reinterpret_cast(src)}; + size_t pitch{R8Bpp * width}; + red->decode(dst, x, y, width, height, pitch, R8Bpp, 0, isSigned); + } + + void DecodeBc5(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *red{reinterpret_cast(src)}; + const auto *green{reinterpret_cast(src + 8)}; + size_t pitch{R8g8Bpp * width}; + red->decode(dst, x, y, width, height, pitch, R8g8Bpp, 0, isSigned); + green->decode(dst, x, y, width, height, pitch, R8g8Bpp, 1, isSigned); + } + + void DecodeBc6(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *block{reinterpret_cast(src)}; + size_t pitch{R16g16b16a16Bpp * width}; + block->decode(dst, x, y, width, height, pitch, R16g16b16a16Bpp, isSigned); + } + + void DecodeBc7(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *block{reinterpret_cast(src)}; + size_t pitch{R8g8b8a8Bpp * width}; + block->decode(dst, x, y, width, height, pitch); + } +} diff --git a/externals/bc_decoder/bc_decoder.h b/externals/bc_decoder/bc_decoder.h new file mode 100644 index 000000000..4f0ead7d3 --- /dev/null +++ b/externals/bc_decoder/bc_decoder.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#pragma once + +#include + +namespace bcn { + /** + * @brief Decodes a BC1 encoded image to R8G8B8A8 + */ + void DecodeBc1(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC2 encoded image to R8G8B8A8 + */ + void DecodeBc2(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC3 encoded image to R8G8B8A8 + */ + void DecodeBc3(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC4 encoded image to R8 + */ + void DecodeBc4(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC5 encoded image to R8G8 + */ + void DecodeBc5(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC6 encoded image to R16G16B16A16 + */ + void DecodeBc6(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC7 encoded image to R8G8B8A8 + */ + void DecodeBc7(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); +} diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index e9e6f278d..3b2fe01da 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -220,8 +220,8 @@ add_library(video_core STATIC surface.h texture_cache/accelerated_swizzle.cpp texture_cache/accelerated_swizzle.h - texture_cache/decode_bc4.cpp - texture_cache/decode_bc4.h + texture_cache/decode_bc.cpp + texture_cache/decode_bc.h texture_cache/descriptor_table.h texture_cache/formatter.cpp texture_cache/formatter.h @@ -279,7 +279,7 @@ add_library(video_core STATIC create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) -target_link_libraries(video_core PUBLIC glad shader_recompiler stb) +target_link_libraries(video_core PUBLIC glad shader_recompiler stb bc_decoder) if (YUZU_USE_BUNDLED_FFMPEG AND NOT (WIN32 OR ANDROID)) add_dependencies(video_core ffmpeg-build) diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 9a0b10568..a8540339d 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -259,6 +259,26 @@ FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with break; } } + // Transcode on hardware that doesn't support BCn natively + if (!device.IsOptimalBcnSupported() && VideoCore::Surface::IsPixelFormatBCn(pixel_format)) { + const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format); + if (pixel_format == PixelFormat::BC4_SNORM) { + tuple.format = VK_FORMAT_R8_SNORM; + } else if (pixel_format == PixelFormat::BC4_UNORM) { + tuple.format = VK_FORMAT_R8_UNORM; + } else if (pixel_format == PixelFormat::BC5_SNORM) { + tuple.format = VK_FORMAT_R8G8_SNORM; + } else if (pixel_format == PixelFormat::BC5_UNORM) { + tuple.format = VK_FORMAT_R8G8_UNORM; + } else if (pixel_format == PixelFormat::BC6H_SFLOAT || + pixel_format == PixelFormat::BC6H_UFLOAT) { + tuple.format = VK_FORMAT_R16G16B16A16_SFLOAT; + } else if (is_srgb) { + tuple.format = VK_FORMAT_A8B8G8R8_SRGB_PACK32; + } else { + tuple.format = VK_FORMAT_A8B8G8R8_UNORM_PACK32; + } + } const bool attachable = (tuple.usage & Attachable) != 0; const bool storage = (tuple.usage & Storage) != 0; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 268b955fb..f7c0d939a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -315,7 +315,14 @@ void RasterizerVulkan::Clear(u32 layer_count) { FlushWork(); gpu_memory->FlushCaching(); +#if ANDROID + if (Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + query_cache.UpdateCounters(); + } +#else query_cache.UpdateCounters(); +#endif auto& regs = maxwell3d->regs; const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ce6acc30c..8385b5509 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1279,6 +1279,10 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu flags |= VideoCommon::ImageFlagBits::Converted; flags |= VideoCommon::ImageFlagBits::CostlyLoad; } + if (IsPixelFormatBCn(info.format) && !runtime->device.IsOptimalBcnSupported()) { + flags |= VideoCommon::ImageFlagBits::Converted; + flags |= VideoCommon::ImageFlagBits::CostlyLoad; + } if (runtime->device.HasDebuggingToolAttached()) { original_image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); } diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index cb51529e4..e16cd5e73 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -269,6 +269,28 @@ bool IsPixelFormatASTC(PixelFormat format) { } } +bool IsPixelFormatBCn(PixelFormat format) { + switch (format) { + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC2_UNORM: + case PixelFormat::BC3_UNORM: + case PixelFormat::BC4_UNORM: + case PixelFormat::BC4_SNORM: + case PixelFormat::BC5_UNORM: + case PixelFormat::BC5_SNORM: + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC2_SRGB: + case PixelFormat::BC3_SRGB: + case PixelFormat::BC7_UNORM: + case PixelFormat::BC6H_UFLOAT: + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC7_SRGB: + return true; + default: + return false; + } +} + bool IsPixelFormatSRGB(PixelFormat format) { switch (format) { case PixelFormat::A8B8G8R8_SRGB: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 0225d3287..9b9c4d9bc 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -501,6 +501,8 @@ SurfaceType GetFormatType(PixelFormat pixel_format); bool IsPixelFormatASTC(PixelFormat format); +bool IsPixelFormatBCn(PixelFormat format); + bool IsPixelFormatSRGB(PixelFormat format); bool IsPixelFormatInteger(PixelFormat format); diff --git a/src/video_core/texture_cache/decode_bc.cpp b/src/video_core/texture_cache/decode_bc.cpp new file mode 100644 index 000000000..3e26474a3 --- /dev/null +++ b/src/video_core/texture_cache/decode_bc.cpp @@ -0,0 +1,129 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +#include "common/common_types.h" +#include "video_core/texture_cache/decode_bc.h" + +namespace VideoCommon { + +namespace { +constexpr u32 BLOCK_SIZE = 4; + +using VideoCore::Surface::PixelFormat; + +constexpr bool IsSigned(PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + return true; + default: + return false; + } +} + +constexpr u32 BlockSize(PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + return 8; + default: + return 16; + } +} +} // Anonymous namespace + +u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + return 1; + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + return 2; + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + return 8; + default: + return 4; + } +} + +template +void DecompressBlocks(std::span input, std::span output, Extent3D extent, + bool is_signed = false) { + const u32 out_bpp = ConvertedBytesPerBlock(pixel_format); + const u32 block_width = std::min(extent.width, BLOCK_SIZE); + const u32 block_height = std::min(extent.height, BLOCK_SIZE); + const u32 pitch = extent.width * out_bpp; + size_t input_offset = 0; + size_t output_offset = 0; + for (u32 slice = 0; slice < extent.depth; ++slice) { + for (u32 y = 0; y < extent.height; y += block_height) { + size_t row_offset = 0; + for (u32 x = 0; x < extent.width; + x += block_width, row_offset += block_width * out_bpp) { + const u8* src = input.data() + input_offset; + u8* const dst = output.data() + output_offset + row_offset; + if constexpr (IsSigned(pixel_format)) { + decompress(src, dst, x, y, extent.width, extent.height, is_signed); + } else { + decompress(src, dst, x, y, extent.width, extent.height); + } + input_offset += BlockSize(pixel_format); + } + output_offset += block_height * pitch; + } + } +} + +void DecompressBCn(std::span input, std::span output, Extent3D extent, + VideoCore::Surface::PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC1_RGBA_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC2_UNORM: + case PixelFormat::BC2_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC3_UNORM: + case PixelFormat::BC3_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC4_SNORM); + break; + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC5_SNORM); + break; + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC6H_SFLOAT); + break; + case PixelFormat::BC7_SRGB: + case PixelFormat::BC7_UNORM: + DecompressBlocks(input, output, extent); + break; + default: + LOG_WARNING(HW_GPU, "Unimplemented BCn decompression {}", pixel_format); + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc.h b/src/video_core/texture_cache/decode_bc.h new file mode 100644 index 000000000..41d1ec0a3 --- /dev/null +++ b/src/video_core/texture_cache/decode_bc.h @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "common/common_types.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +[[nodiscard]] u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format); + +void DecompressBCn(std::span input, std::span output, Extent3D extent, + VideoCore::Surface::PixelFormat pixel_format); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.cpp b/src/video_core/texture_cache/decode_bc4.cpp deleted file mode 100644 index ef98afdca..000000000 --- a/src/video_core/texture_cache/decode_bc4.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include - -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/texture_cache/decode_bc4.h" -#include "video_core/texture_cache/types.h" - -namespace VideoCommon { - -// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt -[[nodiscard]] constexpr u32 DecompressBlock(u64 bits, u32 x, u32 y) { - const u32 code_offset = 16 + 3 * (4 * y + x); - const u32 code = (bits >> code_offset) & 7; - const u32 red0 = (bits >> 0) & 0xff; - const u32 red1 = (bits >> 8) & 0xff; - if (red0 > red1) { - switch (code) { - case 0: - return red0; - case 1: - return red1; - case 2: - return (6 * red0 + 1 * red1) / 7; - case 3: - return (5 * red0 + 2 * red1) / 7; - case 4: - return (4 * red0 + 3 * red1) / 7; - case 5: - return (3 * red0 + 4 * red1) / 7; - case 6: - return (2 * red0 + 5 * red1) / 7; - case 7: - return (1 * red0 + 6 * red1) / 7; - } - } else { - switch (code) { - case 0: - return red0; - case 1: - return red1; - case 2: - return (4 * red0 + 1 * red1) / 5; - case 3: - return (3 * red0 + 2 * red1) / 5; - case 4: - return (2 * red0 + 3 * red1) / 5; - case 5: - return (1 * red0 + 4 * red1) / 5; - case 6: - return 0; - case 7: - return 0xff; - } - } - return 0; -} - -void DecompressBC4(std::span input, Extent3D extent, std::span output) { - UNIMPLEMENTED_IF_MSG(extent.width % 4 != 0, "Unaligned width={}", extent.width); - UNIMPLEMENTED_IF_MSG(extent.height % 4 != 0, "Unaligned height={}", extent.height); - static constexpr u32 BLOCK_SIZE = 4; - size_t input_offset = 0; - for (u32 slice = 0; slice < extent.depth; ++slice) { - for (u32 block_y = 0; block_y < extent.height / 4; ++block_y) { - for (u32 block_x = 0; block_x < extent.width / 4; ++block_x) { - u64 bits; - std::memcpy(&bits, &input[input_offset], sizeof(bits)); - input_offset += sizeof(bits); - - for (u32 y = 0; y < BLOCK_SIZE; ++y) { - for (u32 x = 0; x < BLOCK_SIZE; ++x) { - const u32 linear_z = slice; - const u32 linear_y = block_y * BLOCK_SIZE + y; - const u32 linear_x = block_x * BLOCK_SIZE + x; - const u32 offset_z = linear_z * extent.width * extent.height; - const u32 offset_y = linear_y * extent.width; - const u32 offset_x = linear_x; - const u32 output_offset = (offset_z + offset_y + offset_x) * 4ULL; - const u32 color = DecompressBlock(bits, x, y); - output[output_offset + 0] = static_cast(color); - output[output_offset + 1] = 0; - output[output_offset + 2] = 0; - output[output_offset + 3] = 0xff; - } - } - } - } - } -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.h b/src/video_core/texture_cache/decode_bc4.h deleted file mode 100644 index ab2f735be..000000000 --- a/src/video_core/texture_cache/decode_bc4.h +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include - -#include "common/common_types.h" -#include "video_core/texture_cache/types.h" - -namespace VideoCommon { - -void DecompressBC4(std::span data, Extent3D extent, std::span output); - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index f781cb7a0..9a618a57a 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -24,7 +24,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/surface.h" -#include "video_core/texture_cache/decode_bc4.h" +#include "video_core/texture_cache/decode_bc.h" #include "video_core/texture_cache/format_lookup_table.h" #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" @@ -61,8 +61,6 @@ using VideoCore::Surface::PixelFormatFromDepthFormat; using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using VideoCore::Surface::SurfaceType; -constexpr u32 CONVERTED_BYTES_PER_BLOCK = BytesPerBlock(PixelFormat::A8B8G8R8_UNORM); - struct LevelInfo { Extent3D size; Extent3D block; @@ -612,7 +610,8 @@ u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept { } return output_size; } - return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * CONVERTED_BYTES_PER_BLOCK; + return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * + ConvertedBytesPerBlock(info.format); } u32 CalculateLayerStride(const ImageInfo& info) noexcept { @@ -945,7 +944,8 @@ void ConvertImage(std::span input, const ImageInfo& info, std::span input, const ImageInfo& info, std::span input, const ImageInfo& info, std::span(copy.buffer_size); } else { - DecompressBC4(input_offset, copy.image_extent, output.subspan(output_offset)); - + const Extent3D image_extent{ + .width = copy.image_extent.width, + .height = copy.image_extent.height * copy.image_subresource.num_layers, + .depth = copy.image_extent.depth, + }; + DecompressBCn(input_offset, output.subspan(output_offset), image_extent, info.format); output_offset += copy.image_extent.width * copy.image_extent.height * - copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK; + copy.image_subresource.num_layers * + ConvertedBytesPerBlock(info.format); } } } diff --git a/src/video_core/textures/bcn.cpp b/src/video_core/textures/bcn.cpp index 671212a49..16ddbe320 100644 --- a/src/video_core/textures/bcn.cpp +++ b/src/video_core/textures/bcn.cpp @@ -3,7 +3,6 @@ #include #include - #include "common/alignment.h" #include "video_core/textures/bcn.h" #include "video_core/textures/workers.h" diff --git a/src/video_core/textures/bcn.h b/src/video_core/textures/bcn.h index 6464af885..d5d2a16c9 100644 --- a/src/video_core/textures/bcn.h +++ b/src/video_core/textures/bcn.h @@ -4,14 +4,13 @@ #pragma once #include -#include + +#include "common/common_types.h" namespace Tegra::Texture::BCN { -void CompressBC1(std::span data, uint32_t width, uint32_t height, uint32_t depth, - std::span output); +void CompressBC1(std::span data, u32 width, u32 height, u32 depth, std::span output); -void CompressBC3(std::span data, uint32_t width, uint32_t height, uint32_t depth, - std::span output); +void CompressBC3(std::span data, u32 width, u32 height, u32 depth, std::span output); } // namespace Tegra::Texture::BCN diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index e05d04db3..1f17265d5 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -293,6 +293,11 @@ public: return features.features.textureCompressionASTC_LDR; } + /// Returns true if BCn is natively supported. + bool IsOptimalBcnSupported() const { + return features.features.textureCompressionBC; + } + /// Returns true if descriptor aliasing is natively supported. bool IsDescriptorAliasingSupported() const { return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY; @@ -423,6 +428,11 @@ public: return extensions.sampler_filter_minmax; } + /// Returns true if the device supports VK_EXT_shader_stencil_export. + bool IsExtShaderStencilExportSupported() const { + return extensions.shader_stencil_export; + } + /// Returns true if the device supports VK_EXT_depth_range_unrestricted. bool IsExtDepthRangeUnrestrictedSupported() const { return extensions.depth_range_unrestricted; @@ -492,11 +502,6 @@ public: return extensions.vertex_input_dynamic_state; } - /// Returns true if the device supports VK_EXT_shader_stencil_export. - bool IsExtShaderStencilExportSupported() const { - return extensions.shader_stencil_export; - } - /// Returns true if the device supports VK_EXT_shader_demote_to_helper_invocation bool IsExtShaderDemoteToHelperInvocationSupported() const { return extensions.shader_demote_to_helper_invocation; -- cgit v1.2.3 From ddcc95833660c57647d3e99dad76ecfa3b86ee8d Mon Sep 17 00:00:00 2001 From: GPUCode Date: Thu, 15 Jun 2023 12:17:46 +0300 Subject: renderer_vulkan: Prevent crashes when blitting depth stencil --- src/video_core/renderer_vulkan/blit_image.cpp | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/video_core') diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index cf2964a3f..28d4b15a0 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -495,6 +495,9 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation) { + if (!device.IsExtShaderStencilExportSupported()) { + return; + } ASSERT(filter == Tegra::Engines::Fermi2D::Filter::Point); ASSERT(operation == Tegra::Engines::Fermi2D::Operation::SrcCopy); const BlitImagePipelineKey key{ -- cgit v1.2.3 From ed93cbd4621c6a0c0ffaa272951e1990732df18f Mon Sep 17 00:00:00 2001 From: Matías Locatti <42481638+goldenx86@users.noreply.github.com> Date: Wed, 28 Jun 2023 20:10:27 -0300 Subject: Blacklist EDS3 blending from new AMD drivers --- src/video_core/vulkan_common/vulkan_device.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src/video_core') diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 70436cf1c..421e71e5a 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -528,6 +528,14 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR } sets_per_pool = 64; + if (extensions.extended_dynamic_state3 && is_amd_driver && + properties.properties.driverVersion >= VK_MAKE_API_VERSION(0, 2, 0, 270)) { + LOG_WARNING(Render_Vulkan, + "AMD drivers after 23.5.2 have broken extendedDynamicState3ColorBlendEquation"); + features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false; + features.extended_dynamic_state3.extendedDynamicState3ColorBlendEquation = false; + dynamic_state3_blending = false; + } if (is_amd_driver) { // AMD drivers need a higher amount of Sets per Pool in certain circumstances like in XC2. sets_per_pool = 96; -- cgit v1.2.3 From b62121fd605663dc9aaaae72fe8e444312f9c5d5 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 29 Jun 2023 11:58:45 +0200 Subject: Texture cache: Fix YFC regression due to code testing --- src/video_core/texture_cache/texture_cache.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d3f03a995..485f6b6f3 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -598,14 +598,6 @@ void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { Image& image = slot_images[id]; - if (True(image.flags & ImageFlagBits::CpuModified)) { - return; - } - image.flags |= ImageFlagBits::CpuModified; - if (True(image.flags & ImageFlagBits::Tracked)) { - UntrackImage(image, id); - } - /* if (True(image.flags & ImageFlagBits::Remapped)) { continue; } @@ -613,7 +605,6 @@ void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz if (True(image.flags & ImageFlagBits::Tracked)) { UntrackImage(image, id); } - */ } } -- cgit v1.2.3 From 596a6132b974dd73935854d8f51842424e058be8 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 29 Jun 2023 17:23:29 +0200 Subject: AccelerateDMA: Don't accelerate 3D texture DMA operations --- src/video_core/texture_cache/texture_cache.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/video_core') diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d3f03a995..0330415b7 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -879,6 +879,10 @@ ImageId TextureCache

::DmaImageId(const Tegra::DMA::ImageOperand& operand, boo return NULL_IMAGE_ID; } auto& image = slot_images[image_id]; + if (image.info.type == ImageType::e3D) { + // Don't accelerate 3D images. + return NULL_IMAGE_ID; + } if (!is_upload && !image.info.dma_downloaded) { // Force a full sync. image.info.dma_downloaded = true; -- cgit v1.2.3