From 8d6aefdcc452b602d94a84d13bbbc15f806b689c Mon Sep 17 00:00:00 2001
From: Liam <byteslice@airmail.cc>
Date: Wed, 14 Jun 2023 14:11:46 -0400
Subject: video_core: optionally skip barriers on feedback loops

---
 src/video_core/texture_cache/texture_cache.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/video_core/texture_cache')
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index c7f7448e9..43b7ac0a6 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -186,6 +186,10 @@ void TextureCache<P>::FillComputeImageViews(std::span<ImageViewInOut> views) {
 
 template <class P>
 void TextureCache<P>::CheckFeedbackLoop(std::span<const ImageViewInOut> views) {
+    if (!Settings::values.barrier_feedback_loops.GetValue()) {
+        return;
+    }
+
     const bool requires_barrier = [&] {
         for (const auto& view : views) {
             if (!view.id) {
-- 
cgit v1.2.3


From 76a676883a17523fb12eeac6f2b9702e4916b2c2 Mon Sep 17 00:00:00 2001
From: FengChen <vonchenplus@gmail.com>
Date: Sat, 17 Jun 2023 23:26:39 +0800
Subject: video_core: add samples check when find render target

---
 src/video_core/texture_cache/texture_cache.h      | 22 ++++++++++------------
 src/video_core/texture_cache/texture_cache_base.h | 10 ++++------
 2 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'src/video_core/texture_cache')

diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index c7f7448e9..f11998e20 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -280,7 +280,7 @@ void TextureCache<P>::SynchronizeComputeDescriptors() {
 }
 
 template <class P>
-bool TextureCache<P>::RescaleRenderTargets(bool is_clear) {
+bool TextureCache<P>::RescaleRenderTargets() {
     auto& flags = maxwell3d->dirty.flags;
     u32 scale_rating = 0;
     bool rescaled = false;
@@ -318,13 +318,13 @@ bool TextureCache<P>::RescaleRenderTargets(bool is_clear) {
             ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index];
             if (flags[Dirty::ColorBuffer0 + index] || force) {
                 flags[Dirty::ColorBuffer0 + index] = false;
-                BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear));
+                BindRenderTarget(&color_buffer_id, FindColorBuffer(index));
             }
             check_rescale(color_buffer_id, tmp_color_images[index]);
         }
         if (flags[Dirty::ZetaBuffer] || force) {
             flags[Dirty::ZetaBuffer] = false;
-            BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear));
+            BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer());
         }
         check_rescale(render_targets.depth_buffer_id, tmp_depth_image);
 
@@ -389,7 +389,7 @@ void TextureCache<P>::UpdateRenderTargets(bool is_clear) {
         return;
     }
 
-    const bool rescaled = RescaleRenderTargets(is_clear);
+    const bool rescaled = RescaleRenderTargets();
     if (is_rescaling != rescaled) {
         flags[Dirty::RescaleViewports] = true;
         flags[Dirty::RescaleScissors] = true;
@@ -1658,7 +1658,7 @@ SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) {
 }
 
 template <class P>
-ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) {
+ImageViewId TextureCache<P>::FindColorBuffer(size_t index) {
     const auto& regs = maxwell3d->regs;
     if (index >= regs.rt_control.count) {
         return ImageViewId{};
@@ -1672,11 +1672,11 @@ ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) {
         return ImageViewId{};
     }
     const ImageInfo info(regs.rt[index], regs.anti_alias_samples_mode);
-    return FindRenderTargetView(info, gpu_addr, is_clear);
+    return FindRenderTargetView(info, gpu_addr);
 }
 
 template <class P>
-ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) {
+ImageViewId TextureCache<P>::FindDepthBuffer() {
     const auto& regs = maxwell3d->regs;
     if (!regs.zeta_enable) {
         return ImageViewId{};
@@ -1686,18 +1686,16 @@ ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) {
         return ImageViewId{};
     }
     const ImageInfo info(regs.zeta, regs.zeta_size, regs.anti_alias_samples_mode);
-    return FindRenderTargetView(info, gpu_addr, is_clear);
+    return FindRenderTargetView(info, gpu_addr);
 }
 
 template <class P>
-ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr,
-                                                  bool is_clear) {
-    const auto options = is_clear ? RelaxedOptions::Samples : RelaxedOptions{};
+ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr) {
     ImageId image_id{};
     bool delete_state = has_deleted_images;
     do {
         has_deleted_images = false;
-        image_id = FindOrInsertImage(info, gpu_addr, options);
+        image_id = FindOrInsertImage(info, gpu_addr);
         delete_state |= has_deleted_images;
     } while (has_deleted_images);
     has_deleted_images = delete_state;
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index 3bfa92154..c347eccd6 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -166,9 +166,8 @@ public:
     void SynchronizeComputeDescriptors();
 
     /// Updates the Render Targets if they can be rescaled
-    /// @param is_clear True when the render targets are being used for clears
     /// @retval True if the Render Targets have been rescaled.
-    bool RescaleRenderTargets(bool is_clear);
+    bool RescaleRenderTargets();
 
     /// Update bound render targets and upload memory if necessary
     /// @param is_clear True when the render targets are being used for clears
@@ -324,14 +323,13 @@ private:
     [[nodiscard]] SamplerId FindSampler(const TSCEntry& config);
 
     /// Find or create an image view for the given color buffer index
-    [[nodiscard]] ImageViewId FindColorBuffer(size_t index, bool is_clear);
+    [[nodiscard]] ImageViewId FindColorBuffer(size_t index);
 
     /// Find or create an image view for the depth buffer
-    [[nodiscard]] ImageViewId FindDepthBuffer(bool is_clear);
+    [[nodiscard]] ImageViewId FindDepthBuffer();
 
     /// Find or create a view for a render target with the given image parameters
-    [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr,
-                                                   bool is_clear);
+    [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr);
 
     /// Iterates over all the images in a region calling func
     template <typename Func>
-- 
cgit v1.2.3


From 5da70f719703084482933e103e561cc98163f370 Mon Sep 17 00:00:00 2001
From: Kelebek1 <eeeedddccc@hotmail.co.uk>
Date: Tue, 23 May 2023 14:45:54 +0100
Subject: Remove memory allocations in some hot paths

---
 src/audio_core/device/audio_buffers.h              |  8 +--
 src/audio_core/device/device_session.cpp           | 12 ++---
 src/audio_core/device/device_session.h             |  7 +--
 src/audio_core/in/audio_in_system.cpp              |  5 +-
 src/audio_core/out/audio_out_system.cpp            |  4 +-
 .../renderer/command/data_source/decode.cpp        | 23 ++++-----
 .../renderer/command/effect/compressor.cpp         |  8 +--
 src/audio_core/renderer/command/effect/delay.cpp   | 14 ++---
 .../renderer/command/effect/i3dl2_reverb.cpp       |  4 +-
 .../renderer/command/effect/light_limiter.cpp      | 12 ++---
 src/audio_core/renderer/command/effect/reverb.cpp  | 12 ++---
 .../renderer/command/sink/circular_buffer.cpp      |  4 +-
 src/audio_core/renderer/command/sink/device.cpp    |  5 +-
 src/audio_core/renderer/mix/mix_context.cpp        |  6 +--
 src/audio_core/renderer/nodes/node_states.cpp      |  4 +-
 src/audio_core/renderer/nodes/node_states.h        |  2 +-
 src/audio_core/renderer/system.cpp                 |  1 +
 src/audio_core/sink/null_sink.h                    |  2 +-
 src/audio_core/sink/sink_stream.cpp                | 15 +++---
 src/audio_core/sink/sink_stream.h                  |  5 +-
 src/common/ring_buffer.h                           |  3 +-
 src/common/scratch_buffer.h                        |  9 ++++
 src/core/hle/kernel/k_synchronization_object.cpp   |  3 +-
 src/core/hle/kernel/k_thread.cpp                   |  8 ++-
 src/core/hle/kernel/k_thread.h                     |  3 +-
 src/core/hle/kernel/svc/svc_ipc.cpp                |  7 +--
 src/core/hle/kernel/svc/svc_synchronization.cpp    | 10 ++--
 src/core/hle/kernel/svc/svc_thread.cpp             |  2 +-
 src/core/hle/service/audio/audin_u.cpp             | 16 +++---
 src/core/hle/service/audio/audout_u.cpp            | 15 ++----
 src/core/hle/service/audio/audren_u.cpp            | 22 ++++----
 src/core/hle/service/audio/audren_u.h              |  1 +
 src/core/hle/service/audio/hwopus.cpp              |  9 ++--
 src/core/hle/service/nvdrv/devices/nvdevice.h      |  6 +--
 .../hle/service/nvdrv/devices/nvdisp_disp0.cpp     |  6 +--
 src/core/hle/service/nvdrv/devices/nvdisp_disp0.h  |  8 +--
 .../hle/service/nvdrv/devices/nvhost_as_gpu.cpp    | 31 ++++++------
 src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h | 30 ++++++-----
 src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp | 19 ++++---
 src/core/hle/service/nvdrv/devices/nvhost_ctrl.h   | 21 ++++----
 .../hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp  | 32 ++++++------
 .../hle/service/nvdrv/devices/nvhost_ctrl_gpu.h    | 38 +++++++-------
 src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp  | 59 +++++++++++-----------
 src/core/hle/service/nvdrv/devices/nvhost_gpu.h    | 36 ++++++-------
 .../hle/service/nvdrv/devices/nvhost_nvdec.cpp     |  6 +--
 src/core/hle/service/nvdrv/devices/nvhost_nvdec.h  |  8 +--
 .../service/nvdrv/devices/nvhost_nvdec_common.cpp  | 15 +++---
 .../service/nvdrv/devices/nvhost_nvdec_common.h    | 12 ++---
 .../hle/service/nvdrv/devices/nvhost_nvjpg.cpp     |  8 +--
 src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h  | 10 ++--
 src/core/hle/service/nvdrv/devices/nvhost_vic.cpp  |  6 +--
 src/core/hle/service/nvdrv/devices/nvhost_vic.h    |  8 +--
 src/core/hle/service/nvdrv/devices/nvmap.cpp       | 20 ++++----
 src/core/hle/service/nvdrv/devices/nvmap.h         | 20 ++++----
 src/core/hle/service/nvdrv/nvdrv.cpp               |  8 +--
 src/core/hle/service/nvdrv/nvdrv.h                 |  8 +--
 src/core/hle/service/nvdrv/nvdrv_interface.cpp     | 24 ++++-----
 src/core/hle/service/nvdrv/nvdrv_interface.h       |  3 ++
 src/core/hle/service/nvnflinger/parcel.h           |  7 +--
 .../backend/glsl/glsl_emit_context.cpp             |  2 +-
 src/shader_recompiler/backend/spirv/emit_spirv.cpp |  2 +-
 .../backend/spirv/spirv_emit_context.cpp           |  2 +-
 src/shader_recompiler/runtime_info.h               |  3 +-
 src/video_core/buffer_cache/buffer_cache.h         |  4 +-
 src/video_core/buffer_cache/buffer_cache_base.h    |  4 +-
 src/video_core/cdma_pusher.h                       |  1 -
 src/video_core/dma_pusher.h                        |  8 +--
 src/video_core/engines/maxwell_dma.cpp             | 35 +++++++------
 src/video_core/host1x/codecs/h264.cpp              |  4 +-
 src/video_core/memory_manager.cpp                  | 13 ++---
 src/video_core/memory_manager.h                    | 15 ++++--
 src/video_core/renderer_opengl/gl_shader_cache.cpp |  4 +-
 src/video_core/renderer_vulkan/vk_buffer_cache.cpp |  2 +-
 .../renderer_vulkan/vk_pipeline_cache.cpp          | 10 +++-
 .../renderer_vulkan/vk_texture_cache.cpp           | 27 +++++-----
 src/video_core/shader_cache.cpp                    |  4 +-
 src/video_core/texture_cache/image_base.h          |  5 +-
 src/video_core/texture_cache/texture_cache.h       | 14 ++---
 src/video_core/texture_cache/texture_cache_base.h  |  4 +-
 src/video_core/texture_cache/util.cpp              | 48 ++++++++++--------
 src/video_core/texture_cache/util.h                | 31 ++++++------
 src/video_core/transform_feedback.cpp              |  8 +--
 src/video_core/transform_feedback.h                |  2 +-
 src/video_core/vulkan_common/vulkan_device.cpp     |  1 +
 84 files changed, 503 insertions(+), 460 deletions(-)

(limited to 'src/video_core/texture_cache')

diff --git a/src/audio_core/device/audio_buffers.h b/src/audio_core/device/audio_buffers.h
index 15082f6c6..5d8ed0ef7 100644
--- a/src/audio_core/device/audio_buffers.h
+++ b/src/audio_core/device/audio_buffers.h
@@ -7,6 +7,7 @@
 #include <mutex>
 #include <span>
 #include <vector>
+#include <boost/container/static_vector.hpp>
 
 #include "audio_buffer.h"
 #include "audio_core/device/device_session.h"
@@ -48,7 +49,7 @@ public:
      *
      * @param out_buffers - The buffers which were registered.
      */
-    void RegisterBuffers(std::vector<AudioBuffer>& out_buffers) {
+    void RegisterBuffers(boost::container::static_vector<AudioBuffer, N>& out_buffers) {
         std::scoped_lock l{lock};
         const s32 to_register{std::min(std::min(appended_count, BufferAppendLimit),
                                        BufferAppendLimit - registered_count)};
@@ -162,7 +163,8 @@ public:
      * @param max_buffers     - Maximum number of buffers to released.
      * @return The number of buffers released.
      */
-    u32 GetRegisteredAppendedBuffers(std::vector<AudioBuffer>& buffers_flushed, u32 max_buffers) {
+    u32 GetRegisteredAppendedBuffers(
+        boost::container::static_vector<AudioBuffer, N>& buffers_flushed, u32 max_buffers) {
         std::scoped_lock l{lock};
         if (registered_count + appended_count == 0) {
             return 0;
@@ -270,7 +272,7 @@ public:
      */
     bool FlushBuffers(u32& buffers_released) {
         std::scoped_lock l{lock};
-        std::vector<AudioBuffer> buffers_flushed{};
+        boost::container::static_vector<AudioBuffer, N> buffers_flushed{};
 
         buffers_released = GetRegisteredAppendedBuffers(buffers_flushed, append_limit);
 
diff --git a/src/audio_core/device/device_session.cpp b/src/audio_core/device/device_session.cpp
index b5c0ef0e6..86811fcb8 100644
--- a/src/audio_core/device/device_session.cpp
+++ b/src/audio_core/device/device_session.cpp
@@ -79,7 +79,7 @@ void DeviceSession::ClearBuffers() {
     }
 }
 
-void DeviceSession::AppendBuffers(std::span<const AudioBuffer> buffers) const {
+void DeviceSession::AppendBuffers(std::span<const AudioBuffer> buffers) {
     for (const auto& buffer : buffers) {
         Sink::SinkBuffer new_buffer{
             .frames = buffer.size / (channel_count * sizeof(s16)),
@@ -88,13 +88,13 @@ void DeviceSession::AppendBuffers(std::span<const AudioBuffer> buffers) const {
             .consumed = false,
         };
 
+        tmp_samples.resize_destructive(buffer.size / sizeof(s16));
         if (type == Sink::StreamType::In) {
-            std::vector<s16> samples{};
-            stream->AppendBuffer(new_buffer, samples);
+            stream->AppendBuffer(new_buffer, tmp_samples);
         } else {
-            std::vector<s16> samples(buffer.size / sizeof(s16));
-            system.ApplicationMemory().ReadBlockUnsafe(buffer.samples, samples.data(), buffer.size);
-            stream->AppendBuffer(new_buffer, samples);
+            system.ApplicationMemory().ReadBlockUnsafe(buffer.samples, tmp_samples.data(),
+                                                       buffer.size);
+            stream->AppendBuffer(new_buffer, tmp_samples);
         }
     }
 }
diff --git a/src/audio_core/device/device_session.h b/src/audio_core/device/device_session.h
index 75f766c68..7d52f362d 100644
--- a/src/audio_core/device/device_session.h
+++ b/src/audio_core/device/device_session.h
@@ -10,6 +10,7 @@
 
 #include "audio_core/common/common.h"
 #include "audio_core/sink/sink.h"
+#include "common/scratch_buffer.h"
 #include "core/hle/service/audio/errors.h"
 
 namespace Core {
@@ -62,7 +63,7 @@ public:
      *
      * @param buffers - The buffers to play.
      */
-    void AppendBuffers(std::span<const AudioBuffer> buffers) const;
+    void AppendBuffers(std::span<const AudioBuffer> buffers);
 
     /**
      * (Audio In only) Pop samples from the backend, and write them back to this buffer's address.
@@ -146,8 +147,8 @@ private:
     std::shared_ptr<Core::Timing::EventType> thread_event;
     /// Is this session initialised?
     bool initialized{};
-    /// Buffer queue
-    std::vector<AudioBuffer> buffer_queue{};
+    /// Temporary sample buffer
+    Common::ScratchBuffer<s16> tmp_samples{};
 };
 
 } // namespace AudioCore
diff --git a/src/audio_core/in/audio_in_system.cpp b/src/audio_core/in/audio_in_system.cpp
index e23e51758..579129121 100644
--- a/src/audio_core/in/audio_in_system.cpp
+++ b/src/audio_core/in/audio_in_system.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <mutex>
+
 #include "audio_core/audio_event.h"
 #include "audio_core/audio_manager.h"
 #include "audio_core/in/audio_in_system.h"
@@ -89,7 +90,7 @@ Result System::Start() {
     session->Start();
     state = State::Started;
 
-    std::vector<AudioBuffer> buffers_to_flush{};
+    boost::container::static_vector<AudioBuffer, BufferCount> buffers_to_flush{};
     buffers.RegisterBuffers(buffers_to_flush);
     session->AppendBuffers(buffers_to_flush);
     session->SetRingSize(static_cast<u32>(buffers_to_flush.size()));
@@ -134,7 +135,7 @@ bool System::AppendBuffer(const AudioInBuffer& buffer, const u64 tag) {
 
 void System::RegisterBuffers() {
     if (state == State::Started) {
-        std::vector<AudioBuffer> registered_buffers{};
+        boost::container::static_vector<AudioBuffer, BufferCount> registered_buffers{};
         buffers.RegisterBuffers(registered_buffers);
         session->AppendBuffers(registered_buffers);
     }
diff --git a/src/audio_core/out/audio_out_system.cpp b/src/audio_core/out/audio_out_system.cpp
index bd13f7219..0adf64bd3 100644
--- a/src/audio_core/out/audio_out_system.cpp
+++ b/src/audio_core/out/audio_out_system.cpp
@@ -89,7 +89,7 @@ Result System::Start() {
     session->Start();
     state = State::Started;
 
-    std::vector<AudioBuffer> buffers_to_flush{};
+    boost::container::static_vector<AudioBuffer, BufferCount> buffers_to_flush{};
     buffers.RegisterBuffers(buffers_to_flush);
     session->AppendBuffers(buffers_to_flush);
     session->SetRingSize(static_cast<u32>(buffers_to_flush.size()));
@@ -134,7 +134,7 @@ bool System::AppendBuffer(const AudioOutBuffer& buffer, u64 tag) {
 
 void System::RegisterBuffers() {
     if (state == State::Started) {
-        std::vector<AudioBuffer> registered_buffers{};
+        boost::container::static_vector<AudioBuffer, BufferCount> registered_buffers{};
         buffers.RegisterBuffers(registered_buffers);
         session->AppendBuffers(registered_buffers);
     }
diff --git a/src/audio_core/renderer/command/data_source/decode.cpp b/src/audio_core/renderer/command/data_source/decode.cpp
index ff5d31bd6..f45933203 100644
--- a/src/audio_core/renderer/command/data_source/decode.cpp
+++ b/src/audio_core/renderer/command/data_source/decode.cpp
@@ -8,6 +8,7 @@
 #include "audio_core/renderer/command/resample/resample.h"
 #include "common/fixed_point.h"
 #include "common/logging/log.h"
+#include "common/scratch_buffer.h"
 #include "core/memory.h"
 
 namespace AudioCore::AudioRenderer {
@@ -27,6 +28,7 @@ constexpr std::array<u8, 3> PitchBySrcQuality = {4, 8, 4};
 template <typename T>
 static u32 DecodePcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
                      const DecodeArg& req) {
+    std::array<T, TempBufferSize> tmp_samples{};
     constexpr s32 min{std::numeric_limits<s16>::min()};
     constexpr s32 max{std::numeric_limits<s16>::max()};
 
@@ -49,18 +51,17 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
         const u64 size{channel_count * samples_to_decode};
         const u64 size_bytes{size * sizeof(T)};
 
-        std::vector<T> samples(size);
-        memory.ReadBlockUnsafe(source, samples.data(), size_bytes);
+        memory.ReadBlockUnsafe(source, tmp_samples.data(), size_bytes);
 
         if constexpr (std::is_floating_point_v<T>) {
             for (u32 i = 0; i < samples_to_decode; i++) {
-                auto sample{static_cast<s32>(samples[i * channel_count + req.target_channel] *
+                auto sample{static_cast<s32>(tmp_samples[i * channel_count + req.target_channel] *
                                              std::numeric_limits<s16>::max())};
                 out_buffer[i] = static_cast<s16>(std::clamp(sample, min, max));
             }
         } else {
             for (u32 i = 0; i < samples_to_decode; i++) {
-                out_buffer[i] = samples[i * channel_count + req.target_channel];
+                out_buffer[i] = tmp_samples[i * channel_count + req.target_channel];
             }
         }
     } break;
@@ -73,17 +74,16 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
         }
 
         const VAddr source{req.buffer + ((req.start_offset + req.offset) * sizeof(T))};
-        std::vector<T> samples(samples_to_decode);
-        memory.ReadBlockUnsafe(source, samples.data(), samples_to_decode * sizeof(T));
+        memory.ReadBlockUnsafe(source, tmp_samples.data(), samples_to_decode * sizeof(T));
 
         if constexpr (std::is_floating_point_v<T>) {
             for (u32 i = 0; i < samples_to_decode; i++) {
-                auto sample{static_cast<s32>(samples[i * channel_count + req.target_channel] *
+                auto sample{static_cast<s32>(tmp_samples[i * channel_count + req.target_channel] *
                                              std::numeric_limits<s16>::max())};
                 out_buffer[i] = static_cast<s16>(std::clamp(sample, min, max));
             }
         } else {
-            std::memcpy(out_buffer.data(), samples.data(), samples_to_decode * sizeof(s16));
+            std::memcpy(out_buffer.data(), tmp_samples.data(), samples_to_decode * sizeof(s16));
         }
         break;
     }
@@ -101,6 +101,7 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
  */
 static u32 DecodeAdpcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
                        const DecodeArg& req) {
+    std::array<u8, TempBufferSize> wavebuffer{};
     constexpr u32 SamplesPerFrame{14};
     constexpr u32 NibblesPerFrame{16};
 
@@ -138,9 +139,7 @@ static u32 DecodeAdpcm(Core::Memory::Memory& memory, std::span<s16> out_buffer,
     }
 
     const auto size{std::max((samples_to_process / 8U) * SamplesPerFrame, 8U)};
-    std::vector<u8> wavebuffer(size);
-    memory.ReadBlockUnsafe(req.buffer + position_in_frame / 2, wavebuffer.data(),
-                           wavebuffer.size());
+    memory.ReadBlockUnsafe(req.buffer + position_in_frame / 2, wavebuffer.data(), size);
 
     auto context{req.adpcm_context};
     auto header{context->header};
@@ -258,7 +257,7 @@ void DecodeFromWaveBuffers(Core::Memory::Memory& memory, const DecodeFromWaveBuf
     u32 offset{voice_state.offset};
 
     auto output_buffer{args.output};
-    std::vector<s16> temp_buffer(TempBufferSize, 0);
+    std::array<s16, TempBufferSize> temp_buffer{};
 
     while (remaining_sample_count > 0) {
         const auto samples_to_write{std::min(remaining_sample_count, max_remaining_sample_count)};
diff --git a/src/audio_core/renderer/command/effect/compressor.cpp b/src/audio_core/renderer/command/effect/compressor.cpp
index 7229618e8..ee9b68d5b 100644
--- a/src/audio_core/renderer/command/effect/compressor.cpp
+++ b/src/audio_core/renderer/command/effect/compressor.cpp
@@ -44,8 +44,8 @@ static void InitializeCompressorEffect(const CompressorInfo::ParameterVersion2&
 
 static void ApplyCompressorEffect(const CompressorInfo::ParameterVersion2& params,
                                   CompressorInfo::State& state, bool enabled,
-                                  std::vector<std::span<const s32>> input_buffers,
-                                  std::vector<std::span<s32>> output_buffers, u32 sample_count) {
+                                  std::span<std::span<const s32>> input_buffers,
+                                  std::span<std::span<s32>> output_buffers, u32 sample_count) {
     if (enabled) {
         auto state_00{state.unk_00};
         auto state_04{state.unk_04};
@@ -124,8 +124,8 @@ void CompressorCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor&
 }
 
 void CompressorCommand::Process(const ADSP::CommandListProcessor& processor) {
-    std::vector<std::span<const s32>> input_buffers(parameter.channel_count);
-    std::vector<std::span<s32>> output_buffers(parameter.channel_count);
+    std::array<std::span<const s32>, MaxChannels> input_buffers{};
+    std::array<std::span<s32>, MaxChannels> output_buffers{};
 
     for (s16 i = 0; i < parameter.channel_count; i++) {
         input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count,
diff --git a/src/audio_core/renderer/command/effect/delay.cpp b/src/audio_core/renderer/command/effect/delay.cpp
index a4e408d40..e536cbb1e 100644
--- a/src/audio_core/renderer/command/effect/delay.cpp
+++ b/src/audio_core/renderer/command/effect/delay.cpp
@@ -51,7 +51,7 @@ static void InitializeDelayEffect(const DelayInfo::ParameterVersion1& params,
         state.delay_lines[channel].sample_count_max = sample_count_max.to_int_floor();
         state.delay_lines[channel].sample_count = sample_count.to_int_floor();
         state.delay_lines[channel].buffer.resize(state.delay_lines[channel].sample_count, 0);
-        if (state.delay_lines[channel].buffer.size() == 0) {
+        if (state.delay_lines[channel].sample_count == 0) {
             state.delay_lines[channel].buffer.push_back(0);
         }
         state.delay_lines[channel].buffer_pos = 0;
@@ -74,8 +74,8 @@ static void InitializeDelayEffect(const DelayInfo::ParameterVersion1& params,
  */
 template <size_t NumChannels>
 static void ApplyDelay(const DelayInfo::ParameterVersion1& params, DelayInfo::State& state,
-                       std::vector<std::span<const s32>>& inputs,
-                       std::vector<std::span<s32>>& outputs, const u32 sample_count) {
+                       std::span<std::span<const s32>> inputs, std::span<std::span<s32>> outputs,
+                       const u32 sample_count) {
     for (u32 sample_index = 0; sample_index < sample_count; sample_index++) {
         std::array<Common::FixedPoint<50, 14>, NumChannels> input_samples{};
         for (u32 channel = 0; channel < NumChannels; channel++) {
@@ -153,8 +153,8 @@ static void ApplyDelay(const DelayInfo::ParameterVersion1& params, DelayInfo::St
  * @param sample_count - Number of samples to process.
  */
 static void ApplyDelayEffect(const DelayInfo::ParameterVersion1& params, DelayInfo::State& state,
-                             const bool enabled, std::vector<std::span<const s32>>& inputs,
-                             std::vector<std::span<s32>>& outputs, const u32 sample_count) {
+                             const bool enabled, std::span<std::span<const s32>> inputs,
+                             std::span<std::span<s32>> outputs, const u32 sample_count) {
 
     if (!IsChannelCountValid(params.channel_count)) {
         LOG_ERROR(Service_Audio, "Invalid delay channels {}", params.channel_count);
@@ -208,8 +208,8 @@ void DelayCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& proce
 }
 
 void DelayCommand::Process(const ADSP::CommandListProcessor& processor) {
-    std::vector<std::span<const s32>> input_buffers(parameter.channel_count);
-    std::vector<std::span<s32>> output_buffers(parameter.channel_count);
+    std::array<std::span<const s32>, MaxChannels> input_buffers{};
+    std::array<std::span<s32>, MaxChannels> output_buffers{};
 
     for (s16 i = 0; i < parameter.channel_count; i++) {
         input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count,
diff --git a/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp b/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp
index 27d8b9844..d2bfb67cc 100644
--- a/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp
+++ b/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp
@@ -408,8 +408,8 @@ void I3dl2ReverbCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor&
 }
 
 void I3dl2ReverbCommand::Process(const ADSP::CommandListProcessor& processor) {
-    std::vector<std::span<const s32>> input_buffers(parameter.channel_count);
-    std::vector<std::span<s32>> output_buffers(parameter.channel_count);
+    std::array<std::span<const s32>, MaxChannels> input_buffers{};
+    std::array<std::span<s32>, MaxChannels> output_buffers{};
 
     for (u32 i = 0; i < parameter.channel_count; i++) {
         input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count,
diff --git a/src/audio_core/renderer/command/effect/light_limiter.cpp b/src/audio_core/renderer/command/effect/light_limiter.cpp
index e8fb0e2fc..4161a9821 100644
--- a/src/audio_core/renderer/command/effect/light_limiter.cpp
+++ b/src/audio_core/renderer/command/effect/light_limiter.cpp
@@ -47,8 +47,8 @@ static void InitializeLightLimiterEffect(const LightLimiterInfo::ParameterVersio
  */
 static void ApplyLightLimiterEffect(const LightLimiterInfo::ParameterVersion2& params,
                                     LightLimiterInfo::State& state, const bool enabled,
-                                    std::vector<std::span<const s32>>& inputs,
-                                    std::vector<std::span<s32>>& outputs, const u32 sample_count,
+                                    std::span<std::span<const s32>> inputs,
+                                    std::span<std::span<s32>> outputs, const u32 sample_count,
                                     LightLimiterInfo::StatisticsInternal* statistics) {
     constexpr s64 min{std::numeric_limits<s32>::min()};
     constexpr s64 max{std::numeric_limits<s32>::max()};
@@ -147,8 +147,8 @@ void LightLimiterVersion1Command::Dump([[maybe_unused]] const ADSP::CommandListP
 }
 
 void LightLimiterVersion1Command::Process(const ADSP::CommandListProcessor& processor) {
-    std::vector<std::span<const s32>> input_buffers(parameter.channel_count);
-    std::vector<std::span<s32>> output_buffers(parameter.channel_count);
+    std::array<std::span<const s32>, MaxChannels> input_buffers{};
+    std::array<std::span<s32>, MaxChannels> output_buffers{};
 
     for (u32 i = 0; i < parameter.channel_count; i++) {
         input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count,
@@ -190,8 +190,8 @@ void LightLimiterVersion2Command::Dump([[maybe_unused]] const ADSP::CommandListP
 }
 
 void LightLimiterVersion2Command::Process(const ADSP::CommandListProcessor& processor) {
-    std::vector<std::span<const s32>> input_buffers(parameter.channel_count);
-    std::vector<std::span<s32>> output_buffers(parameter.channel_count);
+    std::array<std::span<const s32>, MaxChannels> input_buffers{};
+    std::array<std::span<s32>, MaxChannels> output_buffers{};
 
     for (u32 i = 0; i < parameter.channel_count; i++) {
         input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count,
diff --git a/src/audio_core/renderer/command/effect/reverb.cpp b/src/audio_core/renderer/command/effect/reverb.cpp
index 8b9b65214..fc2f15a5e 100644
--- a/src/audio_core/renderer/command/effect/reverb.cpp
+++ b/src/audio_core/renderer/command/effect/reverb.cpp
@@ -250,8 +250,8 @@ static Common::FixedPoint<50, 14> Axfx2AllPassTick(ReverbInfo::ReverbDelayLine&
  */
 template <size_t NumChannels>
 static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, ReverbInfo::State& state,
-                              std::vector<std::span<const s32>>& inputs,
-                              std::vector<std::span<s32>>& outputs, const u32 sample_count) {
+                              std::span<std::span<const s32>> inputs,
+                              std::span<std::span<s32>> outputs, const u32 sample_count) {
     static constexpr std::array<u8, ReverbInfo::MaxDelayTaps> OutTapIndexes1Ch{
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     };
@@ -369,8 +369,8 @@ static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, Rever
  * @param sample_count - Number of samples to process.
  */
 static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, ReverbInfo::State& state,
-                              const bool enabled, std::vector<std::span<const s32>>& inputs,
-                              std::vector<std::span<s32>>& outputs, const u32 sample_count) {
+                              const bool enabled, std::span<std::span<const s32>> inputs,
+                              std::span<std::span<s32>> outputs, const u32 sample_count) {
     if (enabled) {
         switch (params.channel_count) {
         case 0:
@@ -412,8 +412,8 @@ void ReverbCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& proc
 }
 
 void ReverbCommand::Process(const ADSP::CommandListProcessor& processor) {
-    std::vector<std::span<const s32>> input_buffers(parameter.channel_count);
-    std::vector<std::span<s32>> output_buffers(parameter.channel_count);
+    std::array<std::span<const s32>, MaxChannels> input_buffers{};
+    std::array<std::span<s32>, MaxChannels> output_buffers{};
 
     for (u32 i = 0; i < parameter.channel_count; i++) {
         input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count,
diff --git a/src/audio_core/renderer/command/sink/circular_buffer.cpp b/src/audio_core/renderer/command/sink/circular_buffer.cpp
index ded5afc94..e2ce59792 100644
--- a/src/audio_core/renderer/command/sink/circular_buffer.cpp
+++ b/src/audio_core/renderer/command/sink/circular_buffer.cpp
@@ -24,7 +24,7 @@ void CircularBufferSinkCommand::Process(const ADSP::CommandListProcessor& proces
     constexpr s32 min{std::numeric_limits<s16>::min()};
     constexpr s32 max{std::numeric_limits<s16>::max()};
 
-    std::vector<s16> output(processor.sample_count);
+    std::array<s16, TargetSampleCount * MaxChannels> output{};
     for (u32 channel = 0; channel < input_count; channel++) {
         auto input{processor.mix_buffers.subspan(inputs[channel] * processor.sample_count,
                                                  processor.sample_count)};
@@ -33,7 +33,7 @@ void CircularBufferSinkCommand::Process(const ADSP::CommandListProcessor& proces
         }
 
         processor.memory->WriteBlockUnsafe(address + pos, output.data(),
-                                           output.size() * sizeof(s16));
+                                           processor.sample_count * sizeof(s16));
         pos += static_cast<u32>(processor.sample_count * sizeof(s16));
         if (pos >= size) {
             pos = 0;
diff --git a/src/audio_core/renderer/command/sink/device.cpp b/src/audio_core/renderer/command/sink/device.cpp
index e88372a75..5f74dd7ad 100644
--- a/src/audio_core/renderer/command/sink/device.cpp
+++ b/src/audio_core/renderer/command/sink/device.cpp
@@ -33,8 +33,7 @@ void DeviceSinkCommand::Process(const ADSP::CommandListProcessor& processor) {
         .consumed{false},
     };
 
-    std::vector<s16> samples(out_buffer.frames * input_count);
-
+    std::array<s16, TargetSampleCount * MaxChannels> samples{};
     for (u32 channel = 0; channel < input_count; channel++) {
         const auto offset{inputs[channel] * out_buffer.frames};
 
@@ -45,7 +44,7 @@ void DeviceSinkCommand::Process(const ADSP::CommandListProcessor& processor) {
     }
 
     out_buffer.tag = reinterpret_cast<u64>(samples.data());
-    stream->AppendBuffer(out_buffer, samples);
+    stream->AppendBuffer(out_buffer, {samples.data(), out_buffer.frames * input_count});
 
     if (stream->IsPaused()) {
         stream->Start();
diff --git a/src/audio_core/renderer/mix/mix_context.cpp b/src/audio_core/renderer/mix/mix_context.cpp
index 35b748ede..3a18ae7c2 100644
--- a/src/audio_core/renderer/mix/mix_context.cpp
+++ b/src/audio_core/renderer/mix/mix_context.cpp
@@ -125,10 +125,10 @@ bool MixContext::TSortInfo(const SplitterContext& splitter_context) {
         return false;
     }
 
-    std::vector<s32> sorted_results{node_states.GetSortedResuls()};
-    const auto result_size{std::min(count, static_cast<s32>(sorted_results.size()))};
+    auto sorted_results{node_states.GetSortedResuls()};
+    const auto result_size{std::min(count, static_cast<s32>(sorted_results.second))};
     for (s32 i = 0; i < result_size; i++) {
-        sorted_mix_infos[i] = &mix_infos[sorted_results[i]];
+        sorted_mix_infos[i] = &mix_infos[sorted_results.first[i]];
     }
 
     CalcMixBufferOffset();
diff --git a/src/audio_core/renderer/nodes/node_states.cpp b/src/audio_core/renderer/nodes/node_states.cpp
index 1821a51e6..b7a44a54c 100644
--- a/src/audio_core/renderer/nodes/node_states.cpp
+++ b/src/audio_core/renderer/nodes/node_states.cpp
@@ -134,8 +134,8 @@ u32 NodeStates::GetNodeCount() const {
     return node_count;
 }
 
-std::vector<s32> NodeStates::GetSortedResuls() const {
-    return {results.rbegin(), results.rbegin() + result_pos};
+std::pair<std::span<u32>::reverse_iterator, size_t> NodeStates::GetSortedResuls() const {
+    return {results.rbegin(), result_pos};
 }
 
 } // namespace AudioCore::AudioRenderer
diff --git a/src/audio_core/renderer/nodes/node_states.h b/src/audio_core/renderer/nodes/node_states.h
index 94b1d1254..e768cd4b5 100644
--- a/src/audio_core/renderer/nodes/node_states.h
+++ b/src/audio_core/renderer/nodes/node_states.h
@@ -175,7 +175,7 @@ public:
      *
      * @return Vector of nodes in reverse order.
      */
-    std::vector<s32> GetSortedResuls() const;
+    std::pair<std::span<u32>::reverse_iterator, size_t> GetSortedResuls() const;
 
 private:
     /// Number of nodes in the graph
diff --git a/src/audio_core/renderer/system.cpp b/src/audio_core/renderer/system.cpp
index 53b258c4f..a23627472 100644
--- a/src/audio_core/renderer/system.cpp
+++ b/src/audio_core/renderer/system.cpp
@@ -444,6 +444,7 @@ Result System::Update(std::span<const u8> input, std::span<u8> performance, std:
     std::scoped_lock l{lock};
 
     const auto start_time{core.CoreTiming().GetClockTicks()};
+    std::memset(output.data(), 0, output.size());
 
     InfoUpdater info_updater(input, output, process_handle, behavior);
 
diff --git a/src/audio_core/sink/null_sink.h b/src/audio_core/sink/null_sink.h
index 1215d3cd2..b6b43c93e 100644
--- a/src/audio_core/sink/null_sink.h
+++ b/src/audio_core/sink/null_sink.h
@@ -20,7 +20,7 @@ public:
     explicit NullSinkStreamImpl(Core::System& system_, StreamType type_)
         : SinkStream{system_, type_} {}
     ~NullSinkStreamImpl() override {}
-    void AppendBuffer(SinkBuffer&, std::vector<s16>&) override {}
+    void AppendBuffer(SinkBuffer&, std::span<s16>) override {}
     std::vector<s16> ReleaseBuffer(u64) override {
         return {};
     }
diff --git a/src/audio_core/sink/sink_stream.cpp b/src/audio_core/sink/sink_stream.cpp
index 9a718a9cc..404dcd0e9 100644
--- a/src/audio_core/sink/sink_stream.cpp
+++ b/src/audio_core/sink/sink_stream.cpp
@@ -18,7 +18,7 @@
 
 namespace AudioCore::Sink {
 
-void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector<s16>& samples) {
+void SinkStream::AppendBuffer(SinkBuffer& buffer, std::span<s16> samples) {
     if (type == StreamType::In) {
         queue.enqueue(buffer);
         queued_buffers++;
@@ -66,15 +66,16 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector<s16>& samples) {
                 static_cast<s16>(std::clamp(right_sample, min, max));
         }
 
-        samples.resize(samples.size() / system_channels * device_channels);
+        samples = samples.subspan(0, samples.size() / system_channels * device_channels);
 
     } else if (system_channels == 2 && device_channels == 6) {
         // We need moar samples! Not all games will provide 6 channel audio.
         // TODO: Implement some upmixing here. Currently just passthrough, with other
         // channels left as silence.
-        std::vector<s16> new_samples(samples.size() / system_channels * device_channels, 0);
+        auto new_size = samples.size() / system_channels * device_channels;
+        tmp_samples.resize_destructive(new_size);
 
-        for (u32 read_index = 0, write_index = 0; read_index < samples.size();
+        for (u32 read_index = 0, write_index = 0; read_index < new_size;
              read_index += system_channels, write_index += device_channels) {
             const auto left_sample{static_cast<s16>(std::clamp(
                 static_cast<s32>(
@@ -82,7 +83,7 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector<s16>& samples) {
                     volume),
                 min, max))};
 
-            new_samples[write_index + static_cast<u32>(Channels::FrontLeft)] = left_sample;
+            tmp_samples[write_index + static_cast<u32>(Channels::FrontLeft)] = left_sample;
 
             const auto right_sample{static_cast<s16>(std::clamp(
                 static_cast<s32>(
@@ -90,9 +91,9 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector<s16>& samples) {
                     volume),
                 min, max))};
 
-            new_samples[write_index + static_cast<u32>(Channels::FrontRight)] = right_sample;
+            tmp_samples[write_index + static_cast<u32>(Channels::FrontRight)] = right_sample;
         }
-        samples = std::move(new_samples);
+        samples = std::span<s16>(tmp_samples);
 
     } else if (volume != 1.0f) {
         for (u32 i = 0; i < samples.size(); i++) {
diff --git a/src/audio_core/sink/sink_stream.h b/src/audio_core/sink/sink_stream.h
index 41cbadc9c..98d72ace1 100644
--- a/src/audio_core/sink/sink_stream.h
+++ b/src/audio_core/sink/sink_stream.h
@@ -16,6 +16,7 @@
 #include "common/polyfill_thread.h"
 #include "common/reader_writer_queue.h"
 #include "common/ring_buffer.h"
+#include "common/scratch_buffer.h"
 #include "common/thread.h"
 
 namespace Core {
@@ -170,7 +171,7 @@ public:
      * @param buffer  - Audio buffer information to be queued.
      * @param samples - The s16 samples to be queue for playback.
      */
-    virtual void AppendBuffer(SinkBuffer& buffer, std::vector<s16>& samples);
+    virtual void AppendBuffer(SinkBuffer& buffer, std::span<s16> samples);
 
     /**
      * Release a buffer. Audio In only, will fill a buffer with recorded samples.
@@ -255,6 +256,8 @@ private:
     /// Signalled when ring buffer entries are consumed
     std::condition_variable_any release_cv;
     std::mutex release_mutex;
+    /// Temporary buffer for appending samples when upmixing
+    Common::ScratchBuffer<s16> tmp_samples{};
 };
 
 using SinkStreamPtr = std::unique_ptr<SinkStream>;
diff --git a/src/common/ring_buffer.h b/src/common/ring_buffer.h
index 4c328ab44..416680d44 100644
--- a/src/common/ring_buffer.h
+++ b/src/common/ring_buffer.h
@@ -9,6 +9,7 @@
 #include <cstddef>
 #include <cstring>
 #include <new>
+#include <span>
 #include <type_traits>
 #include <vector>
 
@@ -53,7 +54,7 @@ public:
         return push_count;
     }
 
-    std::size_t Push(const std::vector<T>& input) {
+    std::size_t Push(const std::span<T> input) {
         return Push(input.data(), input.size());
     }
 
diff --git a/src/common/scratch_buffer.h b/src/common/scratch_buffer.h
index a69a5a7af..6fe907953 100644
--- a/src/common/scratch_buffer.h
+++ b/src/common/scratch_buffer.h
@@ -3,6 +3,9 @@
 
 #pragma once
 
+#include <iterator>
+
+#include "common/concepts.h"
 #include "common/make_unique_for_overwrite.h"
 
 namespace Common {
@@ -16,6 +19,12 @@ namespace Common {
 template <typename T>
 class ScratchBuffer {
 public:
+    using iterator = T*;
+    using const_iterator = const T*;
+    using value_type = T;
+    using element_type = T;
+    using iterator_category = std::contiguous_iterator_tag;
+
     ScratchBuffer() = default;
 
     explicit ScratchBuffer(size_t initial_capacity)
diff --git a/src/core/hle/kernel/k_synchronization_object.cpp b/src/core/hle/kernel/k_synchronization_object.cpp
index b7da3eee7..3e5b735b1 100644
--- a/src/core/hle/kernel/k_synchronization_object.cpp
+++ b/src/core/hle/kernel/k_synchronization_object.cpp
@@ -3,6 +3,7 @@
 
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 #include "core/hle/kernel/k_scheduler.h"
 #include "core/hle/kernel/k_scoped_scheduler_lock_and_sleep.h"
 #include "core/hle/kernel/k_synchronization_object.h"
@@ -75,7 +76,7 @@ Result KSynchronizationObject::Wait(KernelCore& kernel, s32* out_index,
                                     KSynchronizationObject** objects, const s32 num_objects,
                                     s64 timeout) {
     // Allocate space on stack for thread nodes.
-    std::vector<ThreadListNode> thread_nodes(num_objects);
+    std::array<ThreadListNode, Svc::ArgumentHandleCountMax> thread_nodes;
 
     // Prepare for wait.
     KThread* thread = GetCurrentThreadPointer(kernel);
diff --git a/src/core/hle/kernel/k_thread.cpp b/src/core/hle/kernel/k_thread.cpp
index 908811e2c..adb6ec581 100644
--- a/src/core/hle/kernel/k_thread.cpp
+++ b/src/core/hle/kernel/k_thread.cpp
@@ -909,7 +909,7 @@ Result KThread::SetActivity(Svc::ThreadActivity activity) {
     R_SUCCEED();
 }
 
-Result KThread::GetThreadContext3(std::vector<u8>& out) {
+Result KThread::GetThreadContext3(Common::ScratchBuffer<u8>& out) {
     // Lock ourselves.
     KScopedLightLock lk{m_activity_pause_lock};
 
@@ -927,15 +927,13 @@ Result KThread::GetThreadContext3(std::vector<u8>& out) {
                 // Mask away mode bits, interrupt bits, IL bit, and other reserved bits.
                 auto context = GetContext64();
                 context.pstate &= 0xFF0FFE20;
-
-                out.resize(sizeof(context));
+                out.resize_destructive(sizeof(context));
                 std::memcpy(out.data(), std::addressof(context), sizeof(context));
             } else {
                 // Mask away mode bits, interrupt bits, IL bit, and other reserved bits.
                 auto context = GetContext32();
                 context.cpsr &= 0xFF0FFE20;
-
-                out.resize(sizeof(context));
+                out.resize_destructive(sizeof(context));
                 std::memcpy(out.data(), std::addressof(context), sizeof(context));
             }
         }
diff --git a/src/core/hle/kernel/k_thread.h b/src/core/hle/kernel/k_thread.h
index 37fe5db77..dd662b3f8 100644
--- a/src/core/hle/kernel/k_thread.h
+++ b/src/core/hle/kernel/k_thread.h
@@ -15,6 +15,7 @@
 #include "common/intrusive_list.h"
 
 #include "common/intrusive_red_black_tree.h"
+#include "common/scratch_buffer.h"
 #include "common/spin_lock.h"
 #include "core/arm/arm_interface.h"
 #include "core/hle/kernel/k_affinity_mask.h"
@@ -567,7 +568,7 @@ public:
 
     void RemoveWaiter(KThread* thread);
 
-    Result GetThreadContext3(std::vector<u8>& out);
+    Result GetThreadContext3(Common::ScratchBuffer<u8>& out);
 
     KThread* RemoveUserWaiterByKey(bool* out_has_waiters, KProcessAddress key) {
         return this->RemoveWaiterByKey(out_has_waiters, key, false);
diff --git a/src/core/hle/kernel/svc/svc_ipc.cpp b/src/core/hle/kernel/svc/svc_ipc.cpp
index ea03068aa..60247df2e 100644
--- a/src/core/hle/kernel/svc/svc_ipc.cpp
+++ b/src/core/hle/kernel/svc/svc_ipc.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include "common/scope_exit.h"
+#include "common/scratch_buffer.h"
 #include "core/core.h"
 #include "core/hle/kernel/k_client_session.h"
 #include "core/hle/kernel/k_process.h"
@@ -45,11 +46,11 @@ Result ReplyAndReceive(Core::System& system, s32* out_index, uint64_t handles_ad
                  handles_addr, static_cast<u64>(sizeof(Handle) * num_handles)),
              ResultInvalidPointer);
 
-    std::vector<Handle> handles(num_handles);
+    std::array<Handle, Svc::ArgumentHandleCountMax> handles;
     GetCurrentMemory(kernel).ReadBlock(handles_addr, handles.data(), sizeof(Handle) * num_handles);
 
     // Convert handle list to object table.
-    std::vector<KSynchronizationObject*> objs(num_handles);
+    std::array<KSynchronizationObject*, Svc::ArgumentHandleCountMax> objs;
     R_UNLESS(handle_table.GetMultipleObjects<KSynchronizationObject>(objs.data(), handles.data(),
                                                                      num_handles),
              ResultInvalidHandle);
@@ -80,7 +81,7 @@ Result ReplyAndReceive(Core::System& system, s32* out_index, uint64_t handles_ad
         // Wait for an object.
         s32 index;
         Result result = KSynchronizationObject::Wait(kernel, std::addressof(index), objs.data(),
-                                                     static_cast<s32>(objs.size()), timeout_ns);
+                                                     num_handles, timeout_ns);
         if (result == ResultTimedOut) {
             R_RETURN(result);
         }
diff --git a/src/core/hle/kernel/svc/svc_synchronization.cpp b/src/core/hle/kernel/svc/svc_synchronization.cpp
index 04d65f0bd..53df5bcd8 100644
--- a/src/core/hle/kernel/svc/svc_synchronization.cpp
+++ b/src/core/hle/kernel/svc/svc_synchronization.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include "common/scope_exit.h"
+#include "common/scratch_buffer.h"
 #include "core/core.h"
 #include "core/hle/kernel/k_process.h"
 #include "core/hle/kernel/k_readable_event.h"
@@ -54,7 +55,7 @@ static Result WaitSynchronization(Core::System& system, int32_t* out_index, cons
     // Get the synchronization context.
     auto& kernel = system.Kernel();
     auto& handle_table = GetCurrentProcess(kernel).GetHandleTable();
-    std::vector<KSynchronizationObject*> objs(num_handles);
+    std::array<KSynchronizationObject*, Svc::ArgumentHandleCountMax> objs;
 
     // Copy user handles.
     if (num_handles > 0) {
@@ -72,8 +73,8 @@ static Result WaitSynchronization(Core::System& system, int32_t* out_index, cons
     });
 
     // Wait on the objects.
-    Result res = KSynchronizationObject::Wait(kernel, out_index, objs.data(),
-                                              static_cast<s32>(objs.size()), timeout_ns);
+    Result res =
+        KSynchronizationObject::Wait(kernel, out_index, objs.data(), num_handles, timeout_ns);
 
     R_SUCCEED_IF(res == ResultSessionClosed);
     R_RETURN(res);
@@ -87,8 +88,7 @@ Result WaitSynchronization(Core::System& system, int32_t* out_index, u64 user_ha
 
     // Ensure number of handles is valid.
     R_UNLESS(0 <= num_handles && num_handles <= Svc::ArgumentHandleCountMax, ResultOutOfRange);
-
-    std::vector<Handle> handles(num_handles);
+    std::array<Handle, Svc::ArgumentHandleCountMax> handles;
     if (num_handles > 0) {
         GetCurrentMemory(system.Kernel())
             .ReadBlock(user_handles, handles.data(), num_handles * sizeof(Handle));
diff --git a/src/core/hle/kernel/svc/svc_thread.cpp b/src/core/hle/kernel/svc/svc_thread.cpp
index 37b54079c..36b94e6bf 100644
--- a/src/core/hle/kernel/svc/svc_thread.cpp
+++ b/src/core/hle/kernel/svc/svc_thread.cpp
@@ -174,7 +174,7 @@ Result GetThreadContext3(Core::System& system, u64 out_context, Handle thread_ha
         }
 
         // Get the thread context.
-        std::vector<u8> context;
+        static thread_local Common::ScratchBuffer<u8> context;
         R_TRY(thread->GetThreadContext3(context));
 
         // Copy the thread context to user space.
diff --git a/src/core/hle/service/audio/audin_u.cpp b/src/core/hle/service/audio/audin_u.cpp
index f0640c64f..c8d574993 100644
--- a/src/core/hle/service/audio/audin_u.cpp
+++ b/src/core/hle/service/audio/audin_u.cpp
@@ -5,6 +5,7 @@
 #include "audio_core/renderer/audio_device.h"
 #include "common/common_funcs.h"
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "common/string_util.h"
 #include "core/core.h"
 #include "core/hle/kernel/k_event.h"
@@ -123,19 +124,13 @@ private:
 
     void GetReleasedAudioInBuffer(HLERequestContext& ctx) {
         const auto write_buffer_size = ctx.GetWriteBufferNumElements<u64>();
-        std::vector<u64> released_buffers(write_buffer_size);
+        tmp_buffer.resize_destructive(write_buffer_size);
+        tmp_buffer[0] = 0;
 
-        const auto count = impl->GetReleasedBuffers(released_buffers);
+        const auto count = impl->GetReleasedBuffers(tmp_buffer);
 
-        [[maybe_unused]] std::string tags{};
-        for (u32 i = 0; i < count; i++) {
-            tags += fmt::format("{:08X}, ", released_buffers[i]);
-        }
-        [[maybe_unused]] auto sessionid{impl->GetSystem().GetSessionId()};
-        LOG_TRACE(Service_Audio, "called. Session {} released {} buffers: {}", sessionid, count,
-                  tags);
+        ctx.WriteBuffer(tmp_buffer);
 
-        ctx.WriteBuffer(released_buffers);
         IPC::ResponseBuilder rb{ctx, 3};
         rb.Push(ResultSuccess);
         rb.Push(count);
@@ -200,6 +195,7 @@ private:
     KernelHelpers::ServiceContext service_context;
     Kernel::KEvent* event;
     std::shared_ptr<AudioCore::AudioIn::In> impl;
+    Common::ScratchBuffer<u64> tmp_buffer;
 };
 
 AudInU::AudInU(Core::System& system_)
diff --git a/src/core/hle/service/audio/audout_u.cpp b/src/core/hle/service/audio/audout_u.cpp
index 3e62fa4fc..032c8c11f 100644
--- a/src/core/hle/service/audio/audout_u.cpp
+++ b/src/core/hle/service/audio/audout_u.cpp
@@ -123,19 +123,13 @@ private:
 
     void GetReleasedAudioOutBuffers(HLERequestContext& ctx) {
         const auto write_buffer_size = ctx.GetWriteBufferNumElements<u64>();
-        std::vector<u64> released_buffers(write_buffer_size);
+        tmp_buffer.resize_destructive(write_buffer_size);
+        tmp_buffer[0] = 0;
 
-        const auto count = impl->GetReleasedBuffers(released_buffers);
+        const auto count = impl->GetReleasedBuffers(tmp_buffer);
 
-        [[maybe_unused]] std::string tags{};
-        for (u32 i = 0; i < count; i++) {
-            tags += fmt::format("{:08X}, ", released_buffers[i]);
-        }
-        [[maybe_unused]] const auto sessionid{impl->GetSystem().GetSessionId()};
-        LOG_TRACE(Service_Audio, "called. Session {} released {} buffers: {}", sessionid, count,
-                  tags);
+        ctx.WriteBuffer(tmp_buffer);
 
-        ctx.WriteBuffer(released_buffers);
         IPC::ResponseBuilder rb{ctx, 3};
         rb.Push(ResultSuccess);
         rb.Push(count);
@@ -211,6 +205,7 @@ private:
     KernelHelpers::ServiceContext service_context;
     Kernel::KEvent* event;
     std::shared_ptr<AudioCore::AudioOut::Out> impl;
+    Common::ScratchBuffer<u64> tmp_buffer;
 };
 
 AudOutU::AudOutU(Core::System& system_)
diff --git a/src/core/hle/service/audio/audren_u.cpp b/src/core/hle/service/audio/audren_u.cpp
index 7086d4750..12845c23a 100644
--- a/src/core/hle/service/audio/audren_u.cpp
+++ b/src/core/hle/service/audio/audren_u.cpp
@@ -116,28 +116,26 @@ private:
         // These buffers are written manually to avoid an issue with WriteBuffer throwing errors for
         // checking size 0. Performance size is 0 for most games.
 
-        std::vector<u8> output{};
-        std::vector<u8> performance{};
         auto is_buffer_b{ctx.BufferDescriptorB()[0].Size() != 0};
         if (is_buffer_b) {
             const auto buffersB{ctx.BufferDescriptorB()};
-            output.resize(buffersB[0].Size(), 0);
-            performance.resize(buffersB[1].Size(), 0);
+            tmp_output.resize_destructive(buffersB[0].Size());
+            tmp_performance.resize_destructive(buffersB[1].Size());
         } else {
             const auto buffersC{ctx.BufferDescriptorC()};
-            output.resize(buffersC[0].Size(), 0);
-            performance.resize(buffersC[1].Size(), 0);
+            tmp_output.resize_destructive(buffersC[0].Size());
+            tmp_performance.resize_destructive(buffersC[1].Size());
         }
 
-        auto result = impl->RequestUpdate(input, performance, output);
+        auto result = impl->RequestUpdate(input, tmp_performance, tmp_output);
 
         if (result.IsSuccess()) {
             if (is_buffer_b) {
-                ctx.WriteBufferB(output.data(), output.size(), 0);
-                ctx.WriteBufferB(performance.data(), performance.size(), 1);
+                ctx.WriteBufferB(tmp_output.data(), tmp_output.size(), 0);
+                ctx.WriteBufferB(tmp_performance.data(), tmp_performance.size(), 1);
             } else {
-                ctx.WriteBufferC(output.data(), output.size(), 0);
-                ctx.WriteBufferC(performance.data(), performance.size(), 1);
+                ctx.WriteBufferC(tmp_output.data(), tmp_output.size(), 0);
+                ctx.WriteBufferC(tmp_performance.data(), tmp_performance.size(), 1);
             }
         } else {
             LOG_ERROR(Service_Audio, "RequestUpdate failed error 0x{:02X}!", result.description);
@@ -235,6 +233,8 @@ private:
     Kernel::KEvent* rendered_event;
     Manager& manager;
     std::unique_ptr<Renderer> impl;
+    Common::ScratchBuffer<u8> tmp_output;
+    Common::ScratchBuffer<u8> tmp_performance;
 };
 
 class IAudioDevice final : public ServiceFramework<IAudioDevice> {
diff --git a/src/core/hle/service/audio/audren_u.h b/src/core/hle/service/audio/audren_u.h
index 24ce37e87..d8e9c8719 100644
--- a/src/core/hle/service/audio/audren_u.h
+++ b/src/core/hle/service/audio/audren_u.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "audio_core/audio_render_manager.h"
+#include "common/scratch_buffer.h"
 #include "core/hle/service/kernel_helpers.h"
 #include "core/hle/service/service.h"
 
diff --git a/src/core/hle/service/audio/hwopus.cpp b/src/core/hle/service/audio/hwopus.cpp
index 451ac224a..c835f6cb7 100644
--- a/src/core/hle/service/audio/hwopus.cpp
+++ b/src/core/hle/service/audio/hwopus.cpp
@@ -68,13 +68,13 @@ private:
                                  ExtraBehavior extra_behavior) {
         u32 consumed = 0;
         u32 sample_count = 0;
-        std::vector<opus_int16> samples(ctx.GetWriteBufferNumElements<opus_int16>());
+        tmp_samples.resize_destructive(ctx.GetWriteBufferNumElements<opus_int16>());
 
         if (extra_behavior == ExtraBehavior::ResetContext) {
             ResetDecoderContext();
         }
 
-        if (!DecodeOpusData(consumed, sample_count, ctx.ReadBuffer(), samples, performance)) {
+        if (!DecodeOpusData(consumed, sample_count, ctx.ReadBuffer(), tmp_samples, performance)) {
             LOG_ERROR(Audio, "Failed to decode opus data");
             IPC::ResponseBuilder rb{ctx, 2};
             // TODO(ogniK): Use correct error code
@@ -90,11 +90,11 @@ private:
         if (performance) {
             rb.Push<u64>(*performance);
         }
-        ctx.WriteBuffer(samples);
+        ctx.WriteBuffer(tmp_samples);
     }
 
     bool DecodeOpusData(u32& consumed, u32& sample_count, std::span<const u8> input,
-                        std::vector<opus_int16>& output, u64* out_performance_time) const {
+                        std::span<opus_int16> output, u64* out_performance_time) const {
         const auto start_time = std::chrono::steady_clock::now();
         const std::size_t raw_output_sz = output.size() * sizeof(opus_int16);
         if (sizeof(OpusPacketHeader) > input.size()) {
@@ -154,6 +154,7 @@ private:
     OpusDecoderPtr decoder;
     u32 sample_rate;
     u32 channel_count;
+    Common::ScratchBuffer<opus_int16> tmp_samples;
 };
 
 class IHardwareOpusDecoderManager final : public ServiceFramework<IHardwareOpusDecoderManager> {
diff --git a/src/core/hle/service/nvdrv/devices/nvdevice.h b/src/core/hle/service/nvdrv/devices/nvdevice.h
index ab1f30f9e..a04538d5d 100644
--- a/src/core/hle/service/nvdrv/devices/nvdevice.h
+++ b/src/core/hle/service/nvdrv/devices/nvdevice.h
@@ -34,7 +34,7 @@ public:
      * @returns The result code of the ioctl.
      */
     virtual NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::vector<u8>& output) = 0;
+                            std::span<u8> output) = 0;
 
     /**
      * Handles an ioctl2 request.
@@ -45,7 +45,7 @@ public:
      * @returns The result code of the ioctl.
      */
     virtual NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::span<const u8> inline_input, std::vector<u8>& output) = 0;
+                            std::span<const u8> inline_input, std::span<u8> output) = 0;
 
     /**
      * Handles an ioctl3 request.
@@ -56,7 +56,7 @@ public:
      * @returns The result code of the ioctl.
      */
     virtual NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::vector<u8>& output, std::vector<u8>& inline_output) = 0;
+                            std::span<u8> output, std::span<u8> inline_output) = 0;
 
     /**
      * Called once a device is opened
diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
index 0fe242e9d..05a43d8dc 100644
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@@ -18,19 +18,19 @@ nvdisp_disp0::nvdisp_disp0(Core::System& system_, NvCore::Container& core)
 nvdisp_disp0::~nvdisp_disp0() = default;
 
 NvResult nvdisp_disp0::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::vector<u8>& output) {
+                              std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
 NvResult nvdisp_disp0::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::span<const u8> inline_input, std::vector<u8>& output) {
+                              std::span<const u8> inline_input, std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
 NvResult nvdisp_disp0::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::vector<u8>& output, std::vector<u8>& inline_output) {
+                              std::span<u8> output, std::span<u8> inline_output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
index bcd0e3ed5..daee05fe8 100644
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
@@ -26,11 +26,11 @@ public:
     ~nvdisp_disp0() override;
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
index 681bd0867..07e570a9f 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -28,7 +28,7 @@ nvhost_as_gpu::nvhost_as_gpu(Core::System& system_, Module& module_, NvCore::Con
 nvhost_as_gpu::~nvhost_as_gpu() = default;
 
 NvResult nvhost_as_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                               std::vector<u8>& output) {
+                               std::span<u8> output) {
     switch (command.group) {
     case 'A':
         switch (command.cmd) {
@@ -61,13 +61,13 @@ NvResult nvhost_as_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> i
 }
 
 NvResult nvhost_as_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                               std::span<const u8> inline_input, std::vector<u8>& output) {
+                               std::span<const u8> inline_input, std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
 NvResult nvhost_as_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                               std::vector<u8>& output, std::vector<u8>& inline_output) {
+                               std::span<u8> output, std::span<u8> inline_output) {
     switch (command.group) {
     case 'A':
         switch (command.cmd) {
@@ -87,7 +87,7 @@ NvResult nvhost_as_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> i
 void nvhost_as_gpu::OnOpen(DeviceFD fd) {}
 void nvhost_as_gpu::OnClose(DeviceFD fd) {}
 
-NvResult nvhost_as_gpu::AllocAsEx(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_as_gpu::AllocAsEx(std::span<const u8> input, std::span<u8> output) {
     IoctlAllocAsEx params{};
     std::memcpy(&params, input.data(), input.size());
 
@@ -141,7 +141,7 @@ NvResult nvhost_as_gpu::AllocAsEx(std::span<const u8> input, std::vector<u8>& ou
     return NvResult::Success;
 }
 
-NvResult nvhost_as_gpu::AllocateSpace(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_as_gpu::AllocateSpace(std::span<const u8> input, std::span<u8> output) {
     IoctlAllocSpace params{};
     std::memcpy(&params, input.data(), input.size());
 
@@ -220,7 +220,7 @@ void nvhost_as_gpu::FreeMappingLocked(u64 offset) {
     mapping_map.erase(offset);
 }
 
-NvResult nvhost_as_gpu::FreeSpace(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_as_gpu::FreeSpace(std::span<const u8> input, std::span<u8> output) {
     IoctlFreeSpace params{};
     std::memcpy(&params, input.data(), input.size());
 
@@ -266,15 +266,14 @@ NvResult nvhost_as_gpu::FreeSpace(std::span<const u8> input, std::vector<u8>& ou
     return NvResult::Success;
 }
 
-NvResult nvhost_as_gpu::Remap(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_as_gpu::Remap(std::span<const u8> input, std::span<u8> output) {
     const auto num_entries = input.size() / sizeof(IoctlRemapEntry);
 
     LOG_DEBUG(Service_NVDRV, "called, num_entries=0x{:X}", num_entries);
 
-    std::vector<IoctlRemapEntry> entries(num_entries);
-    std::memcpy(entries.data(), input.data(), input.size());
-
     std::scoped_lock lock(mutex);
+    entries.resize_destructive(num_entries);
+    std::memcpy(entries.data(), input.data(), input.size());
 
     if (!vm.initialised) {
         return NvResult::BadValue;
@@ -320,7 +319,7 @@ NvResult nvhost_as_gpu::Remap(std::span<const u8> input, std::vector<u8>& output
     return NvResult::Success;
 }
 
-NvResult nvhost_as_gpu::MapBufferEx(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_as_gpu::MapBufferEx(std::span<const u8> input, std::span<u8> output) {
     IoctlMapBufferEx params{};
     std::memcpy(&params, input.data(), input.size());
 
@@ -424,7 +423,7 @@ NvResult nvhost_as_gpu::MapBufferEx(std::span<const u8> input, std::vector<u8>&
     return NvResult::Success;
 }
 
-NvResult nvhost_as_gpu::UnmapBuffer(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_as_gpu::UnmapBuffer(std::span<const u8> input, std::span<u8> output) {
     IoctlUnmapBuffer params{};
     std::memcpy(&params, input.data(), input.size());
 
@@ -463,7 +462,7 @@ NvResult nvhost_as_gpu::UnmapBuffer(std::span<const u8> input, std::vector<u8>&
     return NvResult::Success;
 }
 
-NvResult nvhost_as_gpu::BindChannel(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_as_gpu::BindChannel(std::span<const u8> input, std::span<u8> output) {
     IoctlBindChannel params{};
     std::memcpy(&params, input.data(), input.size());
     LOG_DEBUG(Service_NVDRV, "called, fd={:X}", params.fd);
@@ -492,7 +491,7 @@ void nvhost_as_gpu::GetVARegionsImpl(IoctlGetVaRegions& params) {
     };
 }
 
-NvResult nvhost_as_gpu::GetVARegions(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_as_gpu::GetVARegions(std::span<const u8> input, std::span<u8> output) {
     IoctlGetVaRegions params{};
     std::memcpy(&params, input.data(), input.size());
 
@@ -511,8 +510,8 @@ NvResult nvhost_as_gpu::GetVARegions(std::span<const u8> input, std::vector<u8>&
     return NvResult::Success;
 }
 
-NvResult nvhost_as_gpu::GetVARegions(std::span<const u8> input, std::vector<u8>& output,
-                                     std::vector<u8>& inline_output) {
+NvResult nvhost_as_gpu::GetVARegions(std::span<const u8> input, std::span<u8> output,
+                                     std::span<u8> inline_output) {
     IoctlGetVaRegions params{};
     std::memcpy(&params, input.data(), input.size());
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
index 1aba8d579..2af3e1260 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
@@ -15,6 +15,7 @@
 #include "common/address_space.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 #include "common/swap.h"
 #include "core/hle/service/nvdrv/core/nvmap.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
@@ -48,11 +49,11 @@ public:
     ~nvhost_as_gpu() override;
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
@@ -138,18 +139,18 @@ private:
     static_assert(sizeof(IoctlGetVaRegions) == 16 + sizeof(VaRegion) * 2,
                   "IoctlGetVaRegions is incorrect size");
 
-    NvResult AllocAsEx(std::span<const u8> input, std::vector<u8>& output);
-    NvResult AllocateSpace(std::span<const u8> input, std::vector<u8>& output);
-    NvResult Remap(std::span<const u8> input, std::vector<u8>& output);
-    NvResult MapBufferEx(std::span<const u8> input, std::vector<u8>& output);
-    NvResult UnmapBuffer(std::span<const u8> input, std::vector<u8>& output);
-    NvResult FreeSpace(std::span<const u8> input, std::vector<u8>& output);
-    NvResult BindChannel(std::span<const u8> input, std::vector<u8>& output);
+    NvResult AllocAsEx(std::span<const u8> input, std::span<u8> output);
+    NvResult AllocateSpace(std::span<const u8> input, std::span<u8> output);
+    NvResult Remap(std::span<const u8> input, std::span<u8> output);
+    NvResult MapBufferEx(std::span<const u8> input, std::span<u8> output);
+    NvResult UnmapBuffer(std::span<const u8> input, std::span<u8> output);
+    NvResult FreeSpace(std::span<const u8> input, std::span<u8> output);
+    NvResult BindChannel(std::span<const u8> input, std::span<u8> output);
 
     void GetVARegionsImpl(IoctlGetVaRegions& params);
-    NvResult GetVARegions(std::span<const u8> input, std::vector<u8>& output);
-    NvResult GetVARegions(std::span<const u8> input, std::vector<u8>& output,
-                          std::vector<u8>& inline_output);
+    NvResult GetVARegions(std::span<const u8> input, std::span<u8> output);
+    NvResult GetVARegions(std::span<const u8> input, std::span<u8> output,
+                          std::span<u8> inline_output);
 
     void FreeMappingLocked(u64 offset);
 
@@ -212,6 +213,7 @@ private:
         bool initialised{};
     } vm;
     std::shared_ptr<Tegra::MemoryManager> gmmu;
+    Common::ScratchBuffer<IoctlRemapEntry> entries;
 
     // s32 channel{};
     // u32 big_page_size{VM::DEFAULT_BIG_PAGE_SIZE};
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
index e12025560..4d55554b4 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
@@ -35,7 +35,7 @@ nvhost_ctrl::~nvhost_ctrl() {
 }
 
 NvResult nvhost_ctrl::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                             std::vector<u8>& output) {
+                             std::span<u8> output) {
     switch (command.group) {
     case 0x0:
         switch (command.cmd) {
@@ -64,13 +64,13 @@ NvResult nvhost_ctrl::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> inp
 }
 
 NvResult nvhost_ctrl::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                             std::span<const u8> inline_input, std::vector<u8>& output) {
+                             std::span<const u8> inline_input, std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
 NvResult nvhost_ctrl::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                             std::vector<u8>& output, std::vector<u8>& inline_outpu) {
+                             std::span<u8> output, std::span<u8> inline_outpu) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
@@ -79,7 +79,7 @@ void nvhost_ctrl::OnOpen(DeviceFD fd) {}
 
 void nvhost_ctrl::OnClose(DeviceFD fd) {}
 
-NvResult nvhost_ctrl::NvOsGetConfigU32(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl::NvOsGetConfigU32(std::span<const u8> input, std::span<u8> output) {
     IocGetConfigParams params{};
     std::memcpy(&params, input.data(), sizeof(params));
     LOG_TRACE(Service_NVDRV, "called, setting={}!{}", params.domain_str.data(),
@@ -87,7 +87,7 @@ NvResult nvhost_ctrl::NvOsGetConfigU32(std::span<const u8> input, std::vector<u8
     return NvResult::ConfigVarNotFound; // Returns error on production mode
 }
 
-NvResult nvhost_ctrl::IocCtrlEventWait(std::span<const u8> input, std::vector<u8>& output,
+NvResult nvhost_ctrl::IocCtrlEventWait(std::span<const u8> input, std::span<u8> output,
                                        bool is_allocation) {
     IocCtrlEventWaitParams params{};
     std::memcpy(&params, input.data(), sizeof(params));
@@ -231,7 +231,7 @@ NvResult nvhost_ctrl::FreeEvent(u32 slot) {
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl::IocCtrlEventRegister(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl::IocCtrlEventRegister(std::span<const u8> input, std::span<u8> output) {
     IocCtrlEventRegisterParams params{};
     std::memcpy(&params, input.data(), sizeof(params));
     const u32 event_id = params.user_event_id;
@@ -252,7 +252,7 @@ NvResult nvhost_ctrl::IocCtrlEventRegister(std::span<const u8> input, std::vecto
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span<const u8> input, std::span<u8> output) {
     IocCtrlEventUnregisterParams params{};
     std::memcpy(&params, input.data(), sizeof(params));
     const u32 event_id = params.user_event_id & 0x00FF;
@@ -262,8 +262,7 @@ NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span<const u8> input, std::vec
     return FreeEvent(event_id);
 }
 
-NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span<const u8> input,
-                                                  std::vector<u8>& output) {
+NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span<const u8> input, std::span<u8> output) {
     IocCtrlEventUnregisterBatchParams params{};
     std::memcpy(&params, input.data(), sizeof(params));
     u64 event_mask = params.user_events;
@@ -281,7 +280,7 @@ NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span<const u8> input,
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl::IocCtrlClearEventWait(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl::IocCtrlClearEventWait(std::span<const u8> input, std::span<u8> output) {
     IocCtrlEventClearParams params{};
     std::memcpy(&params, input.data(), sizeof(params));
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h
index dd2e7888a..2efed4862 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h
@@ -26,11 +26,11 @@ public:
     ~nvhost_ctrl() override;
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
@@ -186,13 +186,12 @@ private:
     static_assert(sizeof(IocCtrlEventUnregisterBatchParams) == 8,
                   "IocCtrlEventKill is incorrect size");
 
-    NvResult NvOsGetConfigU32(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocCtrlEventWait(std::span<const u8> input, std::vector<u8>& output,
-                              bool is_allocation);
-    NvResult IocCtrlEventRegister(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocCtrlEventUnregister(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocCtrlEventUnregisterBatch(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocCtrlClearEventWait(std::span<const u8> input, std::vector<u8>& output);
+    NvResult NvOsGetConfigU32(std::span<const u8> input, std::span<u8> output);
+    NvResult IocCtrlEventWait(std::span<const u8> input, std::span<u8> output, bool is_allocation);
+    NvResult IocCtrlEventRegister(std::span<const u8> input, std::span<u8> output);
+    NvResult IocCtrlEventUnregister(std::span<const u8> input, std::span<u8> output);
+    NvResult IocCtrlEventUnregisterBatch(std::span<const u8> input, std::span<u8> output);
+    NvResult IocCtrlClearEventWait(std::span<const u8> input, std::span<u8> output);
 
     NvResult FreeEvent(u32 slot);
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
index be3c083db..6081d92e9 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
@@ -22,7 +22,7 @@ nvhost_ctrl_gpu::~nvhost_ctrl_gpu() {
 }
 
 NvResult nvhost_ctrl_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                                 std::vector<u8>& output) {
+                                 std::span<u8> output) {
     switch (command.group) {
     case 'G':
         switch (command.cmd) {
@@ -54,13 +54,13 @@ NvResult nvhost_ctrl_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8>
 }
 
 NvResult nvhost_ctrl_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                                 std::span<const u8> inline_input, std::vector<u8>& output) {
+                                 std::span<const u8> inline_input, std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
 NvResult nvhost_ctrl_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                                 std::vector<u8>& output, std::vector<u8>& inline_output) {
+                                 std::span<u8> output, std::span<u8> inline_output) {
     switch (command.group) {
     case 'G':
         switch (command.cmd) {
@@ -82,7 +82,7 @@ NvResult nvhost_ctrl_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8>
 void nvhost_ctrl_gpu::OnOpen(DeviceFD fd) {}
 void nvhost_ctrl_gpu::OnClose(DeviceFD fd) {}
 
-NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span<const u8> input, std::span<u8> output) {
     LOG_DEBUG(Service_NVDRV, "called");
     IoctlCharacteristics params{};
     std::memcpy(&params, input.data(), input.size());
@@ -127,8 +127,8 @@ NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span<const u8> input, std::vec
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span<const u8> input, std::vector<u8>& output,
-                                             std::vector<u8>& inline_output) {
+NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span<const u8> input, std::span<u8> output,
+                                             std::span<u8> inline_output) {
     LOG_DEBUG(Service_NVDRV, "called");
     IoctlCharacteristics params{};
     std::memcpy(&params, input.data(), input.size());
@@ -175,7 +175,7 @@ NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span<const u8> input, std::vec
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span<const u8> input, std::span<u8> output) {
     IoctlGpuGetTpcMasksArgs params{};
     std::memcpy(&params, input.data(), input.size());
     LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size);
@@ -186,8 +186,8 @@ NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span<const u8> input, std::vector<u8>
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span<const u8> input, std::vector<u8>& output,
-                                      std::vector<u8>& inline_output) {
+NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span<const u8> input, std::span<u8> output,
+                                      std::span<u8> inline_output) {
     IoctlGpuGetTpcMasksArgs params{};
     std::memcpy(&params, input.data(), input.size());
     LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size);
@@ -199,7 +199,7 @@ NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span<const u8> input, std::vector<u8>
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span<const u8> input, std::span<u8> output) {
     LOG_DEBUG(Service_NVDRV, "called");
 
     IoctlActiveSlotMask params{};
@@ -212,7 +212,7 @@ NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span<const u8> input, std::vect
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span<const u8> input, std::span<u8> output) {
     LOG_DEBUG(Service_NVDRV, "called");
 
     IoctlZcullGetCtxSize params{};
@@ -224,7 +224,7 @@ NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span<const u8> input, std::vector
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span<const u8> input, std::span<u8> output) {
     LOG_DEBUG(Service_NVDRV, "called");
 
     IoctlNvgpuGpuZcullGetInfoArgs params{};
@@ -247,7 +247,7 @@ NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span<const u8> input, std::vector<u8
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::ZBCSetTable(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::ZBCSetTable(std::span<const u8> input, std::span<u8> output) {
     LOG_WARNING(Service_NVDRV, "(STUBBED) called");
 
     IoctlZbcSetTable params{};
@@ -263,7 +263,7 @@ NvResult nvhost_ctrl_gpu::ZBCSetTable(std::span<const u8> input, std::vector<u8>
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span<const u8> input, std::span<u8> output) {
     LOG_WARNING(Service_NVDRV, "(STUBBED) called");
 
     IoctlZbcQueryTable params{};
@@ -273,7 +273,7 @@ NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span<const u8> input, std::vector<u
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::FlushL2(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::FlushL2(std::span<const u8> input, std::span<u8> output) {
     LOG_WARNING(Service_NVDRV, "(STUBBED) called");
 
     IoctlFlushL2 params{};
@@ -283,7 +283,7 @@ NvResult nvhost_ctrl_gpu::FlushL2(std::span<const u8> input, std::vector<u8>& ou
     return NvResult::Success;
 }
 
-NvResult nvhost_ctrl_gpu::GetGpuTime(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_ctrl_gpu::GetGpuTime(std::span<const u8> input, std::span<u8> output) {
     LOG_DEBUG(Service_NVDRV, "called");
 
     IoctlGetGpuTime params{};
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
index b9333d9d3..97995551c 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
@@ -22,11 +22,11 @@ public:
     ~nvhost_ctrl_gpu() override;
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
@@ -151,21 +151,21 @@ private:
     };
     static_assert(sizeof(IoctlGetGpuTime) == 0x10, "IoctlGetGpuTime is incorrect size");
 
-    NvResult GetCharacteristics(std::span<const u8> input, std::vector<u8>& output);
-    NvResult GetCharacteristics(std::span<const u8> input, std::vector<u8>& output,
-                                std::vector<u8>& inline_output);
-
-    NvResult GetTPCMasks(std::span<const u8> input, std::vector<u8>& output);
-    NvResult GetTPCMasks(std::span<const u8> input, std::vector<u8>& output,
-                         std::vector<u8>& inline_output);
-
-    NvResult GetActiveSlotMask(std::span<const u8> input, std::vector<u8>& output);
-    NvResult ZCullGetCtxSize(std::span<const u8> input, std::vector<u8>& output);
-    NvResult ZCullGetInfo(std::span<const u8> input, std::vector<u8>& output);
-    NvResult ZBCSetTable(std::span<const u8> input, std::vector<u8>& output);
-    NvResult ZBCQueryTable(std::span<const u8> input, std::vector<u8>& output);
-    NvResult FlushL2(std::span<const u8> input, std::vector<u8>& output);
-    NvResult GetGpuTime(std::span<const u8> input, std::vector<u8>& output);
+    NvResult GetCharacteristics(std::span<const u8> input, std::span<u8> output);
+    NvResult GetCharacteristics(std::span<const u8> input, std::span<u8> output,
+                                std::span<u8> inline_output);
+
+    NvResult GetTPCMasks(std::span<const u8> input, std::span<u8> output);
+    NvResult GetTPCMasks(std::span<const u8> input, std::span<u8> output,
+                         std::span<u8> inline_output);
+
+    NvResult GetActiveSlotMask(std::span<const u8> input, std::span<u8> output);
+    NvResult ZCullGetCtxSize(std::span<const u8> input, std::span<u8> output);
+    NvResult ZCullGetInfo(std::span<const u8> input, std::span<u8> output);
+    NvResult ZBCSetTable(std::span<const u8> input, std::span<u8> output);
+    NvResult ZBCQueryTable(std::span<const u8> input, std::span<u8> output);
+    NvResult FlushL2(std::span<const u8> input, std::span<u8> output);
+    NvResult GetGpuTime(std::span<const u8> input, std::span<u8> output);
 
     EventInterface& events_interface;
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 453a965dc..46a25fcab 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -47,7 +47,7 @@ nvhost_gpu::~nvhost_gpu() {
 }
 
 NvResult nvhost_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::vector<u8>& output) {
+                            std::span<u8> output) {
     switch (command.group) {
     case 0x0:
         switch (command.cmd) {
@@ -99,7 +99,7 @@ NvResult nvhost_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> inpu
 };
 
 NvResult nvhost_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::span<const u8> inline_input, std::vector<u8>& output) {
+                            std::span<const u8> inline_input, std::span<u8> output) {
     switch (command.group) {
     case 'H':
         switch (command.cmd) {
@@ -113,7 +113,7 @@ NvResult nvhost_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> inpu
 }
 
 NvResult nvhost_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::vector<u8>& output, std::vector<u8>& inline_output) {
+                            std::span<u8> output, std::span<u8> inline_output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
@@ -121,7 +121,7 @@ NvResult nvhost_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> inpu
 void nvhost_gpu::OnOpen(DeviceFD fd) {}
 void nvhost_gpu::OnClose(DeviceFD fd) {}
 
-NvResult nvhost_gpu::SetNVMAPfd(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::SetNVMAPfd(std::span<const u8> input, std::span<u8> output) {
     IoctlSetNvmapFD params{};
     std::memcpy(&params, input.data(), input.size());
     LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
@@ -130,7 +130,7 @@ NvResult nvhost_gpu::SetNVMAPfd(std::span<const u8> input, std::vector<u8>& outp
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::SetClientData(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::SetClientData(std::span<const u8> input, std::span<u8> output) {
     LOG_DEBUG(Service_NVDRV, "called");
 
     IoctlClientData params{};
@@ -139,7 +139,7 @@ NvResult nvhost_gpu::SetClientData(std::span<const u8> input, std::vector<u8>& o
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::GetClientData(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::GetClientData(std::span<const u8> input, std::span<u8> output) {
     LOG_DEBUG(Service_NVDRV, "called");
 
     IoctlClientData params{};
@@ -149,7 +149,7 @@ NvResult nvhost_gpu::GetClientData(std::span<const u8> input, std::vector<u8>& o
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::ZCullBind(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::ZCullBind(std::span<const u8> input, std::span<u8> output) {
     std::memcpy(&zcull_params, input.data(), input.size());
     LOG_DEBUG(Service_NVDRV, "called, gpu_va={:X}, mode={:X}", zcull_params.gpu_va,
               zcull_params.mode);
@@ -158,7 +158,7 @@ NvResult nvhost_gpu::ZCullBind(std::span<const u8> input, std::vector<u8>& outpu
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::SetErrorNotifier(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::SetErrorNotifier(std::span<const u8> input, std::span<u8> output) {
     IoctlSetErrorNotifier params{};
     std::memcpy(&params, input.data(), input.size());
     LOG_WARNING(Service_NVDRV, "(STUBBED) called, offset={:X}, size={:X}, mem={:X}", params.offset,
@@ -168,14 +168,14 @@ NvResult nvhost_gpu::SetErrorNotifier(std::span<const u8> input, std::vector<u8>
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::SetChannelPriority(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::SetChannelPriority(std::span<const u8> input, std::span<u8> output) {
     std::memcpy(&channel_priority, input.data(), input.size());
     LOG_DEBUG(Service_NVDRV, "(STUBBED) called, priority={:X}", channel_priority);
 
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::AllocGPFIFOEx2(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::AllocGPFIFOEx2(std::span<const u8> input, std::span<u8> output) {
     IoctlAllocGpfifoEx2 params{};
     std::memcpy(&params, input.data(), input.size());
     LOG_WARNING(Service_NVDRV,
@@ -197,7 +197,7 @@ NvResult nvhost_gpu::AllocGPFIFOEx2(std::span<const u8> input, std::vector<u8>&
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::AllocateObjectContext(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::AllocateObjectContext(std::span<const u8> input, std::span<u8> output) {
     IoctlAllocObjCtx params{};
     std::memcpy(&params, input.data(), input.size());
     LOG_WARNING(Service_NVDRV, "(STUBBED) called, class_num={:X}, flags={:X}", params.class_num,
@@ -208,7 +208,8 @@ NvResult nvhost_gpu::AllocateObjectContext(std::span<const u8> input, std::vecto
     return NvResult::Success;
 }
 
-static std::vector<Tegra::CommandHeader> BuildWaitCommandList(NvFence fence) {
+static boost::container::small_vector<Tegra::CommandHeader, 512> BuildWaitCommandList(
+    NvFence fence) {
     return {
         Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1,
                                   Tegra::SubmissionMode::Increasing),
@@ -219,35 +220,35 @@ static std::vector<Tegra::CommandHeader> BuildWaitCommandList(NvFence fence) {
     };
 }
 
-static std::vector<Tegra::CommandHeader> BuildIncrementCommandList(NvFence fence) {
-    std::vector<Tegra::CommandHeader> result{
+static boost::container::small_vector<Tegra::CommandHeader, 512> BuildIncrementCommandList(
+    NvFence fence) {
+    boost::container::small_vector<Tegra::CommandHeader, 512> result{
         Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1,
                                   Tegra::SubmissionMode::Increasing),
         {}};
 
     for (u32 count = 0; count < 2; ++count) {
-        result.emplace_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1,
-                                                      Tegra::SubmissionMode::Increasing));
-        result.emplace_back(
+        result.push_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1,
+                                                   Tegra::SubmissionMode::Increasing));
+        result.push_back(
             BuildFenceAction(Tegra::Engines::Puller::FenceOperation::Increment, fence.id));
     }
 
     return result;
 }
 
-static std::vector<Tegra::CommandHeader> BuildIncrementWithWfiCommandList(NvFence fence) {
-    std::vector<Tegra::CommandHeader> result{
+static boost::container::small_vector<Tegra::CommandHeader, 512> BuildIncrementWithWfiCommandList(
+    NvFence fence) {
+    boost::container::small_vector<Tegra::CommandHeader, 512> result{
         Tegra::BuildCommandHeader(Tegra::BufferMethods::WaitForIdle, 1,
                                   Tegra::SubmissionMode::Increasing),
         {}};
-    const std::vector<Tegra::CommandHeader> increment{BuildIncrementCommandList(fence)};
-
-    result.insert(result.end(), increment.begin(), increment.end());
-
+    auto increment_list{BuildIncrementCommandList(fence)};
+    result.insert(result.end(), increment_list.begin(), increment_list.end());
     return result;
 }
 
-NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector<u8>& output,
+NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::span<u8> output,
                                       Tegra::CommandList&& entries) {
     LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address,
               params.num_entries, params.flags.raw);
@@ -293,7 +294,7 @@ NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector<u8>
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::SubmitGPFIFOBase(std::span<const u8> input, std::vector<u8>& output,
+NvResult nvhost_gpu::SubmitGPFIFOBase(std::span<const u8> input, std::span<u8> output,
                                       bool kickoff) {
     if (input.size() < sizeof(IoctlSubmitGpfifo)) {
         UNIMPLEMENTED();
@@ -315,7 +316,7 @@ NvResult nvhost_gpu::SubmitGPFIFOBase(std::span<const u8> input, std::vector<u8>
 }
 
 NvResult nvhost_gpu::SubmitGPFIFOBase(std::span<const u8> input, std::span<const u8> input_inline,
-                                      std::vector<u8>& output) {
+                                      std::span<u8> output) {
     if (input.size() < sizeof(IoctlSubmitGpfifo)) {
         UNIMPLEMENTED();
         return NvResult::InvalidSize;
@@ -327,7 +328,7 @@ NvResult nvhost_gpu::SubmitGPFIFOBase(std::span<const u8> input, std::span<const
     return SubmitGPFIFOImpl(params, output, std::move(entries));
 }
 
-NvResult nvhost_gpu::GetWaitbase(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::GetWaitbase(std::span<const u8> input, std::span<u8> output) {
     IoctlGetWaitbase params{};
     std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
     LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
@@ -337,7 +338,7 @@ NvResult nvhost_gpu::GetWaitbase(std::span<const u8> input, std::vector<u8>& out
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::ChannelSetTimeout(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::ChannelSetTimeout(std::span<const u8> input, std::span<u8> output) {
     IoctlChannelSetTimeout params{};
     std::memcpy(&params, input.data(), sizeof(IoctlChannelSetTimeout));
     LOG_INFO(Service_NVDRV, "called, timeout=0x{:X}", params.timeout);
@@ -345,7 +346,7 @@ NvResult nvhost_gpu::ChannelSetTimeout(std::span<const u8> input, std::vector<u8
     return NvResult::Success;
 }
 
-NvResult nvhost_gpu::ChannelSetTimeslice(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_gpu::ChannelSetTimeslice(std::span<const u8> input, std::span<u8> output) {
     IoctlSetTimeslice params{};
     std::memcpy(&params, input.data(), sizeof(IoctlSetTimeslice));
     LOG_INFO(Service_NVDRV, "called, timeslice=0x{:X}", params.timeslice);
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
index 3ca58202d..529c20526 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
@@ -41,11 +41,11 @@ public:
     ~nvhost_gpu() override;
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
@@ -186,23 +186,23 @@ private:
     u32_le channel_priority{};
     u32_le channel_timeslice{};
 
-    NvResult SetNVMAPfd(std::span<const u8> input, std::vector<u8>& output);
-    NvResult SetClientData(std::span<const u8> input, std::vector<u8>& output);
-    NvResult GetClientData(std::span<const u8> input, std::vector<u8>& output);
-    NvResult ZCullBind(std::span<const u8> input, std::vector<u8>& output);
-    NvResult SetErrorNotifier(std::span<const u8> input, std::vector<u8>& output);
-    NvResult SetChannelPriority(std::span<const u8> input, std::vector<u8>& output);
-    NvResult AllocGPFIFOEx2(std::span<const u8> input, std::vector<u8>& output);
-    NvResult AllocateObjectContext(std::span<const u8> input, std::vector<u8>& output);
-    NvResult SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector<u8>& output,
+    NvResult SetNVMAPfd(std::span<const u8> input, std::span<u8> output);
+    NvResult SetClientData(std::span<const u8> input, std::span<u8> output);
+    NvResult GetClientData(std::span<const u8> input, std::span<u8> output);
+    NvResult ZCullBind(std::span<const u8> input, std::span<u8> output);
+    NvResult SetErrorNotifier(std::span<const u8> input, std::span<u8> output);
+    NvResult SetChannelPriority(std::span<const u8> input, std::span<u8> output);
+    NvResult AllocGPFIFOEx2(std::span<const u8> input, std::span<u8> output);
+    NvResult AllocateObjectContext(std::span<const u8> input, std::span<u8> output);
+    NvResult SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::span<u8> output,
                               Tegra::CommandList&& entries);
-    NvResult SubmitGPFIFOBase(std::span<const u8> input, std::vector<u8>& output,
+    NvResult SubmitGPFIFOBase(std::span<const u8> input, std::span<u8> output,
                               bool kickoff = false);
     NvResult SubmitGPFIFOBase(std::span<const u8> input, std::span<const u8> input_inline,
-                              std::vector<u8>& output);
-    NvResult GetWaitbase(std::span<const u8> input, std::vector<u8>& output);
-    NvResult ChannelSetTimeout(std::span<const u8> input, std::vector<u8>& output);
-    NvResult ChannelSetTimeslice(std::span<const u8> input, std::vector<u8>& output);
+                              std::span<u8> output);
+    NvResult GetWaitbase(std::span<const u8> input, std::span<u8> output);
+    NvResult ChannelSetTimeout(std::span<const u8> input, std::span<u8> output);
+    NvResult ChannelSetTimeslice(std::span<const u8> input, std::span<u8> output);
 
     EventInterface& events_interface;
     NvCore::Container& core;
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
index dc45169ad..a174442a6 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -16,7 +16,7 @@ nvhost_nvdec::nvhost_nvdec(Core::System& system_, NvCore::Container& core_)
 nvhost_nvdec::~nvhost_nvdec() = default;
 
 NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::vector<u8>& output) {
+                              std::span<u8> output) {
     switch (command.group) {
     case 0x0:
         switch (command.cmd) {
@@ -56,13 +56,13 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> in
 }
 
 NvResult nvhost_nvdec::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::span<const u8> inline_input, std::vector<u8>& output) {
+                              std::span<const u8> inline_input, std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
 NvResult nvhost_nvdec::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::vector<u8>& output, std::vector<u8>& inline_output) {
+                              std::span<u8> output, std::span<u8> inline_output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
index 0d615bbcb..ad2233c49 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
@@ -14,11 +14,11 @@ public:
     ~nvhost_nvdec() override;
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
index 1ab51f10b..61649aa4a 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -36,7 +36,7 @@ std::size_t SliceVectors(std::span<const u8> input, std::vector<T>& dst, std::si
 // Writes the data in src to an offset into the dst vector. The offset is specified in bytes
 // Returns the number of bytes written into dst.
 template <typename T>
-std::size_t WriteVectors(std::vector<u8>& dst, const std::vector<T>& src, std::size_t offset) {
+std::size_t WriteVectors(std::span<u8> dst, const std::vector<T>& src, std::size_t offset) {
     if (src.empty()) {
         return 0;
     }
@@ -72,8 +72,7 @@ NvResult nvhost_nvdec_common::SetNVMAPfd(std::span<const u8> input) {
     return NvResult::Success;
 }
 
-NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span<const u8> input,
-                                     std::vector<u8>& output) {
+NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span<const u8> input, std::span<u8> output) {
     IoctlSubmit params{};
     std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
     LOG_DEBUG(Service_NVDRV, "called NVDEC Submit, cmd_buffer_count={}", params.cmd_buffer_count);
@@ -121,7 +120,7 @@ NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span<const u8> input,
     return NvResult::Success;
 }
 
-NvResult nvhost_nvdec_common::GetSyncpoint(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_nvdec_common::GetSyncpoint(std::span<const u8> input, std::span<u8> output) {
     IoctlGetSyncpoint params{};
     std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
     LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", params.param);
@@ -133,7 +132,7 @@ NvResult nvhost_nvdec_common::GetSyncpoint(std::span<const u8> input, std::vecto
     return NvResult::Success;
 }
 
-NvResult nvhost_nvdec_common::GetWaitbase(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_nvdec_common::GetWaitbase(std::span<const u8> input, std::span<u8> output) {
     IoctlGetWaitbase params{};
     LOG_CRITICAL(Service_NVDRV, "called WAITBASE");
     std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
@@ -142,7 +141,7 @@ NvResult nvhost_nvdec_common::GetWaitbase(std::span<const u8> input, std::vector
     return NvResult::Success;
 }
 
-NvResult nvhost_nvdec_common::MapBuffer(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_nvdec_common::MapBuffer(std::span<const u8> input, std::span<u8> output) {
     IoctlMapBuffer params{};
     std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
     std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
@@ -159,7 +158,7 @@ NvResult nvhost_nvdec_common::MapBuffer(std::span<const u8> input, std::vector<u
     return NvResult::Success;
 }
 
-NvResult nvhost_nvdec_common::UnmapBuffer(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_nvdec_common::UnmapBuffer(std::span<const u8> input, std::span<u8> output) {
     IoctlMapBuffer params{};
     std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
     std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
@@ -173,7 +172,7 @@ NvResult nvhost_nvdec_common::UnmapBuffer(std::span<const u8> input, std::vector
     return NvResult::Success;
 }
 
-NvResult nvhost_nvdec_common::SetSubmitTimeout(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_nvdec_common::SetSubmitTimeout(std::span<const u8> input, std::span<u8> output) {
     std::memcpy(&submit_timeout, input.data(), input.size());
     LOG_WARNING(Service_NVDRV, "(STUBBED) called");
     return NvResult::Success;
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
index 5af26a26f..9bb573bfe 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -108,12 +108,12 @@ protected:
 
     /// Ioctl command implementations
     NvResult SetNVMAPfd(std::span<const u8> input);
-    NvResult Submit(DeviceFD fd, std::span<const u8> input, std::vector<u8>& output);
-    NvResult GetSyncpoint(std::span<const u8> input, std::vector<u8>& output);
-    NvResult GetWaitbase(std::span<const u8> input, std::vector<u8>& output);
-    NvResult MapBuffer(std::span<const u8> input, std::vector<u8>& output);
-    NvResult UnmapBuffer(std::span<const u8> input, std::vector<u8>& output);
-    NvResult SetSubmitTimeout(std::span<const u8> input, std::vector<u8>& output);
+    NvResult Submit(DeviceFD fd, std::span<const u8> input, std::span<u8> output);
+    NvResult GetSyncpoint(std::span<const u8> input, std::span<u8> output);
+    NvResult GetWaitbase(std::span<const u8> input, std::span<u8> output);
+    NvResult MapBuffer(std::span<const u8> input, std::span<u8> output);
+    NvResult UnmapBuffer(std::span<const u8> input, std::span<u8> output);
+    NvResult SetSubmitTimeout(std::span<const u8> input, std::span<u8> output);
 
     Kernel::KEvent* QueryEvent(u32 event_id) override;
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
index 39f30e7c8..a05c8cdae 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
@@ -13,7 +13,7 @@ nvhost_nvjpg::nvhost_nvjpg(Core::System& system_) : nvdevice{system_} {}
 nvhost_nvjpg::~nvhost_nvjpg() = default;
 
 NvResult nvhost_nvjpg::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::vector<u8>& output) {
+                              std::span<u8> output) {
     switch (command.group) {
     case 'H':
         switch (command.cmd) {
@@ -32,13 +32,13 @@ NvResult nvhost_nvjpg::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> in
 }
 
 NvResult nvhost_nvjpg::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::span<const u8> inline_input, std::vector<u8>& output) {
+                              std::span<const u8> inline_input, std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
 NvResult nvhost_nvjpg::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                              std::vector<u8>& output, std::vector<u8>& inline_output) {
+                              std::span<u8> output, std::span<u8> inline_output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
@@ -46,7 +46,7 @@ NvResult nvhost_nvjpg::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> in
 void nvhost_nvjpg::OnOpen(DeviceFD fd) {}
 void nvhost_nvjpg::OnClose(DeviceFD fd) {}
 
-NvResult nvhost_nvjpg::SetNVMAPfd(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvhost_nvjpg::SetNVMAPfd(std::span<const u8> input, std::span<u8> output) {
     IoctlSetNvmapFD params{};
     std::memcpy(&params, input.data(), input.size());
     LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
index 41b57e872..5623e0d47 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
@@ -16,11 +16,11 @@ public:
     ~nvhost_nvjpg() override;
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
@@ -33,7 +33,7 @@ private:
 
     s32_le nvmap_fd{};
 
-    NvResult SetNVMAPfd(std::span<const u8> input, std::vector<u8>& output);
+    NvResult SetNVMAPfd(std::span<const u8> input, std::span<u8> output);
 };
 
 } // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
index b0ea402a7..c0b8684c3 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -16,7 +16,7 @@ nvhost_vic::nvhost_vic(Core::System& system_, NvCore::Container& core_)
 nvhost_vic::~nvhost_vic() = default;
 
 NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::vector<u8>& output) {
+                            std::span<u8> output) {
     switch (command.group) {
     case 0x0:
         switch (command.cmd) {
@@ -56,13 +56,13 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> inpu
 }
 
 NvResult nvhost_vic::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::span<const u8> inline_input, std::vector<u8>& output) {
+                            std::span<const u8> inline_input, std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
 NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                            std::vector<u8>& output, std::vector<u8>& inline_output) {
+                            std::span<u8> output, std::span<u8> inline_output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.h b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
index b5e350a83..cadbcb0a5 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
@@ -13,11 +13,11 @@ public:
     ~nvhost_vic();
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
diff --git a/src/core/hle/service/nvdrv/devices/nvmap.cpp b/src/core/hle/service/nvdrv/devices/nvmap.cpp
index 07417f045..e7f7e273b 100644
--- a/src/core/hle/service/nvdrv/devices/nvmap.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvmap.cpp
@@ -26,7 +26,7 @@ nvmap::nvmap(Core::System& system_, NvCore::Container& container_)
 nvmap::~nvmap() = default;
 
 NvResult nvmap::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                       std::vector<u8>& output) {
+                       std::span<u8> output) {
     switch (command.group) {
     case 0x1:
         switch (command.cmd) {
@@ -55,13 +55,13 @@ NvResult nvmap::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
 }
 
 NvResult nvmap::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                       std::span<const u8> inline_input, std::vector<u8>& output) {
+                       std::span<const u8> inline_input, std::span<u8> output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
 
-NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                       std::vector<u8>& output, std::vector<u8>& inline_output) {
+NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                       std::span<u8> inline_output) {
     UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
     return NvResult::NotImplemented;
 }
@@ -69,7 +69,7 @@ NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
 void nvmap::OnOpen(DeviceFD fd) {}
 void nvmap::OnClose(DeviceFD fd) {}
 
-NvResult nvmap::IocCreate(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvmap::IocCreate(std::span<const u8> input, std::span<u8> output) {
     IocCreateParams params;
     std::memcpy(&params, input.data(), sizeof(params));
     LOG_DEBUG(Service_NVDRV, "called, size=0x{:08X}", params.size);
@@ -89,7 +89,7 @@ NvResult nvmap::IocCreate(std::span<const u8> input, std::vector<u8>& output) {
     return NvResult::Success;
 }
 
-NvResult nvmap::IocAlloc(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvmap::IocAlloc(std::span<const u8> input, std::span<u8> output) {
     IocAllocParams params;
     std::memcpy(&params, input.data(), sizeof(params));
     LOG_DEBUG(Service_NVDRV, "called, addr={:X}", params.address);
@@ -137,7 +137,7 @@ NvResult nvmap::IocAlloc(std::span<const u8> input, std::vector<u8>& output) {
     return result;
 }
 
-NvResult nvmap::IocGetId(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvmap::IocGetId(std::span<const u8> input, std::span<u8> output) {
     IocGetIdParams params;
     std::memcpy(&params, input.data(), sizeof(params));
 
@@ -161,7 +161,7 @@ NvResult nvmap::IocGetId(std::span<const u8> input, std::vector<u8>& output) {
     return NvResult::Success;
 }
 
-NvResult nvmap::IocFromId(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvmap::IocFromId(std::span<const u8> input, std::span<u8> output) {
     IocFromIdParams params;
     std::memcpy(&params, input.data(), sizeof(params));
 
@@ -192,7 +192,7 @@ NvResult nvmap::IocFromId(std::span<const u8> input, std::vector<u8>& output) {
     return NvResult::Success;
 }
 
-NvResult nvmap::IocParam(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvmap::IocParam(std::span<const u8> input, std::span<u8> output) {
     enum class ParamTypes { Size = 1, Alignment = 2, Base = 3, Heap = 4, Kind = 5, Compr = 6 };
 
     IocParamParams params;
@@ -241,7 +241,7 @@ NvResult nvmap::IocParam(std::span<const u8> input, std::vector<u8>& output) {
     return NvResult::Success;
 }
 
-NvResult nvmap::IocFree(std::span<const u8> input, std::vector<u8>& output) {
+NvResult nvmap::IocFree(std::span<const u8> input, std::span<u8> output) {
     IocFreeParams params;
     std::memcpy(&params, input.data(), sizeof(params));
 
diff --git a/src/core/hle/service/nvdrv/devices/nvmap.h b/src/core/hle/service/nvdrv/devices/nvmap.h
index 82bd3b118..40c65b430 100644
--- a/src/core/hle/service/nvdrv/devices/nvmap.h
+++ b/src/core/hle/service/nvdrv/devices/nvmap.h
@@ -27,11 +27,11 @@ public:
     nvmap& operator=(const nvmap&) = delete;
 
     NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::vector<u8>& output) override;
+                    std::span<u8> output) override;
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output) override;
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::span<const u8> inline_input, std::span<u8> output) override;
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output) override;
 
     void OnOpen(DeviceFD fd) override;
     void OnClose(DeviceFD fd) override;
@@ -106,12 +106,12 @@ private:
     };
     static_assert(sizeof(IocGetIdParams) == 8, "IocGetIdParams has wrong size");
 
-    NvResult IocCreate(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocAlloc(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocGetId(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocFromId(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocParam(std::span<const u8> input, std::vector<u8>& output);
-    NvResult IocFree(std::span<const u8> input, std::vector<u8>& output);
+    NvResult IocCreate(std::span<const u8> input, std::span<u8> output);
+    NvResult IocAlloc(std::span<const u8> input, std::span<u8> output);
+    NvResult IocGetId(std::span<const u8> input, std::span<u8> output);
+    NvResult IocFromId(std::span<const u8> input, std::span<u8> output);
+    NvResult IocParam(std::span<const u8> input, std::span<u8> output);
+    NvResult IocFree(std::span<const u8> input, std::span<u8> output);
 
     NvCore::Container& container;
     NvCore::NvMap& file;
diff --git a/src/core/hle/service/nvdrv/nvdrv.cpp b/src/core/hle/service/nvdrv/nvdrv.cpp
index 3d774eec4..9e46ee8dd 100644
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@@ -130,7 +130,7 @@ DeviceFD Module::Open(const std::string& device_name) {
 }
 
 NvResult Module::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                        std::vector<u8>& output) {
+                        std::span<u8> output) {
     if (fd < 0) {
         LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd);
         return NvResult::InvalidState;
@@ -147,7 +147,7 @@ NvResult Module::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input,
 }
 
 NvResult Module::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                        std::span<const u8> inline_input, std::vector<u8>& output) {
+                        std::span<const u8> inline_input, std::span<u8> output) {
     if (fd < 0) {
         LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd);
         return NvResult::InvalidState;
@@ -163,8 +163,8 @@ NvResult Module::Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
     return itr->second->Ioctl2(fd, command, input, inline_input, output);
 }
 
-NvResult Module::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                        std::vector<u8>& output, std::vector<u8>& inline_output) {
+NvResult Module::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                        std::span<u8> inline_output) {
     if (fd < 0) {
         LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd);
         return NvResult::InvalidState;
diff --git a/src/core/hle/service/nvdrv/nvdrv.h b/src/core/hle/service/nvdrv/nvdrv.h
index 668be742b..d8622b3ca 100644
--- a/src/core/hle/service/nvdrv/nvdrv.h
+++ b/src/core/hle/service/nvdrv/nvdrv.h
@@ -80,13 +80,13 @@ public:
     DeviceFD Open(const std::string& device_name);
 
     /// Sends an ioctl command to the specified file descriptor.
-    NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output);
+    NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output);
 
     NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span<const u8> input,
-                    std::span<const u8> inline_input, std::vector<u8>& output);
+                    std::span<const u8> inline_input, std::span<u8> output);
 
-    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output);
+    NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> input, std::span<u8> output,
+                    std::span<u8> inline_output);
 
     /// Closes a device file descriptor and returns operation success.
     NvResult Close(DeviceFD fd);
diff --git a/src/core/hle/service/nvdrv/nvdrv_interface.cpp b/src/core/hle/service/nvdrv/nvdrv_interface.cpp
index d010a1e03..348207e25 100644
--- a/src/core/hle/service/nvdrv/nvdrv_interface.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv_interface.cpp
@@ -63,12 +63,12 @@ void NVDRV::Ioctl1(HLERequestContext& ctx) {
     }
 
     // Check device
-    std::vector<u8> output_buffer(ctx.GetWriteBufferSize(0));
+    tmp_output.resize_destructive(ctx.GetWriteBufferSize(0));
     const auto input_buffer = ctx.ReadBuffer(0);
 
-    const auto nv_result = nvdrv->Ioctl1(fd, command, input_buffer, output_buffer);
+    const auto nv_result = nvdrv->Ioctl1(fd, command, input_buffer, tmp_output);
     if (command.is_out != 0) {
-        ctx.WriteBuffer(output_buffer);
+        ctx.WriteBuffer(tmp_output);
     }
 
     IPC::ResponseBuilder rb{ctx, 3};
@@ -90,12 +90,12 @@ void NVDRV::Ioctl2(HLERequestContext& ctx) {
 
     const auto input_buffer = ctx.ReadBuffer(0);
     const auto input_inlined_buffer = ctx.ReadBuffer(1);
-    std::vector<u8> output_buffer(ctx.GetWriteBufferSize(0));
+    tmp_output.resize_destructive(ctx.GetWriteBufferSize(0));
 
     const auto nv_result =
-        nvdrv->Ioctl2(fd, command, input_buffer, input_inlined_buffer, output_buffer);
+        nvdrv->Ioctl2(fd, command, input_buffer, input_inlined_buffer, tmp_output);
     if (command.is_out != 0) {
-        ctx.WriteBuffer(output_buffer);
+        ctx.WriteBuffer(tmp_output);
     }
 
     IPC::ResponseBuilder rb{ctx, 3};
@@ -116,14 +116,12 @@ void NVDRV::Ioctl3(HLERequestContext& ctx) {
     }
 
     const auto input_buffer = ctx.ReadBuffer(0);
-    std::vector<u8> output_buffer(ctx.GetWriteBufferSize(0));
-    std::vector<u8> output_buffer_inline(ctx.GetWriteBufferSize(1));
-
-    const auto nv_result =
-        nvdrv->Ioctl3(fd, command, input_buffer, output_buffer, output_buffer_inline);
+    tmp_output.resize_destructive(ctx.GetWriteBufferSize(0));
+    tmp_output_inline.resize_destructive(ctx.GetWriteBufferSize(1));
+    const auto nv_result = nvdrv->Ioctl3(fd, command, input_buffer, tmp_output, tmp_output_inline);
     if (command.is_out != 0) {
-        ctx.WriteBuffer(output_buffer, 0);
-        ctx.WriteBuffer(output_buffer_inline, 1);
+        ctx.WriteBuffer(tmp_output, 0);
+        ctx.WriteBuffer(tmp_output_inline, 1);
     }
 
     IPC::ResponseBuilder rb{ctx, 3};
diff --git a/src/core/hle/service/nvdrv/nvdrv_interface.h b/src/core/hle/service/nvdrv/nvdrv_interface.h
index 881ea1a6b..4b593ff90 100644
--- a/src/core/hle/service/nvdrv/nvdrv_interface.h
+++ b/src/core/hle/service/nvdrv/nvdrv_interface.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <memory>
+#include "common/scratch_buffer.h"
 #include "core/hle/service/nvdrv/nvdrv.h"
 #include "core/hle/service/service.h"
 
@@ -33,6 +34,8 @@ private:
 
     u64 pid{};
     bool is_initialized{};
+    Common::ScratchBuffer<u8> tmp_output;
+    Common::ScratchBuffer<u8> tmp_output_inline;
 };
 
 } // namespace Service::Nvidia
diff --git a/src/core/hle/service/nvnflinger/parcel.h b/src/core/hle/service/nvnflinger/parcel.h
index fb56d75d7..23ba315a0 100644
--- a/src/core/hle/service/nvnflinger/parcel.h
+++ b/src/core/hle/service/nvnflinger/parcel.h
@@ -6,6 +6,7 @@
 #include <memory>
 #include <span>
 #include <vector>
+#include <boost/container/small_vector.hpp>
 
 #include "common/alignment.h"
 #include "common/assert.h"
@@ -167,7 +168,7 @@ public:
 private:
     template <typename T>
         requires(std::is_trivially_copyable_v<T>)
-    void WriteImpl(const T& val, std::vector<u8>& buffer) {
+    void WriteImpl(const T& val, boost::container::small_vector<u8, 0x200>& buffer) {
         const size_t aligned_size = Common::AlignUp(sizeof(T), 4);
         const size_t old_size = buffer.size();
         buffer.resize(old_size + aligned_size);
@@ -176,8 +177,8 @@ private:
     }
 
 private:
-    std::vector<u8> m_data_buffer;
-    std::vector<u8> m_object_buffer;
+    boost::container::small_vector<u8, 0x200> m_data_buffer;
+    boost::container::small_vector<u8, 0x200> m_object_buffer;
 };
 
 } // namespace Service::android
diff --git a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
index c3c2281bb..9ff4028c2 100644
--- a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
+++ b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
@@ -479,7 +479,7 @@ void EmitContext::DefineGenericOutput(size_t index, u32 invocations) {
         const u32 remainder{4 - element};
         const TransformFeedbackVarying* xfb_varying{};
         const size_t xfb_varying_index{base_index + element};
-        if (xfb_varying_index < runtime_info.xfb_varyings.size()) {
+        if (xfb_varying_index < runtime_info.xfb_count) {
             xfb_varying = &runtime_info.xfb_varyings[xfb_varying_index];
             xfb_varying = xfb_varying->components > 0 ? xfb_varying : nullptr;
         }
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index 0f86a8004..34592a01f 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -387,7 +387,7 @@ void SetupSignedNanCapabilities(const Profile& profile, const IR::Program& progr
 }
 
 void SetupTransformFeedbackCapabilities(EmitContext& ctx, Id main_func) {
-    if (ctx.runtime_info.xfb_varyings.empty()) {
+    if (ctx.runtime_info.xfb_count == 0) {
         return;
     }
     ctx.AddCapability(spv::Capability::TransformFeedback);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index fd15f47ea..bec5db173 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -160,7 +160,7 @@ void DefineGenericOutput(EmitContext& ctx, size_t index, std::optional<u32> invo
         const u32 remainder{4 - element};
         const TransformFeedbackVarying* xfb_varying{};
         const size_t xfb_varying_index{base_attr_index + element};
-        if (xfb_varying_index < ctx.runtime_info.xfb_varyings.size()) {
+        if (xfb_varying_index < ctx.runtime_info.xfb_count) {
             xfb_varying = &ctx.runtime_info.xfb_varyings[xfb_varying_index];
             xfb_varying = xfb_varying->components > 0 ? xfb_varying : nullptr;
         }
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 3b63c249f..619c0b138 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -84,7 +84,8 @@ struct RuntimeInfo {
     bool glasm_use_storage_buffers{};
 
     /// Transform feedback state for each varying
-    std::vector<TransformFeedbackVarying> xfb_varyings;
+    std::array<TransformFeedbackVarying, 256> xfb_varyings{};
+    u32 xfb_count{0};
 };
 
 } // namespace Shader
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 45977d578..58a45ab67 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -207,7 +207,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
     if (has_new_downloads) {
         memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount);
     }
-    tmp_buffer.resize(amount);
+    tmp_buffer.resize_destructive(amount);
     cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount);
     cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount);
     return true;
@@ -1279,7 +1279,7 @@ template <class P>
 typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu_addr,
                                                                        u32 wanted_size) {
     static constexpr int STREAM_LEAP_THRESHOLD = 16;
-    std::vector<BufferId> overlap_ids;
+    boost::container::small_vector<BufferId, 16> overlap_ids;
     VAddr begin = cpu_addr;
     VAddr end = cpu_addr + wanted_size;
     int stream_score = 0;
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
index 63a120f7a..fe6068cfe 100644
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -229,7 +229,7 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<BufferCacheChannelInf
     using OverlapCounter = boost::icl::split_interval_map<VAddr, int>;
 
     struct OverlapResult {
-        std::vector<BufferId> ids;
+        boost::container::small_vector<BufferId, 16> ids;
         VAddr begin;
         VAddr end;
         bool has_stream_leap = false;
@@ -582,7 +582,7 @@ private:
     BufferId inline_buffer_id;
 
     std::array<BufferId, ((1ULL << 39) >> CACHING_PAGEBITS)> page_table;
-    std::vector<u8> tmp_buffer;
+    Common::ScratchBuffer<u8> tmp_buffer;
 };
 
 } // namespace VideoCommon
diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h
index 83112dfce..7d660af47 100644
--- a/src/video_core/cdma_pusher.h
+++ b/src/video_core/cdma_pusher.h
@@ -63,7 +63,6 @@ struct ChCommand {
 };
 
 using ChCommandHeaderList = std::vector<ChCommandHeader>;
-using ChCommandList = std::vector<ChCommand>;
 
 struct ThiRegisters {
     u32_le increment_syncpt{};
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index 1cdb690ed..8a2784cdc 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -6,6 +6,7 @@
 #include <array>
 #include <span>
 #include <vector>
+#include <boost/container/small_vector.hpp>
 #include <queue>
 
 #include "common/bit_field.h"
@@ -102,11 +103,12 @@ inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, Sub
 struct CommandList final {
     CommandList() = default;
     explicit CommandList(std::size_t size) : command_lists(size) {}
-    explicit CommandList(std::vector<CommandHeader>&& prefetch_command_list_)
+    explicit CommandList(
+        boost::container::small_vector<CommandHeader, 512>&& prefetch_command_list_)
         : prefetch_command_list{std::move(prefetch_command_list_)} {}
 
-    std::vector<CommandListHeader> command_lists;
-    std::vector<CommandHeader> prefetch_command_list;
+    boost::container::small_vector<CommandListHeader, 512> command_lists;
+    boost::container::small_vector<CommandHeader, 512> prefetch_command_list;
 };
 
 /**
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index ebe5536de..bc1eb41e7 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -108,9 +108,11 @@ void MaxwellDMA::Launch() {
         if (regs.launch_dma.remap_enable != 0 && is_const_a_dst) {
             ASSERT(regs.remap_const.component_size_minus_one == 3);
             accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value);
-            std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value);
+            read_buffer.resize_destructive(regs.line_length_in * sizeof(u32));
+            std::span<u32> span(reinterpret_cast<u32*>(read_buffer.data()), regs.line_length_in);
+            std::ranges::fill(span, regs.remap_consta_value);
             memory_manager.WriteBlockUnsafe(regs.offset_out,
-                                            reinterpret_cast<u8*>(tmp_buffer.data()),
+                                            reinterpret_cast<u8*>(read_buffer.data()),
                                             regs.line_length_in * sizeof(u32));
         } else {
             memory_manager.FlushCaching();
@@ -126,32 +128,32 @@ void MaxwellDMA::Launch() {
                 UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
                 UNIMPLEMENTED_IF(regs.offset_in % 16 != 0);
                 UNIMPLEMENTED_IF(regs.offset_out % 16 != 0);
-                std::vector<u8> tmp_buffer(16);
+                read_buffer.resize_destructive(16);
                 for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
                     memory_manager.ReadBlockUnsafe(
                         convert_linear_2_blocklinear_addr(regs.offset_in + offset),
-                        tmp_buffer.data(), tmp_buffer.size());
-                    memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
-                                                    tmp_buffer.size());
+                        read_buffer.data(), read_buffer.size());
+                    memory_manager.WriteBlockCached(regs.offset_out + offset, read_buffer.data(),
+                                                    read_buffer.size());
                 }
             } else if (is_src_pitch && !is_dst_pitch) {
                 UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
                 UNIMPLEMENTED_IF(regs.offset_in % 16 != 0);
                 UNIMPLEMENTED_IF(regs.offset_out % 16 != 0);
-                std::vector<u8> tmp_buffer(16);
+                read_buffer.resize_destructive(16);
                 for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
-                    memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
-                                                   tmp_buffer.size());
+                    memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(),
+                                                   read_buffer.size());
                     memory_manager.WriteBlockCached(
                         convert_linear_2_blocklinear_addr(regs.offset_out + offset),
-                        tmp_buffer.data(), tmp_buffer.size());
+                        read_buffer.data(), read_buffer.size());
                 }
             } else {
                 if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
-                    std::vector<u8> tmp_buffer(regs.line_length_in);
-                    memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
+                    read_buffer.resize_destructive(regs.line_length_in);
+                    memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(),
                                                    regs.line_length_in);
-                    memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
+                    memory_manager.WriteBlockCached(regs.offset_out, read_buffer.data(),
                                                     regs.line_length_in);
                 }
             }
@@ -171,7 +173,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
     src_operand.address = regs.offset_in;
 
     DMA::BufferOperand dst_operand;
-    dst_operand.pitch = regs.pitch_out;
+    u32 abs_pitch_out = std::abs(static_cast<s32>(regs.pitch_out));
+    dst_operand.pitch = abs_pitch_out;
     dst_operand.width = regs.line_length_in;
     dst_operand.height = regs.line_count;
     dst_operand.address = regs.offset_out;
@@ -218,7 +221,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
     const size_t src_size =
         CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
 
-    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
+    const size_t dst_size = static_cast<size_t>(abs_pitch_out) * regs.line_count;
     read_buffer.resize_destructive(src_size);
     write_buffer.resize_destructive(dst_size);
 
@@ -227,7 +230,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
 
     UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
                      src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
-                     regs.pitch_out);
+                     abs_pitch_out);
 
     memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp
index 6ce179167..ce827eb6c 100644
--- a/src/video_core/host1x/codecs/h264.cpp
+++ b/src/video_core/host1x/codecs/h264.cpp
@@ -4,6 +4,7 @@
 #include <array>
 #include <bit>
 
+#include "common/scratch_buffer.h"
 #include "common/settings.h"
 #include "video_core/host1x/codecs/h264.h"
 #include "video_core/host1x/host1x.h"
@@ -188,7 +189,8 @@ void H264BitWriter::WriteBit(bool state) {
 }
 
 void H264BitWriter::WriteScalingList(std::span<const u8> list, s32 start, s32 count) {
-    std::vector<u8> scan(count);
+    static Common::ScratchBuffer<u8> scan{};
+    scan.resize_destructive(count);
     if (count == 16) {
         std::memcpy(scan.data(), zig_zag_scan.data(), scan.size());
     } else {
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index b2f7e160a..45141e488 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -587,7 +587,7 @@ void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size,
 
 void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size,
                               VideoCommon::CacheType which) {
-    std::vector<u8> tmp_buffer(size);
+    tmp_buffer.resize_destructive(size);
     ReadBlock(gpu_src_addr, tmp_buffer.data(), size, which);
 
     // The output block must be flushed in case it has data modified from the GPU.
@@ -670,9 +670,9 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
     return result;
 }
 
-std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
-    GPUVAddr gpu_addr, std::size_t size) const {
-    std::vector<std::pair<GPUVAddr, std::size_t>> result{};
+boost::container::small_vector<std::pair<GPUVAddr, std::size_t>, 32>
+MemoryManager::GetSubmappedRange(GPUVAddr gpu_addr, std::size_t size) const {
+    boost::container::small_vector<std::pair<GPUVAddr, std::size_t>, 32> result{};
     GetSubmappedRangeImpl<true>(gpu_addr, size, result);
     return result;
 }
@@ -680,8 +680,9 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
 template <bool is_gpu_address>
 void MemoryManager::GetSubmappedRangeImpl(
     GPUVAddr gpu_addr, std::size_t size,
-    std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
-        result) const {
+    boost::container::small_vector<
+        std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>, 32>& result)
+    const {
     std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
         last_segment{};
     std::optional<VAddr> old_page_addr{};
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 794535122..4202c26ff 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -8,10 +8,12 @@
 #include <mutex>
 #include <optional>
 #include <vector>
+#include <boost/container/small_vector.hpp>
 
 #include "common/common_types.h"
 #include "common/multi_level_page_table.h"
 #include "common/range_map.h"
+#include "common/scratch_buffer.h"
 #include "common/virtual_buffer.h"
 #include "video_core/cache_types.h"
 #include "video_core/pte_kind.h"
@@ -107,8 +109,8 @@ public:
      * if the region is continuous, a single pair will be returned. If it's unmapped, an empty
      * vector will be returned;
      */
-    std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr,
-                                                                    std::size_t size) const;
+    boost::container::small_vector<std::pair<GPUVAddr, std::size_t>, 32> GetSubmappedRange(
+        GPUVAddr gpu_addr, std::size_t size) const;
 
     GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size,
                  PTEKind kind = PTEKind::INVALID, bool is_big_pages = true);
@@ -165,7 +167,8 @@ private:
     template <bool is_gpu_address>
     void GetSubmappedRangeImpl(
         GPUVAddr gpu_addr, std::size_t size,
-        std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+        boost::container::small_vector<
+            std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>, 32>&
             result) const;
 
     Core::System& system;
@@ -215,8 +218,8 @@ private:
     Common::VirtualBuffer<u32> big_page_table_cpu;
 
     std::vector<u64> big_page_continuous;
-    std::vector<std::pair<VAddr, std::size_t>> page_stash{};
-    std::vector<std::pair<VAddr, std::size_t>> page_stash2{};
+    boost::container::small_vector<std::pair<VAddr, std::size_t>, 32> page_stash{};
+    boost::container::small_vector<std::pair<VAddr, std::size_t>, 32> page_stash2{};
 
     mutable std::mutex guard;
 
@@ -226,6 +229,8 @@ private:
     std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;
 
     static std::atomic<size_t> unique_identifier_generator;
+
+    Common::ScratchBuffer<u8> tmp_buffer;
 };
 
 } // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 3f077311e..0329ed820 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -85,7 +85,9 @@ Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key,
     case Shader::Stage::VertexB:
     case Shader::Stage::Geometry:
         if (!use_assembly_shaders && key.xfb_enabled != 0) {
-            info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state);
+            auto [varyings, count] = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state);
+            info.xfb_varyings = varyings;
+            info.xfb_count = count;
         }
         break;
     case Shader::Stage::TessellationEval:
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index e30fcb1ed..f47301ad5 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -361,7 +361,7 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
         .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
     };
     // Measuring a popular game, this number never exceeds the specified size once data is warmed up
-    boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size());
+    boost::container::small_vector<VkBufferCopy, 8> vk_copies(copies.size());
     std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy);
     scheduler.RequestOutsideRenderPassOperationContext();
     scheduler.Record([src_buffer, dst_buffer, vk_copies, barrier](vk::CommandBuffer cmdbuf) {
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index a2cfb2105..9f316113c 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -167,7 +167,10 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span<const Shader::IR::Program> program
                 info.fixed_state_point_size = point_size;
             }
             if (key.state.xfb_enabled) {
-                info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state);
+                auto [varyings, count] =
+                    VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state);
+                info.xfb_varyings = varyings;
+                info.xfb_count = count;
             }
             info.convert_depth_mode = gl_ndc;
         }
@@ -214,7 +217,10 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span<const Shader::IR::Program> program
             info.fixed_state_point_size = point_size;
         }
         if (key.state.xfb_enabled != 0) {
-            info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state);
+            auto [varyings, count] =
+                VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state);
+            info.xfb_varyings = varyings;
+            info.xfb_count = count;
         }
         info.convert_depth_mode = gl_ndc;
         break;
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index f025f618b..f3cef09dd 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -330,9 +330,9 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     };
 }
 
-[[maybe_unused]] [[nodiscard]] std::vector<VkBufferCopy> TransformBufferCopies(
-    std::span<const VideoCommon::BufferCopy> copies, size_t buffer_offset) {
-    std::vector<VkBufferCopy> result(copies.size());
+[[maybe_unused]] [[nodiscard]] boost::container::small_vector<VkBufferCopy, 16>
+TransformBufferCopies(std::span<const VideoCommon::BufferCopy> copies, size_t buffer_offset) {
+    boost::container::small_vector<VkBufferCopy, 16> result(copies.size());
     std::ranges::transform(
         copies, result.begin(), [buffer_offset](const VideoCommon::BufferCopy& copy) {
             return VkBufferCopy{
@@ -344,7 +344,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     return result;
 }
 
-[[nodiscard]] std::vector<VkBufferImageCopy> TransformBufferImageCopies(
+[[nodiscard]] boost::container::small_vector<VkBufferImageCopy, 16> TransformBufferImageCopies(
     std::span<const BufferImageCopy> copies, size_t buffer_offset, VkImageAspectFlags aspect_mask) {
     struct Maker {
         VkBufferImageCopy operator()(const BufferImageCopy& copy) const {
@@ -377,14 +377,14 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
         VkImageAspectFlags aspect_mask;
     };
     if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
-        std::vector<VkBufferImageCopy> result(copies.size() * 2);
+        boost::container::small_vector<VkBufferImageCopy, 16> result(copies.size() * 2);
         std::ranges::transform(copies, result.begin(),
                                Maker{buffer_offset, VK_IMAGE_ASPECT_DEPTH_BIT});
         std::ranges::transform(copies, result.begin() + copies.size(),
                                Maker{buffer_offset, VK_IMAGE_ASPECT_STENCIL_BIT});
         return result;
     } else {
-        std::vector<VkBufferImageCopy> result(copies.size());
+        boost::container::small_vector<VkBufferImageCopy, 16> result(copies.size());
         std::ranges::transform(copies, result.begin(), Maker{buffer_offset, aspect_mask});
         return result;
     }
@@ -867,8 +867,8 @@ void TextureCacheRuntime::BarrierFeedbackLoop() {
 
 void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src,
                                            std::span<const VideoCommon::ImageCopy> copies) {
-    std::vector<VkBufferImageCopy> vk_in_copies(copies.size());
-    std::vector<VkBufferImageCopy> vk_out_copies(copies.size());
+    boost::container::small_vector<VkBufferImageCopy, 16> vk_in_copies(copies.size());
+    boost::container::small_vector<VkBufferImageCopy, 16> vk_out_copies(copies.size());
     const VkImageAspectFlags src_aspect_mask = src.AspectMask();
     const VkImageAspectFlags dst_aspect_mask = dst.AspectMask();
 
@@ -1157,7 +1157,7 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im
 
 void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
                                     std::span<const VideoCommon::ImageCopy> copies) {
-    std::vector<VkImageCopy> vk_copies(copies.size());
+    boost::container::small_vector<VkImageCopy, 16> vk_copies(copies.size());
     const VkImageAspectFlags aspect_mask = dst.AspectMask();
     ASSERT(aspect_mask == src.AspectMask());
 
@@ -1332,7 +1332,7 @@ void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
         ScaleDown(true);
     }
     scheduler->RequestOutsideRenderPassOperationContext();
-    std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask);
+    auto vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask);
     const VkBuffer src_buffer = buffer;
     const VkImage vk_image = *original_image;
     const VkImageAspectFlags vk_aspect_mask = aspect_mask;
@@ -1367,8 +1367,9 @@ void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<VkDeviceS
     if (is_rescaled) {
         ScaleDown();
     }
-    boost::container::small_vector<VkBuffer, 1> buffers_vector{};
-    boost::container::small_vector<std::vector<VkBufferImageCopy>, 1> vk_copies;
+    boost::container::small_vector<VkBuffer, 8> buffers_vector{};
+    boost::container::small_vector<boost::container::small_vector<VkBufferImageCopy, 16>, 8>
+        vk_copies;
     for (size_t index = 0; index < buffers_span.size(); index++) {
         buffers_vector.emplace_back(buffers_span[index]);
         vk_copies.emplace_back(
@@ -1858,7 +1859,7 @@ Framebuffer::~Framebuffer() = default;
 void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
                                     std::span<ImageView*, NUM_RT> color_buffers,
                                     ImageView* depth_buffer, bool is_rescaled) {
-    std::vector<VkImageView> attachments;
+    boost::container::small_vector<VkImageView, NUM_RT + 1> attachments;
     RenderPassKey renderpass_key{};
     s32 num_layers = 1;
 
diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp
index c5213875b..4db948b6d 100644
--- a/src/video_core/shader_cache.cpp
+++ b/src/video_core/shader_cache.cpp
@@ -151,11 +151,9 @@ void ShaderCache::RemovePendingShaders() {
     marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()),
                              marked_for_removal.end());
 
-    std::vector<ShaderInfo*> removed_shaders;
-    removed_shaders.reserve(marked_for_removal.size());
+    boost::container::small_vector<ShaderInfo*, 16> removed_shaders;
 
     std::scoped_lock lock{lookup_mutex};
-
     for (Entry* const entry : marked_for_removal) {
         removed_shaders.push_back(entry->data);
 
diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h
index 1b8a17ee8..55d49d017 100644
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@@ -6,6 +6,7 @@
 #include <array>
 #include <optional>
 #include <vector>
+#include <boost/container/small_vector.hpp>
 
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -108,8 +109,8 @@ struct ImageBase {
     std::vector<ImageViewInfo> image_view_infos;
     std::vector<ImageViewId> image_view_ids;
 
-    std::vector<u32> slice_offsets;
-    std::vector<SubresourceBase> slice_subresources;
+    boost::container::small_vector<u32, 16> slice_offsets;
+    boost::container::small_vector<SubresourceBase, 16> slice_subresources;
 
     std::vector<AliasedImage> aliased_images;
     std::vector<ImageId> overlapping_images;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d58bb69ff..d3f03a995 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -526,7 +526,7 @@ void TextureCache<P>::WriteMemory(VAddr cpu_addr, size_t size) {
 
 template <class P>
 void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
-    std::vector<ImageId> images;
+    boost::container::small_vector<ImageId, 16> images;
     ForEachImageInRegion(cpu_addr, size, [&images](ImageId image_id, ImageBase& image) {
         if (!image.IsSafeDownload()) {
             return;
@@ -579,7 +579,7 @@ std::optional<VideoCore::RasterizerDownloadArea> TextureCache<P>::GetFlushArea(V
 
 template <class P>
 void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) {
-    std::vector<ImageId> deleted_images;
+    boost::container::small_vector<ImageId, 16> deleted_images;
     ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); });
     for (const ImageId id : deleted_images) {
         Image& image = slot_images[id];
@@ -593,7 +593,7 @@ void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) {
 
 template <class P>
 void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) {
-    std::vector<ImageId> deleted_images;
+    boost::container::small_vector<ImageId, 16> deleted_images;
     ForEachImageInRegionGPU(as_id, gpu_addr, size,
                             [&](ImageId id, Image&) { deleted_images.push_back(id); });
     for (const ImageId id : deleted_images) {
@@ -1101,7 +1101,7 @@ ImageId TextureCache<P>::FindImage(const ImageInfo& info, GPUVAddr gpu_addr,
     const bool native_bgr = runtime.HasNativeBgr();
     const bool flexible_formats = True(options & RelaxedOptions::Format);
     ImageId image_id{};
-    boost::container::small_vector<ImageId, 1> image_ids;
+    boost::container::small_vector<ImageId, 8> image_ids;
     const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
         if (True(existing_image.flags & ImageFlagBits::Remapped)) {
             return false;
@@ -1622,7 +1622,7 @@ ImageId TextureCache<P>::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr)
         }
     }
     ImageId image_id{};
-    boost::container::small_vector<ImageId, 1> image_ids;
+    boost::container::small_vector<ImageId, 8> image_ids;
     const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) {
         if (True(existing_image.flags & ImageFlagBits::Remapped)) {
             return false;
@@ -1942,7 +1942,7 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
         image.map_view_id = map_id;
         return;
     }
-    std::vector<ImageViewId> sparse_maps{};
+    boost::container::small_vector<ImageViewId, 16> sparse_maps;
     ForEachSparseSegment(
         image, [this, image_id, &sparse_maps](GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) {
             auto map_id = slot_map_views.insert(gpu_addr, cpu_addr, size, image_id);
@@ -2217,7 +2217,7 @@ void TextureCache<P>::MarkModification(ImageBase& image) noexcept {
 
 template <class P>
 void TextureCache<P>::SynchronizeAliases(ImageId image_id) {
-    boost::container::small_vector<const AliasedImage*, 1> aliased_images;
+    boost::container::small_vector<const AliasedImage*, 8> aliased_images;
     Image& image = slot_images[image_id];
     bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled);
     bool any_modified = True(image.flags & ImageFlagBits::GpuModified);
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index 44232b961..e9ec91265 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -56,7 +56,7 @@ struct ImageViewInOut {
 struct AsyncDecodeContext {
     ImageId image_id;
     Common::ScratchBuffer<u8> decoded_data;
-    std::vector<BufferImageCopy> copies;
+    boost::container::small_vector<BufferImageCopy, 16> copies;
     std::mutex mutex;
     std::atomic_bool complete;
 };
@@ -429,7 +429,7 @@ private:
 
     std::unordered_map<u64, std::vector<ImageMapId>, Common::IdentityHash<u64>> page_table;
     std::unordered_map<u64, std::vector<ImageId>, Common::IdentityHash<u64>> sparse_page_table;
-    std::unordered_map<ImageId, std::vector<ImageViewId>> sparse_views;
+    std::unordered_map<ImageId, boost::container::small_vector<ImageViewId, 16>> sparse_views;
 
     VAddr virtual_invalid_space{};
 
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 95a5b47d8..f781cb7a0 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -329,13 +329,13 @@ template <u32 GOB_EXTENT>
 
 [[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress3D(
     const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) {
-    const std::vector<u32> slice_offsets = CalculateSliceOffsets(new_info);
+    const auto slice_offsets = CalculateSliceOffsets(new_info);
     const u32 diff = static_cast<u32>(overlap.gpu_addr - gpu_addr);
     const auto it = std::ranges::find(slice_offsets, diff);
     if (it == slice_offsets.end()) {
         return std::nullopt;
     }
-    const std::vector subresources = CalculateSliceSubresources(new_info);
+    const auto subresources = CalculateSliceSubresources(new_info);
     const SubresourceBase base = subresources[std::distance(slice_offsets.begin(), it)];
     const ImageInfo& info = overlap.info;
     if (!IsBlockLinearSizeCompatible(new_info, info, base.level, 0, strict_size)) {
@@ -655,9 +655,9 @@ LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept {
     return sizes;
 }
 
-std::vector<u32> CalculateSliceOffsets(const ImageInfo& info) {
+boost::container::small_vector<u32, 16> CalculateSliceOffsets(const ImageInfo& info) {
     ASSERT(info.type == ImageType::e3D);
-    std::vector<u32> offsets;
+    boost::container::small_vector<u32, 16> offsets;
     offsets.reserve(NumSlices(info));
 
     const LevelInfo level_info = MakeLevelInfo(info);
@@ -679,9 +679,10 @@ std::vector<u32> CalculateSliceOffsets(const ImageInfo& info) {
     return offsets;
 }
 
-std::vector<SubresourceBase> CalculateSliceSubresources(const ImageInfo& info) {
+boost::container::small_vector<SubresourceBase, 16> CalculateSliceSubresources(
+    const ImageInfo& info) {
     ASSERT(info.type == ImageType::e3D);
-    std::vector<SubresourceBase> subresources;
+    boost::container::small_vector<SubresourceBase, 16> subresources;
     subresources.reserve(NumSlices(info));
     for (s32 level = 0; level < info.resources.levels; ++level) {
         const s32 depth = AdjustMipSize(info.size.depth, level);
@@ -723,8 +724,10 @@ ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept {
     }
 }
 
-std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src,
-                                             SubresourceBase base, u32 up_scale, u32 down_shift) {
+boost::container::small_vector<ImageCopy, 16> MakeShrinkImageCopies(const ImageInfo& dst,
+                                                                    const ImageInfo& src,
+                                                                    SubresourceBase base,
+                                                                    u32 up_scale, u32 down_shift) {
     ASSERT(dst.resources.levels >= src.resources.levels);
 
     const bool is_dst_3d = dst.type == ImageType::e3D;
@@ -733,7 +736,7 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn
         ASSERT(src.resources.levels == 1);
     }
     const bool both_2d{src.type == ImageType::e2D && dst.type == ImageType::e2D};
-    std::vector<ImageCopy> copies;
+    boost::container::small_vector<ImageCopy, 16> copies;
     copies.reserve(src.resources.levels);
     for (s32 level = 0; level < src.resources.levels; ++level) {
         ImageCopy& copy = copies.emplace_back();
@@ -770,9 +773,10 @@ std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn
     return copies;
 }
 
-std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale,
-                                                  u32 down_shift) {
-    std::vector<ImageCopy> copies;
+boost::container::small_vector<ImageCopy, 16> MakeReinterpretImageCopies(const ImageInfo& src,
+                                                                         u32 up_scale,
+                                                                         u32 down_shift) {
+    boost::container::small_vector<ImageCopy, 16> copies;
     copies.reserve(src.resources.levels);
     const bool is_3d = src.type == ImageType::e3D;
     for (s32 level = 0; level < src.resources.levels; ++level) {
@@ -824,9 +828,11 @@ bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config
     return gpu_memory.GpuToCpuAddress(address, guest_size_bytes).has_value();
 }
 
-std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
-                                            const ImageInfo& info, std::span<const u8> input,
-                                            std::span<u8> output) {
+boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::MemoryManager& gpu_memory,
+                                                                   GPUVAddr gpu_addr,
+                                                                   const ImageInfo& info,
+                                                                   std::span<const u8> input,
+                                                                   std::span<u8> output) {
     const size_t guest_size_bytes = input.size_bytes();
     const u32 bpp_log2 = BytesPerBlockLog2(info.format);
     const Extent3D size = info.size;
@@ -861,7 +867,7 @@ std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GP
                                             info.tile_width_spacing);
     size_t guest_offset = 0;
     u32 host_offset = 0;
-    std::vector<BufferImageCopy> copies(num_levels);
+    boost::container::small_vector<BufferImageCopy, 16> copies(num_levels);
 
     for (s32 level = 0; level < num_levels; ++level) {
         const Extent3D level_size = AdjustMipSize(size, level);
@@ -978,7 +984,7 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
     }
 }
 
-std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info) {
+boost::container::small_vector<BufferImageCopy, 16> FullDownloadCopies(const ImageInfo& info) {
     const Extent3D size = info.size;
     const u32 bytes_per_block = BytesPerBlock(info.format);
     if (info.type == ImageType::Linear) {
@@ -1006,7 +1012,7 @@ std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info) {
 
     u32 host_offset = 0;
 
-    std::vector<BufferImageCopy> copies(num_levels);
+    boost::container::small_vector<BufferImageCopy, 16> copies(num_levels);
     for (s32 level = 0; level < num_levels; ++level) {
         const Extent3D level_size = AdjustMipSize(size, level);
         const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size);
@@ -1042,10 +1048,10 @@ Extent3D MipBlockSize(const ImageInfo& info, u32 level) {
     return AdjustMipBlockSize(num_tiles, level_info.block, level);
 }
 
-std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info) {
+boost::container::small_vector<SwizzleParameters, 16> FullUploadSwizzles(const ImageInfo& info) {
     const Extent2D tile_size = DefaultBlockSize(info.format);
     if (info.type == ImageType::Linear) {
-        return std::vector{SwizzleParameters{
+        return {SwizzleParameters{
             .num_tiles = AdjustTileSize(info.size, tile_size),
             .block = {},
             .buffer_offset = 0,
@@ -1057,7 +1063,7 @@ std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info) {
     const s32 num_levels = info.resources.levels;
 
     u32 guest_offset = 0;
-    std::vector<SwizzleParameters> params(num_levels);
+    boost::container::small_vector<SwizzleParameters, 16> params(num_levels);
     for (s32 level = 0; level < num_levels; ++level) {
         const Extent3D level_size = AdjustMipSize(size, level);
         const Extent3D num_tiles = AdjustTileSize(level_size, tile_size);
diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h
index 84aa6880d..ab45a43c4 100644
--- a/src/video_core/texture_cache/util.h
+++ b/src/video_core/texture_cache/util.h
@@ -5,6 +5,7 @@
 
 #include <optional>
 #include <span>
+#include <boost/container/small_vector.hpp>
 
 #include "common/common_types.h"
 #include "common/scratch_buffer.h"
@@ -40,9 +41,10 @@ struct OverlapResult {
 
 [[nodiscard]] LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept;
 
-[[nodiscard]] std::vector<u32> CalculateSliceOffsets(const ImageInfo& info);
+[[nodiscard]] boost::container::small_vector<u32, 16> CalculateSliceOffsets(const ImageInfo& info);
 
-[[nodiscard]] std::vector<SubresourceBase> CalculateSliceSubresources(const ImageInfo& info);
+[[nodiscard]] boost::container::small_vector<SubresourceBase, 16> CalculateSliceSubresources(
+    const ImageInfo& info);
 
 [[nodiscard]] u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level);
 
@@ -51,21 +53,18 @@ struct OverlapResult {
 
 [[nodiscard]] ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept;
 
-[[nodiscard]] std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst,
-                                                           const ImageInfo& src,
-                                                           SubresourceBase base, u32 up_scale = 1,
-                                                           u32 down_shift = 0);
+[[nodiscard]] boost::container::small_vector<ImageCopy, 16> MakeShrinkImageCopies(
+    const ImageInfo& dst, const ImageInfo& src, SubresourceBase base, u32 up_scale = 1,
+    u32 down_shift = 0);
 
-[[nodiscard]] std::vector<ImageCopy> MakeReinterpretImageCopies(const ImageInfo& src,
-                                                                u32 up_scale = 1,
-                                                                u32 down_shift = 0);
+[[nodiscard]] boost::container::small_vector<ImageCopy, 16> MakeReinterpretImageCopies(
+    const ImageInfo& src, u32 up_scale = 1, u32 down_shift = 0);
 
 [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config);
 
-[[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory,
-                                                          GPUVAddr gpu_addr, const ImageInfo& info,
-                                                          std::span<const u8> input,
-                                                          std::span<u8> output);
+[[nodiscard]] boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(
+    Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
+    std::span<const u8> input, std::span<u8> output);
 
 [[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
                                           const ImageBase& image, std::span<u8> output);
@@ -73,13 +72,15 @@ struct OverlapResult {
 void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
                   std::span<BufferImageCopy> copies);
 
-[[nodiscard]] std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info);
+[[nodiscard]] boost::container::small_vector<BufferImageCopy, 16> FullDownloadCopies(
+    const ImageInfo& info);
 
 [[nodiscard]] Extent3D MipSize(Extent3D size, u32 level);
 
 [[nodiscard]] Extent3D MipBlockSize(const ImageInfo& info, u32 level);
 
-[[nodiscard]] std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info);
+[[nodiscard]] boost::container::small_vector<SwizzleParameters, 16> FullUploadSwizzles(
+    const ImageInfo& info);
 
 void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
                   std::span<const BufferImageCopy> copies, std::span<const u8> memory,
diff --git a/src/video_core/transform_feedback.cpp b/src/video_core/transform_feedback.cpp
index 155599316..1f353d2df 100644
--- a/src/video_core/transform_feedback.cpp
+++ b/src/video_core/transform_feedback.cpp
@@ -13,7 +13,7 @@
 
 namespace VideoCommon {
 
-std::vector<Shader::TransformFeedbackVarying> MakeTransformFeedbackVaryings(
+std::pair<std::array<Shader::TransformFeedbackVarying, 256>, u32> MakeTransformFeedbackVaryings(
     const TransformFeedbackState& state) {
     static constexpr std::array VECTORS{
         28U,  // gl_Position
@@ -62,7 +62,8 @@ std::vector<Shader::TransformFeedbackVarying> MakeTransformFeedbackVaryings(
         216U, // gl_TexCoord[6]
         220U, // gl_TexCoord[7]
     };
-    std::vector<Shader::TransformFeedbackVarying> xfb(256);
+    std::array<Shader::TransformFeedbackVarying, 256> xfb{};
+    u32 count{0};
     for (size_t buffer = 0; buffer < state.layouts.size(); ++buffer) {
         const auto& locations = state.varyings[buffer];
         const auto& layout = state.layouts[buffer];
@@ -103,11 +104,12 @@ std::vector<Shader::TransformFeedbackVarying> MakeTransformFeedbackVaryings(
                 }
             }
             xfb[attribute] = varying;
+            count = std::max(count, attribute);
             highest = std::max(highest, (base_offset + varying.components) * 4);
         }
         UNIMPLEMENTED_IF(highest != layout.stride);
     }
-    return xfb;
+    return {xfb, count + 1};
 }
 
 } // namespace VideoCommon
diff --git a/src/video_core/transform_feedback.h b/src/video_core/transform_feedback.h
index d13eb16c3..401b1352a 100644
--- a/src/video_core/transform_feedback.h
+++ b/src/video_core/transform_feedback.h
@@ -24,7 +24,7 @@ struct TransformFeedbackState {
         varyings;
 };
 
-std::vector<Shader::TransformFeedbackVarying> MakeTransformFeedbackVaryings(
+std::pair<std::array<Shader::TransformFeedbackVarying, 256>, u32> MakeTransformFeedbackVaryings(
     const TransformFeedbackState& state);
 
 } // namespace VideoCommon
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index fa9cde75b..b11abe311 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -316,6 +316,7 @@ NvidiaArchitecture GetNvidiaArchitecture(vk::PhysicalDevice physical,
 std::vector<const char*> ExtensionListForVulkan(
     const std::set<std::string, std::less<>>& extensions) {
     std::vector<const char*> output;
+    output.reserve(extensions.size());
     for (const auto& extension : extensions) {
         output.push_back(extension.c_str());
     }
-- 
cgit v1.2.3


From eac46ad7ceca5e35b396a8b80bfc38dc6ef1a4fe Mon Sep 17 00:00:00 2001
From: GPUCode <geoster3d@gmail.com>
Date: Tue, 6 Jun 2023 23:10:06 +0300
Subject: video_core: Add BCn decoding support

---
 externals/CMakeLists.txt                           |    3 +
 externals/bc_decoder/bc_decoder.cpp                | 1522 ++++++++++++++++++++
 externals/bc_decoder/bc_decoder.h                  |   43 +
 src/video_core/CMakeLists.txt                      |    6 +-
 src/video_core/renderer_vulkan/maxwell_to_vk.cpp   |   20 +
 src/video_core/renderer_vulkan/vk_rasterizer.cpp   |    7 +
 .../renderer_vulkan/vk_texture_cache.cpp           |    4 +
 src/video_core/surface.cpp                         |   22 +
 src/video_core/surface.h                           |    2 +
 src/video_core/texture_cache/decode_bc.cpp         |  129 ++
 src/video_core/texture_cache/decode_bc.h           |   19 +
 src/video_core/texture_cache/decode_bc4.cpp        |   96 --
 src/video_core/texture_cache/decode_bc4.h          |   15 -
 src/video_core/texture_cache/util.cpp              |   24 +-
 src/video_core/textures/bcn.cpp                    |    1 -
 src/video_core/textures/bcn.h                      |    9 +-
 src/video_core/vulkan_common/vulkan_device.h       |   15 +-
 17 files changed, 1803 insertions(+), 134 deletions(-)
 create mode 100644 externals/bc_decoder/bc_decoder.cpp
 create mode 100644 externals/bc_decoder/bc_decoder.h
 create mode 100644 src/video_core/texture_cache/decode_bc.cpp
 create mode 100644 src/video_core/texture_cache/decode_bc.h
 delete mode 100644 src/video_core/texture_cache/decode_bc4.cpp
 delete mode 100644 src/video_core/texture_cache/decode_bc4.h

(limited to 'src/video_core/texture_cache')

diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 0184289eb..4ff588851 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -157,6 +157,9 @@ endif()
 add_library(stb stb/stb_dxt.cpp)
 target_include_directories(stb PUBLIC ./stb)
 
+add_library(bc_decoder bc_decoder/bc_decoder.cpp)
+target_include_directories(bc_decoder PUBLIC ./bc_decoder)
+
 if (ANDROID)
    if (ARCHITECTURE_arm64)
        add_subdirectory(libadrenotools)
diff --git a/externals/bc_decoder/bc_decoder.cpp b/externals/bc_decoder/bc_decoder.cpp
new file mode 100644
index 000000000..536c44f34
--- /dev/null
+++ b/externals/bc_decoder/bc_decoder.cpp
@@ -0,0 +1,1522 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
+
+// This BCn Decoder is directly derivative of Swiftshader's BCn Decoder found at: https://github.com/google/swiftshader/blob/d070309f7d154d6764cbd514b1a5c8bfcef61d06/src/Device/BC_Decoder.cpp
+// This file does not follow the Skyline code conventions but has certain Skyline specific code
+// There are a lot of implicit and narrowing conversions in this file due to this (Warnings are disabled as a result)
+
+#include <array>
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+namespace {
+    constexpr int BlockWidth = 4;
+    constexpr int BlockHeight = 4;
+
+    struct BC_color {
+        void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, bool hasAlphaChannel, bool hasSeparateAlpha) const {
+            Color c[4];
+            c[0].extract565(c0);
+            c[1].extract565(c1);
+            if (hasSeparateAlpha || (c0 > c1)) {
+                c[2] = ((c[0] * 2) + c[1]) / 3;
+                c[3] = ((c[1] * 2) + c[0]) / 3;
+            } else {
+                c[2] = (c[0] + c[1]) >> 1;
+                if (hasAlphaChannel) {
+                    c[3].clearAlpha();
+                }
+            }
+
+            for (int j = 0; j < BlockHeight && (y + j) < dstH; j++) {
+                size_t dstOffset = j * dstPitch;
+                size_t idxOffset = j * BlockHeight;
+                for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, idxOffset++, dstOffset += dstBpp) {
+                    *reinterpret_cast<unsigned int *>(dst + dstOffset) = c[getIdx(idxOffset)].pack8888();
+                }
+            }
+        }
+
+      private:
+        struct Color {
+            Color() {
+                c[0] = c[1] = c[2] = 0;
+                c[3] = 0xFF000000;
+            }
+
+            void extract565(const unsigned int c565) {
+                c[0] = ((c565 & 0x0000001F) << 3) | ((c565 & 0x0000001C) >> 2);
+                c[1] = ((c565 & 0x000007E0) >> 3) | ((c565 & 0x00000600) >> 9);
+                c[2] = ((c565 & 0x0000F800) >> 8) | ((c565 & 0x0000E000) >> 13);
+            }
+
+            unsigned int pack8888() const {
+                return ((c[0] & 0xFF) << 16) | ((c[1] & 0xFF) << 8) | (c[2] & 0xFF) | c[3];
+            }
+
+            void clearAlpha() {
+                c[3] = 0;
+            }
+
+            Color operator*(int factor) const {
+                Color res;
+                for (int i = 0; i < 4; ++i) {
+                    res.c[i] = c[i] * factor;
+                }
+                return res;
+            }
+
+            Color operator/(int factor) const {
+                Color res;
+                for (int i = 0; i < 4; ++i) {
+                    res.c[i] = c[i] / factor;
+                }
+                return res;
+            }
+
+            Color operator>>(int shift) const {
+                Color res;
+                for (int i = 0; i < 4; ++i) {
+                    res.c[i] = c[i] >> shift;
+                }
+                return res;
+            }
+
+            Color operator+(Color const &obj) const {
+                Color res;
+                for (int i = 0; i < 4; ++i) {
+                    res.c[i] = c[i] + obj.c[i];
+                }
+                return res;
+            }
+
+          private:
+            int c[4];
+        };
+
+        size_t getIdx(int i) const {
+            size_t offset = i << 1;  // 2 bytes per index
+            return (idx & (0x3 << offset)) >> offset;
+        }
+
+        unsigned short c0;
+        unsigned short c1;
+        unsigned int idx;
+    };
+    static_assert(sizeof(BC_color) == 8, "BC_color must be 8 bytes");
+
+    struct BC_channel {
+        void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, size_t channel, bool isSigned) const {
+            int c[8] = {0};
+
+            if (isSigned) {
+                c[0] = static_cast<signed char>(data & 0xFF);
+                c[1] = static_cast<signed char>((data & 0xFF00) >> 8);
+            } else {
+                c[0] = static_cast<uint8_t>(data & 0xFF);
+                c[1] = static_cast<uint8_t>((data & 0xFF00) >> 8);
+            }
+
+            if (c[0] > c[1]) {
+                for (int i = 2; i < 8; ++i) {
+                    c[i] = ((8 - i) * c[0] + (i - 1) * c[1]) / 7;
+                }
+            } else {
+                for (int i = 2; i < 6; ++i) {
+                    c[i] = ((6 - i) * c[0] + (i - 1) * c[1]) / 5;
+                }
+                c[6] = isSigned ? -128 : 0;
+                c[7] = isSigned ? 127 : 255;
+            }
+
+            for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++) {
+                for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++) {
+                    dst[channel + (i * dstBpp) + (j * dstPitch)] = static_cast<uint8_t>(c[getIdx((j * BlockHeight) + i)]);
+                }
+            }
+        }
+
+      private:
+        uint8_t getIdx(int i) const {
+            int offset = i * 3 + 16;
+            return static_cast<uint8_t>((data & (0x7ull << offset)) >> offset);
+        }
+
+        uint64_t data;
+    };
+    static_assert(sizeof(BC_channel) == 8, "BC_channel must be 8 bytes");
+
+    struct BC_alpha {
+        void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp) const {
+            dst += 3;  // Write only to alpha (channel 3)
+            for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++, dst += dstPitch) {
+                uint8_t *dstRow = dst;
+                for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, dstRow += dstBpp) {
+                    *dstRow = getAlpha(j * BlockHeight + i);
+                }
+            }
+        }
+
+      private:
+        uint8_t getAlpha(int i) const {
+            int offset = i << 2;
+            int alpha = (data & (0xFull << offset)) >> offset;
+            return static_cast<uint8_t>(alpha | (alpha << 4));
+        }
+
+        uint64_t data;
+    };
+    static_assert(sizeof(BC_alpha) == 8, "BC_alpha must be 8 bytes");
+
+    namespace BC6H {
+        static constexpr int MaxPartitions = 64;
+
+        // @fmt:off
+
+        static constexpr uint8_t PartitionTable2[MaxPartitions][16] = {
+            { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 },
+            { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 },
+            { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 },
+            { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 },
+            { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 },
+            { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 },
+            { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 },
+            { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 },
+            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 },
+            { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+            { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 },
+            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 },
+            { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+            { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 },
+            { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+            { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 },
+            { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 },
+            { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+            { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 },
+            { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 },
+            { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+            { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 },
+            { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 },
+            { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 },
+            { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 },
+            { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 },
+            { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 },
+            { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 },
+            { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 },
+            { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
+            { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 },
+            { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 },
+            { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+            { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 },
+            { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 },
+            { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 },
+            { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 },
+            { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 },
+            { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 },
+            { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 },
+            { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 },
+            { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 },
+            { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 },
+            { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 },
+            { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 },
+            { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 },
+            { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 },
+            { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
+            { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+            { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 },
+            { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 },
+            { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 },
+            { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 },
+            { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 },
+            { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 },
+            { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 },
+            { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 },
+            { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 },
+            { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 },
+            { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 },
+            { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 },
+            { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
+            { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 },
+            { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 },
+        };
+
+        static constexpr uint8_t AnchorTable2[MaxPartitions] = {
+            0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
+            0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
+            0xf, 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0xf,
+            0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0x2, 0x2,
+            0xf, 0xf, 0x6, 0x8, 0x2, 0x8, 0xf, 0xf,
+            0x2, 0x8, 0x2, 0x2, 0x2, 0xf, 0xf, 0x6,
+            0x6, 0x2, 0x6, 0x8, 0xf, 0xf, 0x2, 0x2,
+            0xf, 0xf, 0xf, 0xf, 0xf, 0x2, 0x2, 0xf,
+        };
+
+    // @fmt:on
+
+        // 1.0f in half-precision floating point format
+        static constexpr uint16_t halfFloat1 = 0x3C00;
+        union Color {
+            struct RGBA {
+                uint16_t r = 0;
+                uint16_t g = 0;
+                uint16_t b = 0;
+                uint16_t a = halfFloat1;
+
+                RGBA(uint16_t r, uint16_t g, uint16_t b)
+                    : r(r), g(g), b(b) {
+                }
+
+                RGBA &operator=(const RGBA &other) {
+                    this->r = other.r;
+                    this->g = other.g;
+                    this->b = other.b;
+                    this->a = halfFloat1;
+
+                    return *this;
+                }
+            };
+
+            Color(uint16_t r, uint16_t g, uint16_t b)
+                : rgba(r, g, b) {
+            }
+
+            Color(int r, int g, int b)
+                : rgba((uint16_t) r, (uint16_t) g, (uint16_t) b) {
+            }
+
+            Color() {}
+
+            Color(const Color &other) {
+                this->rgba = other.rgba;
+            }
+
+            Color &operator=(const Color &other) {
+                this->rgba = other.rgba;
+
+                return *this;
+            }
+
+            RGBA rgba;
+            uint16_t channel[4];
+        };
+        static_assert(sizeof(Color) == 8, "BC6h::Color must be 8 bytes long");
+
+        inline int32_t extendSign(int32_t val, size_t size) {
+            // Suppose we have a 2-bit integer being stored in 4 bit variable:
+            //    x = 0b00AB
+            //
+            // In order to sign extend x, we need to turn the 0s into A's:
+            //    x_extend = 0bAAAB
+            //
+            // We can do that by flipping A in x then subtracting 0b0010 from x.
+            // Suppose A is 1:
+            //    x       = 0b001B
+            //    x_flip  = 0b000B
+            //    x_minus = 0b111B
+            // Since A is flipped to 0, subtracting the mask sets it and all the bits above it to 1.
+            // And if A is 0:
+            //    x       = 0b000B
+            //    x_flip  = 0b001B
+            //    x_minus = 0b000B
+            // We unset the bit we flipped, and touch no other bit
+            uint16_t mask = 1u << (size - 1);
+            return (val ^ mask) - mask;
+        }
+
+        static int constexpr RGBfChannels = 3;
+        struct RGBf {
+            uint16_t channel[RGBfChannels];
+            size_t size[RGBfChannels];
+            bool isSigned;
+
+            RGBf() {
+                static_assert(RGBfChannels == 3, "RGBf must have exactly 3 channels");
+                static_assert(sizeof(channel) / sizeof(channel[0]) == RGBfChannels, "RGBf must have exactly 3 channels");
+                static_assert(sizeof(channel) / sizeof(channel[0]) == sizeof(size) / sizeof(size[0]), "RGBf requires equally sized arrays for channels and channel sizes");
+
+                for (int i = 0; i < RGBfChannels; i++) {
+                    channel[i] = 0;
+                    size[i] = 0;
+                }
+
+                isSigned = false;
+            }
+
+            void extendSign() {
+                for (int i = 0; i < RGBfChannels; i++) {
+                    channel[i] = BC6H::extendSign(channel[i], size[i]);
+                }
+            }
+
+            // Assuming this is the delta, take the base-endpoint and transform this into
+            // a proper endpoint.
+            //
+            // The final computed endpoint is truncated to the base-endpoint's size;
+            void resolveDelta(RGBf base) {
+                for (int i = 0; i < RGBfChannels; i++) {
+                    size[i] = base.size[i];
+                    channel[i] = (base.channel[i] + channel[i]) & ((1 << base.size[i]) - 1);
+                }
+
+                // Per the spec:
+                // "For signed formats, the results of the delta calculation must be sign
+                // extended as well."
+                if (isSigned) {
+                    extendSign();
+                }
+            }
+
+            void unquantize() {
+                if (isSigned) {
+                    unquantizeSigned();
+                } else {
+                    unquantizeUnsigned();
+                }
+            }
+
+            void unquantizeUnsigned() {
+                for (int i = 0; i < RGBfChannels; i++) {
+                    if (size[i] >= 15 || channel[i] == 0) {
+                        continue;
+                    } else if (channel[i] == ((1u << size[i]) - 1)) {
+                        channel[i] = 0xFFFFu;
+                    } else {
+                        // Need 32 bits to avoid overflow
+                        uint32_t tmp = channel[i];
+                        channel[i] = (uint16_t) (((tmp << 16) + 0x8000) >> size[i]);
+                    }
+                    size[i] = 16;
+                }
+            }
+
+            void unquantizeSigned() {
+                for (int i = 0; i < RGBfChannels; i++) {
+                    if (size[i] >= 16 || channel[i] == 0) {
+                        continue;
+                    }
+
+                    int16_t value = (int16_t)channel[i];
+                    int32_t result = value;
+                    bool signBit = value < 0;
+                    if (signBit) {
+                        value = -value;
+                    }
+
+                    if (value >= ((1 << (size[i] - 1)) - 1)) {
+                        result = 0x7FFF;
+                    } else {
+                        // Need 32 bits to avoid overflow
+                        int32_t tmp = value;
+                        result = (((tmp << 15) + 0x4000) >> (size[i] - 1));
+                    }
+
+                    if (signBit) {
+                        result = -result;
+                    }
+
+                    channel[i] = (uint16_t) result;
+                    size[i] = 16;
+                }
+            }
+        };
+
+        struct Data {
+            uint64_t low64;
+            uint64_t high64;
+
+            Data() = default;
+
+            Data(uint64_t low64, uint64_t high64)
+                : low64(low64), high64(high64) {
+            }
+
+            // Consumes the lowest N bits from from low64 and high64 where N is:
+            //      abs(MSB - LSB)
+            // MSB and LSB come from the block description of the BC6h spec and specify
+            // the location of the bits in the returned bitstring.
+            //
+            // If MSB < LSB, then the bits are reversed. Otherwise, the bitstring is read and
+            // shifted without further modification.
+            //
+            uint32_t consumeBits(uint32_t MSB, uint32_t LSB) {
+                bool reversed = MSB < LSB;
+                if (reversed) {
+                    std::swap(MSB, LSB);
+                }
+                assert(MSB - LSB + 1 < sizeof(uint32_t) * 8);
+
+                uint32_t numBits = MSB - LSB + 1;
+                uint32_t mask = (1 << numBits) - 1;
+                // Read the low N bits
+                uint32_t bits = (low64 & mask);
+
+                low64 >>= numBits;
+                // Put the low N bits of high64 into the high 64-N bits of low64
+                low64 |= (high64 & mask) << (sizeof(high64) * 8 - numBits);
+                high64 >>= numBits;
+
+                if (reversed) {
+                    uint32_t tmp = 0;
+                    for (uint32_t numSwaps = 0; numSwaps < numBits; numSwaps++) {
+                        tmp <<= 1;
+                        tmp |= (bits & 1);
+                        bits >>= 1;
+                    }
+
+                    bits = tmp;
+                }
+
+                return bits << LSB;
+            }
+        };
+
+        struct IndexInfo {
+            uint64_t value;
+            int numBits;
+        };
+
+// Interpolates between two endpoints, then does a final unquantization step
+        Color interpolate(RGBf e0, RGBf e1, const IndexInfo &index, bool isSigned) {
+            static constexpr uint32_t weights3[] = {0, 9, 18, 27, 37, 46, 55, 64};
+            static constexpr uint32_t weights4[] = {0, 4, 9, 13, 17, 21, 26, 30,
+                                                    34, 38, 43, 47, 51, 55, 60, 64};
+            static constexpr uint32_t const *weightsN[] = {
+                nullptr, nullptr, nullptr, weights3, weights4
+            };
+            auto weights = weightsN[index.numBits];
+            assert(weights != nullptr);
+            Color color;
+            uint32_t e0Weight = 64 - weights[index.value];
+            uint32_t e1Weight = weights[index.value];
+
+            for (int i = 0; i < RGBfChannels; i++) {
+                int32_t e0Channel = e0.channel[i];
+                int32_t e1Channel = e1.channel[i];
+
+                if (isSigned) {
+                    e0Channel = extendSign(e0Channel, 16);
+                    e1Channel = extendSign(e1Channel, 16);
+                }
+
+                int32_t e0Value = e0Channel * e0Weight;
+                int32_t e1Value = e1Channel * e1Weight;
+
+                uint32_t tmp = ((e0Value + e1Value + 32) >> 6);
+
+                // Need to unquantize value to limit it to the legal range of half-precision
+                // floats. We do this by scaling by 31/32 or 31/64 depending on if the value
+                // is signed or unsigned.
+                if (isSigned) {
+                    tmp = ((tmp & 0x80000000) != 0) ? (((~tmp + 1) * 31) >> 5) | 0x8000 : (tmp * 31) >> 5;
+                    // Don't return -0.0f, just normalize it to 0.0f.
+                    if (tmp == 0x8000)
+                        tmp = 0;
+                } else {
+                    tmp = (tmp * 31) >> 6;
+                }
+
+                color.channel[i] = (uint16_t) tmp;
+            }
+
+            return color;
+        }
+
+        enum DataType {
+            // Endpoints
+            EP0 = 0,
+            EP1 = 1,
+            EP2 = 2,
+            EP3 = 3,
+            Mode,
+            Partition,
+            End,
+        };
+
+        enum Channel {
+            R = 0,
+            G = 1,
+            B = 2,
+            None,
+        };
+
+        struct DeltaBits {
+            size_t channel[3];
+
+            constexpr DeltaBits()
+                : channel{0, 0, 0} {
+            }
+
+            constexpr DeltaBits(size_t r, size_t g, size_t b)
+                : channel{r, g, b} {
+            }
+        };
+
+        struct ModeDesc {
+            int number;
+            bool hasDelta;
+            int partitionCount;
+            int endpointBits;
+            DeltaBits deltaBits;
+
+            constexpr ModeDesc()
+                : number(-1), hasDelta(false), partitionCount(0), endpointBits(0) {
+            }
+
+            constexpr ModeDesc(int number, bool hasDelta, int partitionCount, int endpointBits, DeltaBits deltaBits)
+                : number(number), hasDelta(hasDelta), partitionCount(partitionCount), endpointBits(endpointBits), deltaBits(deltaBits) {
+            }
+        };
+
+        struct BlockDesc {
+            DataType type;
+            Channel channel;
+            int MSB;
+            int LSB;
+            ModeDesc modeDesc;
+
+            constexpr BlockDesc()
+                : type(End), channel(None), MSB(0), LSB(0), modeDesc() {
+            }
+
+            constexpr BlockDesc(const DataType type, Channel channel, int MSB, int LSB, ModeDesc modeDesc)
+                : type(type), channel(channel), MSB(MSB), LSB(LSB), modeDesc(modeDesc) {
+            }
+
+            constexpr BlockDesc(DataType type, Channel channel, int MSB, int LSB)
+                : type(type), channel(channel), MSB(MSB), LSB(LSB), modeDesc() {
+            }
+        };
+
+// Turns a legal mode into an index into the BlockDesc table.
+// Illegal or reserved modes return -1.
+        static int modeToIndex(uint8_t mode) {
+            if (mode <= 3) {
+                return mode;
+            } else if ((mode & 0x2) != 0) {
+                if (mode <= 18) {
+// Turns 6 into 4, 7 into 5, 10 into 6, etc.
+                    return (mode / 2) + 1 + (mode & 0x1);
+                } else if (mode == 22 || mode == 26 || mode == 30) {
+// Turns 22 into 11, 26 into 12, etc.
+                    return mode / 4 + 6;
+                }
+            }
+
+            return -1;
+        }
+
+// Returns a description of the bitfields for each mode from the LSB
+// to the MSB before the index data starts.
+//
+// The numbers come from the BC6h block description. Each BlockDesc in the
+//   {Type, Channel, MSB, LSB}
+//   * Type describes which endpoint this is, or if this is a mode, a partition
+//     number, or the end of the block description.
+//   * Channel describes one of the 3 color channels within an endpoint
+//   * MSB and LSB specificy:
+//      * The size of the bitfield being read
+//      * The position of the bitfield within the variable it is being read to
+//      * If the bitfield is stored in reverse bit order
+//     If MSB < LSB then the bitfield is stored in reverse order. The size of
+//     the bitfield is abs(MSB-LSB+1). And the position of the bitfield within
+//     the variable is min(LSB, MSB).
+//
+// Invalid or reserved modes return an empty list.
+        static constexpr int NumBlocks = 14;
+// The largest number of descriptions within a block.
+        static constexpr int MaxBlockDescIndex = 26;
+        static constexpr BlockDesc blockDescs[NumBlocks][MaxBlockDescIndex] = {
+// @fmt:off
+// Mode 0, Index 0
+{
+{ Mode, None, 1, 0, { 0, true, 2, 10, { 5, 5, 5 } } },
+{ EP2, G, 4, 4 }, { EP2, B, 4, 4 }, { EP3, B, 4, 4 },
+{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 },
+{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 },
+{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 },
+{ EP1, B, 4, 0 }, { EP3, B, 1, 1 }, { EP2, B, 3, 0 },
+{ EP2, R, 4, 0 }, { EP3, B, 2, 2 }, { EP3, R, 4, 0 },
+{ EP3, B, 3, 3 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 1, Index 1
+{
+{ Mode, None, 1, 0, { 1, true, 2, 7, { 6, 6, 6 } } },
+{ EP2, G, 5, 5 }, { EP3, G, 5, 4 }, { EP0, R, 6, 0 },
+{ EP3, B, 1, 0 }, { EP2, B, 4, 4 }, { EP0, G, 6, 0 },
+{ EP2, B, 5, 5 }, { EP3, B, 2, 2 }, { EP2, G, 4, 4 },
+{ EP0, B, 6, 0 }, { EP3, B, 3, 3 }, { EP3, B, 5, 5 },
+{ EP3, B, 4, 4 }, { EP1, R, 5, 0 }, { EP2, G, 3, 0 },
+{ EP1, G, 5, 0 }, { EP3, G, 3, 0 }, { EP1, B, 5, 0 },
+{ EP2, B, 3, 0 }, { EP2, R, 5, 0 }, { EP3, R, 5, 0 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 2, Index 2
+{
+{ Mode, None, 4, 0, { 2, true, 2, 11, { 5, 4, 4 } } },
+{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 },
+{ EP1, R, 4, 0 }, { EP0, R, 10, 10 }, { EP2, G, 3, 0 },
+{ EP1, G, 3, 0 }, { EP0, G, 10, 10 }, { EP3, B, 0, 0 },
+{ EP3, G, 3, 0 }, { EP1, B, 3, 0 }, { EP0, B, 10, 10 },
+{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 },
+{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 3, Index 3
+{
+{ Mode, None, 4, 0, { 3, false, 1, 10, { 0, 0, 0 } } },
+{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 },
+{ EP1, R, 9, 0 }, { EP1, G, 9, 0 }, { EP1, B, 9, 0 },
+{ End, None, 0, 0},
+},
+// Mode 6, Index 4
+{
+{ Mode, None, 4, 0, { 6, true, 2, 11, { 4, 5, 4 } } }, // 1 1
+{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 },
+{ EP1, R, 3, 0 }, { EP0, R, 10, 10 }, { EP3, G, 4, 4 },
+{ EP2, G, 3, 0 }, { EP1, G, 4, 0 }, { EP0, G, 10, 10 },
+{ EP3, G, 3, 0 }, { EP1, B, 3, 0 }, { EP0, B, 10, 10 },
+{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 3, 0 },
+{ EP3, B, 0, 0 }, { EP3, B, 2, 2 }, { EP3, R, 3, 0 }, // 18 19
+{ EP2, G, 4, 4 }, { EP3, B, 3, 3 }, // 2 21
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 7, Index 5
+{
+{ Mode, None, 4, 0, { 7, true, 1, 11, { 9, 9, 9 } } },
+{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 },
+{ EP1, R, 8, 0 }, { EP0, R, 10, 10 }, { EP1, G, 8, 0 },
+{ EP0, G, 10, 10 }, { EP1, B, 8, 0 }, { EP0, B, 10, 10 },
+{ End, None, 0, 0},
+},
+// Mode 10, Index 6
+{
+{ Mode, None, 4, 0, { 10, true, 2, 11, { 4, 4, 5 } } },
+{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 },
+{ EP1, R, 3, 0 }, { EP0, R, 10, 10 }, { EP2, B, 4, 4 },
+{ EP2, G, 3, 0 }, { EP1, G, 3, 0 }, { EP0, G, 10, 10 },
+{ EP3, B, 0, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 },
+{ EP0, B, 10, 10 }, { EP2, B, 3, 0 }, { EP2, R, 3, 0 },
+{ EP3, B, 1, 1 }, { EP3, B, 2, 2 }, { EP3, R, 3, 0 },
+{ EP3, B, 4, 4 }, { EP3, B, 3, 3 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 11, Index 7
+{
+{ Mode, None, 4, 0, { 11, true, 1, 12, { 8, 8, 8 } } },
+{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 },
+{ EP1, R, 7, 0 }, { EP0, R, 10, 11 }, { EP1, G, 7, 0 },
+{ EP0, G, 10, 11 }, { EP1, B, 7, 0 }, { EP0, B, 10, 11 },
+{ End, None, 0, 0},
+},
+// Mode 14, Index 8
+{
+{ Mode, None, 4, 0, { 14, true, 2, 9, { 5, 5, 5 } } },
+{ EP0, R, 8, 0 }, { EP2, B, 4, 4 }, { EP0, G, 8, 0 },
+{ EP2, G, 4, 4 }, { EP0, B, 8, 0 }, { EP3, B, 4, 4 },
+{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 },
+{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 },
+{ EP1, B, 4, 0 }, { EP3, B, 1, 1 }, { EP2, B, 3, 0 },
+{ EP2, R, 4, 0 }, { EP3, B, 2, 2 }, { EP3, R, 4, 0 },
+{ EP3, B, 3, 3 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 15, Index 9
+{
+{ Mode, None, 4, 0, { 15, true, 1, 16, { 4, 4, 4 } } },
+{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 },
+{ EP1, R, 3, 0 }, { EP0, R, 10, 15 }, { EP1, G, 3, 0 },
+{ EP0, G, 10, 15 }, { EP1, B, 3, 0 }, { EP0, B, 10, 15 },
+{ End, None, 0, 0},
+},
+// Mode 18, Index 10
+{
+{ Mode, None, 4, 0, { 18, true, 2, 8, { 6, 5, 5 } } },
+{ EP0, R, 7, 0 }, { EP3, G, 4, 4 }, { EP2, B, 4, 4 },
+{ EP0, G, 7, 0 }, { EP3, B, 2, 2 }, { EP2, G, 4, 4 },
+{ EP0, B, 7, 0 }, { EP3, B, 3, 3 }, { EP3, B, 4, 4 },
+{ EP1, R, 5, 0 }, { EP2, G, 3, 0 }, { EP1, G, 4, 0 },
+{ EP3, B, 0, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 },
+{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 5, 0 },
+{ EP3, R, 5, 0 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 22, Index 11
+{
+{ Mode, None, 4, 0, { 22, true, 2, 8, { 5, 6, 5 } } },
+{ EP0, R, 7, 0 }, { EP3, B, 0, 0 }, { EP2, B, 4, 4 },
+{ EP0, G, 7, 0 }, { EP2, G, 5, 5 }, { EP2, G, 4, 4 },
+{ EP0, B, 7, 0 }, { EP3, G, 5, 5 }, { EP3, B, 4, 4 },
+{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 },
+{ EP1, G, 5, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 },
+{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 },
+{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 26, Index 12
+{
+{ Mode, None, 4, 0, { 26, true, 2, 8, { 5, 5, 6 } } },
+{ EP0, R, 7, 0 }, { EP3, B, 1, 1 }, { EP2, B, 4, 4 },
+{ EP0, G, 7, 0 }, { EP2, B, 5, 5 }, { EP2, G, 4, 4 },
+{ EP0, B, 7, 0 }, { EP3, B, 5, 5 }, { EP3, B, 4, 4 },
+{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 },
+{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 },
+{ EP1, B, 5, 0 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 },
+{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+},
+// Mode 30, Index 13
+{
+{ Mode, None, 4, 0, { 30, false, 2, 6, { 0, 0, 0 } } },
+{ EP0, R, 5, 0 }, { EP3, G, 4, 4 }, { EP3, B, 0, 0 },
+{ EP3, B, 1, 1 }, { EP2, B, 4, 4 }, { EP0, G, 5, 0 },
+{ EP2, G, 5, 5 }, { EP2, B, 5, 5 }, { EP3, B, 2, 2 },
+{ EP2, G, 4, 4 }, { EP0, B, 5, 0 }, { EP3, G, 5, 5 },
+{ EP3, B, 3, 3 }, { EP3, B, 5, 5 }, { EP3, B, 4, 4 },
+{ EP1, R, 5, 0 }, { EP2, G, 3, 0 }, { EP1, G, 5, 0 },
+{ EP3, G, 3, 0 }, { EP1, B, 5, 0 }, { EP2, B, 3, 0 },
+{ EP2, R, 5, 0 }, { EP3, R, 5, 0 },
+{ Partition, None, 4, 0 },
+{ End, None, 0, 0},
+}
+// @fmt:on
+        };
+
+        struct Block {
+            uint64_t low64;
+            uint64_t high64;
+
+            void decode(uint8_t *dst, size_t dstX, size_t dstY, size_t dstWidth, size_t dstHeight, size_t dstPitch, size_t dstBpp, bool isSigned) const {
+                uint8_t mode = 0;
+                Data data(low64, high64);
+                assert(dstBpp == sizeof(Color));
+
+                if ((data.low64 & 0x2) == 0) {
+                    mode = data.consumeBits(1, 0);
+                } else {
+                    mode = data.consumeBits(4, 0);
+                }
+
+                int blockIndex = modeToIndex(mode);
+                // Handle illegal or reserved mode
+                if (blockIndex == -1) {
+                    for (int y = 0; y < 4 && y + dstY < dstHeight; y++) {
+                        for (int x = 0; x < 4 && x + dstX < dstWidth; x++) {
+                            auto out = reinterpret_cast<Color *>(dst + sizeof(Color) * x + dstPitch * y);
+                            out->rgba = {0, 0, 0};
+                        }
+                    }
+                    return;
+                }
+                const BlockDesc *blockDesc = blockDescs[blockIndex];
+
+                RGBf e[4];
+                e[0].isSigned = e[1].isSigned = e[2].isSigned = e[3].isSigned = isSigned;
+
+                int partition = 0;
+                ModeDesc modeDesc;
+                for (int index = 0; blockDesc[index].type != End; index++) {
+                    const BlockDesc desc = blockDesc[index];
+
+                    switch (desc.type) {
+                        case Mode:
+                            modeDesc = desc.modeDesc;
+                            assert(modeDesc.number == mode);
+
+                            e[0].size[0] = e[0].size[1] = e[0].size[2] = modeDesc.endpointBits;
+                            for (int i = 0; i < RGBfChannels; i++) {
+                                if (modeDesc.hasDelta) {
+                                    e[1].size[i] = e[2].size[i] = e[3].size[i] = modeDesc.deltaBits.channel[i];
+                                } else {
+                                    e[1].size[i] = e[2].size[i] = e[3].size[i] = modeDesc.endpointBits;
+                                }
+                            }
+                            break;
+                        case Partition:
+                            partition |= data.consumeBits(desc.MSB, desc.LSB);
+                            break;
+                        case EP0:
+                        case EP1:
+                        case EP2:
+                        case EP3:
+                            e[desc.type].channel[desc.channel] |= data.consumeBits(desc.MSB, desc.LSB);
+                            break;
+                        default:
+                            assert(false);
+                            return;
+                    }
+                }
+
+                // Sign extension
+                if (isSigned) {
+                    for (int ep = 0; ep < modeDesc.partitionCount * 2; ep++) {
+                        e[ep].extendSign();
+                    }
+                } else if (modeDesc.hasDelta) {
+                    // Don't sign-extend the base endpoint in an unsigned format.
+                    for (int ep = 1; ep < modeDesc.partitionCount * 2; ep++) {
+                        e[ep].extendSign();
+                    }
+                }
+
+                // Turn the deltas into endpoints
+                if (modeDesc.hasDelta) {
+                    for (int ep = 1; ep < modeDesc.partitionCount * 2; ep++) {
+                        e[ep].resolveDelta(e[0]);
+                    }
+                }
+
+                for (int ep = 0; ep < modeDesc.partitionCount * 2; ep++) {
+                    e[ep].unquantize();
+                }
+
+                // Get the indices, calculate final colors, and output
+                for (int y = 0; y < 4; y++) {
+                    for (int x = 0; x < 4; x++) {
+                        int pixelNum = x + y * 4;
+                        IndexInfo idx;
+                        bool isAnchor = false;
+                        int firstEndpoint = 0;
+                        // Bc6H can have either 1 or 2 petitions depending on the mode.
+                        // The number of petitions affects the number of indices with implicit
+                        // leading 0 bits and the number of bits per index.
+                        if (modeDesc.partitionCount == 1) {
+                            idx.numBits = 4;
+                            // There's an implicit leading 0 bit for the first idx
+                            isAnchor = (pixelNum == 0);
+                        } else {
+                            idx.numBits = 3;
+                            // There are 2 indices with implicit leading 0-bits.
+                            isAnchor = ((pixelNum == 0) || (pixelNum == AnchorTable2[partition]));
+                            firstEndpoint = PartitionTable2[partition][pixelNum] * 2;
+                        }
+
+                        idx.value = data.consumeBits(idx.numBits - isAnchor - 1, 0);
+
+                        // Don't exit the loop early, we need to consume these index bits regardless if
+                        // we actually output them or not.
+                        if ((y + dstY >= dstHeight) || (x + dstX >= dstWidth)) {
+                            continue;
+                        }
+
+                        Color color = interpolate(e[firstEndpoint], e[firstEndpoint + 1], idx, isSigned);
+                        auto out = reinterpret_cast<Color *>(dst + dstBpp * x + dstPitch * y);
+                        *out = color;
+                    }
+                }
+            }
+        };
+
+    }  // namespace BC6H
+
+    namespace BC7 {
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_bptc.txt
+// https://docs.microsoft.com/en-us/windows/win32/direct3d11/bc7-format
+
+        struct Bitfield {
+            int offset;
+            int count;
+
+            constexpr Bitfield Then(const int bits) { return {offset + count, bits}; }
+
+            constexpr bool operator==(const Bitfield &rhs) {
+                return offset == rhs.offset && count == rhs.count;
+            }
+        };
+
+        struct Mode {
+            const int IDX;  // Mode index
+            const int NS;   // Number of subsets in each partition
+            const int PB;   // Partition bits
+            const int RB;   // Rotation bits
+            const int ISB;  // Index selection bits
+            const int CB;   // Color bits
+            const int AB;   // Alpha bits
+            const int EPB;  // Endpoint P-bits
+            const int SPB;  // Shared P-bits
+            const int IB;   // Primary index bits per element
+            const int IBC;  // Primary index bits total
+            const int IB2;  // Secondary index bits per element
+
+            constexpr int NumColors() const { return NS * 2; }
+
+            constexpr Bitfield Partition() const { return {IDX + 1, PB}; }
+
+            constexpr Bitfield Rotation() const { return Partition().Then(RB); }
+
+            constexpr Bitfield IndexSelection() const { return Rotation().Then(ISB); }
+
+            constexpr Bitfield Red(int idx) const {
+                return IndexSelection().Then(CB * idx).Then(CB);
+            }
+
+            constexpr Bitfield Green(int idx) const {
+                return Red(NumColors() - 1).Then(CB * idx).Then(CB);
+            }
+
+            constexpr Bitfield Blue(int idx) const {
+                return Green(NumColors() - 1).Then(CB * idx).Then(CB);
+            }
+
+            constexpr Bitfield Alpha(int idx) const {
+                return Blue(NumColors() - 1).Then(AB * idx).Then(AB);
+            }
+
+            constexpr Bitfield EndpointPBit(int idx) const {
+                return Alpha(NumColors() - 1).Then(EPB * idx).Then(EPB);
+            }
+
+            constexpr Bitfield SharedPBit0() const {
+                return EndpointPBit(NumColors() - 1).Then(SPB);
+            }
+
+            constexpr Bitfield SharedPBit1() const {
+                return SharedPBit0().Then(SPB);
+            }
+
+            constexpr Bitfield PrimaryIndex(int offset, int count) const {
+                return SharedPBit1().Then(offset).Then(count);
+            }
+
+            constexpr Bitfield SecondaryIndex(int offset, int count) const {
+                return SharedPBit1().Then(IBC + offset).Then(count);
+            }
+        };
+
+        static constexpr Mode Modes[] = {
+            //     IDX  NS   PB   RB   ISB  CB   AB   EPB  SPB  IB   IBC, IB2
+            /**/ {0x0, 0x3, 0x4, 0x0, 0x0, 0x4, 0x0, 0x1, 0x0, 0x3, 0x2d, 0x0},
+/**/ {0x1, 0x2, 0x6, 0x0, 0x0, 0x6, 0x0, 0x0, 0x1, 0x3, 0x2e, 0x0},
+/**/ {0x2, 0x3, 0x6, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x2, 0x1d, 0x0},
+/**/ {0x3, 0x2, 0x6, 0x0, 0x0, 0x7, 0x0, 0x1, 0x0, 0x2, 0x1e, 0x0},
+/**/ {0x4, 0x1, 0x0, 0x2, 0x1, 0x5, 0x6, 0x0, 0x0, 0x2, 0x1f, 0x3},
+/**/ {0x5, 0x1, 0x0, 0x2, 0x0, 0x7, 0x8, 0x0, 0x0, 0x2, 0x1f, 0x2},
+/**/ {0x6, 0x1, 0x0, 0x0, 0x0, 0x7, 0x7, 0x1, 0x0, 0x4, 0x3f, 0x0},
+/**/ {0x7, 0x2, 0x6, 0x0, 0x0, 0x5, 0x5, 0x1, 0x0, 0x2, 0x1e, 0x0},
+/**/ {-1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x00, 0x0},
+        };
+
+        static constexpr int MaxPartitions = 64;
+        static constexpr int MaxSubsets = 3;
+
+        static constexpr uint8_t PartitionTable2[MaxPartitions][16] = {
+            {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1},
+            {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1},
+            {0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1},
+            {0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1},
+            {0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1},
+            {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1},
+            {0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1},
+            {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1},
+            {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1},
+            {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1},
+            {0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0},
+            {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0},
+            {0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0},
+            {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0},
+            {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0},
+            {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0},
+            {0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1},
+            {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0},
+            {0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0},
+            {0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0},
+            {0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0},
+            {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0},
+            {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0},
+            {0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0},
+            {0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0},
+            {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+            {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1},
+            {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0},
+            {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0},
+            {0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0},
+            {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0},
+            {0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1},
+            {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1},
+            {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0},
+            {0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0},
+            {0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0},
+            {0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0},
+            {0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0},
+            {0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1},
+            {0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1},
+            {0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0},
+            {0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0},
+            {0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0},
+            {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0},
+            {0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0},
+            {0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1},
+            {0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1},
+            {0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0},
+            {0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0},
+            {0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1},
+            {0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1},
+            {0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1},
+            {0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1},
+            {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1},
+            {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0},
+            {0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0},
+            {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1},
+        };
+
+        static constexpr uint8_t PartitionTable3[MaxPartitions][16] = {
+            {0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2},
+            {0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1},
+            {0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1},
+            {0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2},
+            {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2},
+            {0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1},
+            {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1},
+            {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2},
+            {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2},
+            {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2},
+            {0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2},
+            {0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2},
+            {0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2},
+            {0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2},
+            {0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0},
+            {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2},
+            {0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0},
+            {0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2},
+            {0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1},
+            {0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2},
+            {0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1},
+            {0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2},
+            {0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0},
+            {0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0},
+            {0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2},
+            {0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0},
+            {0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1},
+            {0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2},
+            {0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2},
+            {0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1},
+            {0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1},
+            {0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2},
+            {0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1},
+            {0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2},
+            {0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0},
+            {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0},
+            {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+            {0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0},
+            {0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1},
+            {0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1},
+            {0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2},
+            {0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1},
+            {0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2},
+            {0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1},
+            {0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1},
+            {0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1},
+            {0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1},
+            {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2},
+            {0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1},
+            {0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2},
+            {0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2},
+            {0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2},
+            {0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2},
+            {0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2},
+            {0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2},
+            {0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2},
+            {0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2},
+            {0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2},
+            {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2},
+            {0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1},
+            {0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2},
+            {0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
+            {0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0},
+        };
+
+        static constexpr uint8_t AnchorTable2[MaxPartitions] = {
+// @fmt:off
+0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
+0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
+0xf, 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0xf,
+0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0x2, 0x2,
+0xf, 0xf, 0x6, 0x8, 0x2, 0x8, 0xf, 0xf,
+0x2, 0x8, 0x2, 0x2, 0x2, 0xf, 0xf, 0x6,
+0x6, 0x2, 0x6, 0x8, 0xf, 0xf, 0x2, 0x2,
+0xf, 0xf, 0xf, 0xf, 0xf, 0x2, 0x2, 0xf,
+// @fmt:on
+        };
+
+        static constexpr uint8_t AnchorTable3a[MaxPartitions] = {
+// @fmt:off
+0x3, 0x3, 0xf, 0xf, 0x8, 0x3, 0xf, 0xf,
+0x8, 0x8, 0x6, 0x6, 0x6, 0x5, 0x3, 0x3,
+0x3, 0x3, 0x8, 0xf, 0x3, 0x3, 0x6, 0xa,
+0x5, 0x8, 0x8, 0x6, 0x8, 0x5, 0xf, 0xf,
+0x8, 0xf, 0x3, 0x5, 0x6, 0xa, 0x8, 0xf,
+0xf, 0x3, 0xf, 0x5, 0xf, 0xf, 0xf, 0xf,
+0x3, 0xf, 0x5, 0x5, 0x5, 0x8, 0x5, 0xa,
+0x5, 0xa, 0x8, 0xd, 0xf, 0xc, 0x3, 0x3,
+// @fmt:on
+        };
+
+        static constexpr uint8_t AnchorTable3b[MaxPartitions] = {
+// @fmt:off
+0xf, 0x8, 0x8, 0x3, 0xf, 0xf, 0x3, 0x8,
+0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x8,
+0xf, 0x8, 0xf, 0x3, 0xf, 0x8, 0xf, 0x8,
+0x3, 0xf, 0x6, 0xa, 0xf, 0xf, 0xa, 0x8,
+0xf, 0x3, 0xf, 0xa, 0xa, 0x8, 0x9, 0xa,
+0x6, 0xf, 0x8, 0xf, 0x3, 0x6, 0x6, 0x8,
+0xf, 0x3, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
+0xf, 0xf, 0xf, 0xf, 0x3, 0xf, 0xf, 0x8,
+// @fmt:on
+        };
+
+        struct Color {
+            struct RGB {
+                RGB() = default;
+
+                RGB(uint8_t r, uint8_t g, uint8_t b)
+                    : b(b), g(g), r(r) {}
+
+                RGB(int r, int g, int b)
+                    : b(static_cast<uint8_t>(b)), g(static_cast<uint8_t>(g)), r(static_cast<uint8_t>(r)) {}
+
+                RGB operator<<(int shift) const { return {r << shift, g << shift, b << shift}; }
+
+                RGB operator>>(int shift) const { return {r >> shift, g >> shift, b >> shift}; }
+
+                RGB operator|(int bits) const { return {r | bits, g | bits, b | bits}; }
+
+                RGB operator|(const RGB &rhs) const { return {r | rhs.r, g | rhs.g, b | rhs.b}; }
+
+                RGB operator+(const RGB &rhs) const { return {r + rhs.r, g + rhs.g, b + rhs.b}; }
+
+                uint8_t b;
+                uint8_t g;
+                uint8_t r;
+            };
+
+            RGB rgb;
+            uint8_t a;
+        };
+
+        static_assert(sizeof(Color) == 4, "Color size must be 4 bytes");
+
+        struct Block {
+            constexpr uint64_t Get(const Bitfield &bf) const {
+                uint64_t mask = (1ULL << bf.count) - 1;
+                if (bf.offset + bf.count <= 64) {
+                    return (low >> bf.offset) & mask;
+                }
+                if (bf.offset >= 64) {
+                    return (high >> (bf.offset - 64)) & mask;
+                }
+                return ((low >> bf.offset) | (high << (64 - bf.offset))) & mask;
+            }
+
+            const Mode &mode() const {
+                if ((low & 0b00000001) != 0) {
+                    return Modes[0];
+                }
+                if ((low & 0b00000010) != 0) {
+                    return Modes[1];
+                }
+                if ((low & 0b00000100) != 0) {
+                    return Modes[2];
+                }
+                if ((low & 0b00001000) != 0) {
+                    return Modes[3];
+                }
+                if ((low & 0b00010000) != 0) {
+                    return Modes[4];
+                }
+                if ((low & 0b00100000) != 0) {
+                    return Modes[5];
+                }
+                if ((low & 0b01000000) != 0) {
+                    return Modes[6];
+                }
+                if ((low & 0b10000000) != 0) {
+                    return Modes[7];
+                }
+                return Modes[8];  // Invalid mode
+            }
+
+            struct IndexInfo {
+                uint64_t value;
+                int numBits;
+            };
+
+            uint8_t interpolate(uint8_t e0, uint8_t e1, const IndexInfo &index) const {
+                static constexpr uint16_t weights2[] = {0, 21, 43, 64};
+                static constexpr uint16_t weights3[] = {0, 9, 18, 27, 37, 46, 55, 64};
+                static constexpr uint16_t weights4[] = {0, 4, 9, 13, 17, 21, 26, 30,
+                                                        34, 38, 43, 47, 51, 55, 60, 64};
+                static constexpr uint16_t const *weightsN[] = {
+                    nullptr, nullptr, weights2, weights3, weights4
+                };
+                auto weights = weightsN[index.numBits];
+                assert(weights != nullptr);
+                return (uint8_t) (((64 - weights[index.value]) * uint16_t(e0) + weights[index.value] * uint16_t(e1) + 32) >> 6);
+            }
+
+            void decode(uint8_t *dst, size_t dstX, size_t dstY, size_t dstWidth, size_t dstHeight, size_t dstPitch) const {
+                auto const &mode = this->mode();
+
+                if (mode.IDX < 0)  // Invalid mode:
+                {
+                    for (size_t y = 0; y < 4 && y + dstY < dstHeight; y++) {
+                        for (size_t x = 0; x < 4 && x + dstX < dstWidth; x++) {
+                            auto out = reinterpret_cast<Color *>(dst + sizeof(Color) * x + dstPitch * y);
+                            out->rgb = {0, 0, 0};
+                            out->a = 0;
+                        }
+                    }
+                    return;
+                }
+
+                using Endpoint = std::array<Color, 2>;
+                std::array<Endpoint, MaxSubsets> subsets;
+
+                for (size_t i = 0; i < mode.NS; i++) {
+                    auto &subset = subsets[i];
+                    subset[0].rgb.r = Get(mode.Red(i * 2 + 0));
+                    subset[0].rgb.g = Get(mode.Green(i * 2 + 0));
+                    subset[0].rgb.b = Get(mode.Blue(i * 2 + 0));
+                    subset[0].a = (mode.AB > 0) ? Get(mode.Alpha(i * 2 + 0)) : 255;
+
+                    subset[1].rgb.r = Get(mode.Red(i * 2 + 1));
+                    subset[1].rgb.g = Get(mode.Green(i * 2 + 1));
+                    subset[1].rgb.b = Get(mode.Blue(i * 2 + 1));
+                    subset[1].a = (mode.AB > 0) ? Get(mode.Alpha(i * 2 + 1)) : 255;
+                }
+
+                if (mode.SPB > 0) {
+                    auto pbit0 = Get(mode.SharedPBit0());
+                    auto pbit1 = Get(mode.SharedPBit1());
+                    subsets[0][0].rgb = (subsets[0][0].rgb << 1) | pbit0;
+                    subsets[0][1].rgb = (subsets[0][1].rgb << 1) | pbit0;
+                    subsets[1][0].rgb = (subsets[1][0].rgb << 1) | pbit1;
+                    subsets[1][1].rgb = (subsets[1][1].rgb << 1) | pbit1;
+                }
+
+                if (mode.EPB > 0) {
+                    for (size_t i = 0; i < mode.NS; i++) {
+                        auto &subset = subsets[i];
+                        auto pbit0 = Get(mode.EndpointPBit(i * 2 + 0));
+                        auto pbit1 = Get(mode.EndpointPBit(i * 2 + 1));
+                        subset[0].rgb = (subset[0].rgb << 1) | pbit0;
+                        subset[1].rgb = (subset[1].rgb << 1) | pbit1;
+                        if (mode.AB > 0) {
+                            subset[0].a = (subset[0].a << 1) | pbit0;
+                            subset[1].a = (subset[1].a << 1) | pbit1;
+                        }
+                    }
+                }
+
+                auto const colorBits = mode.CB + mode.SPB + mode.EPB;
+                auto const alphaBits = mode.AB + mode.SPB + mode.EPB;
+
+                for (size_t i = 0; i < mode.NS; i++) {
+                    auto &subset = subsets[i];
+                    subset[0].rgb = subset[0].rgb << (8 - colorBits);
+                    subset[1].rgb = subset[1].rgb << (8 - colorBits);
+                    subset[0].rgb = subset[0].rgb | (subset[0].rgb >> colorBits);
+                    subset[1].rgb = subset[1].rgb | (subset[1].rgb >> colorBits);
+
+                    if (mode.AB > 0) {
+                        subset[0].a = subset[0].a << (8 - alphaBits);
+                        subset[1].a = subset[1].a << (8 - alphaBits);
+                        subset[0].a = subset[0].a | (subset[0].a >> alphaBits);
+                        subset[1].a = subset[1].a | (subset[1].a >> alphaBits);
+                    }
+                }
+
+                int colorIndexBitOffset = 0;
+                int alphaIndexBitOffset = 0;
+                for (int y = 0; y < 4; y++) {
+                    for (int x = 0; x < 4; x++) {
+                        auto texelIdx = y * 4 + x;
+                        auto partitionIdx = Get(mode.Partition());
+                        assert(partitionIdx < MaxPartitions);
+                        auto subsetIdx = subsetIndex(mode, partitionIdx, texelIdx);
+                        assert(subsetIdx < MaxSubsets);
+                        auto const &subset = subsets[subsetIdx];
+
+                        auto anchorIdx = anchorIndex(mode, partitionIdx, subsetIdx);
+                        auto isAnchor = anchorIdx == texelIdx;
+                        auto colorIdx = colorIndex(mode, isAnchor, colorIndexBitOffset);
+                        auto alphaIdx = alphaIndex(mode, isAnchor, alphaIndexBitOffset);
+
+                        if (y + dstY >= dstHeight || x + dstX >= dstWidth) {
+                            // Don't be tempted to skip early at the loops:
+                            // The calls to colorIndex() and alphaIndex() adjust bit
+                            // offsets that need to be carefully tracked.
+                            continue;
+                        }
+
+                        Color output;
+                        // Note: We flip r and b channels past this point as the texture storage is BGR while the output is RGB
+                        output.rgb.r = interpolate(subset[0].rgb.b, subset[1].rgb.b, colorIdx);
+                        output.rgb.g = interpolate(subset[0].rgb.g, subset[1].rgb.g, colorIdx);
+                        output.rgb.b = interpolate(subset[0].rgb.r, subset[1].rgb.r, colorIdx);
+                        output.a = interpolate(subset[0].a, subset[1].a, alphaIdx);
+
+                        switch (Get(mode.Rotation())) {
+                            default:
+                                break;
+                            case 1:
+                                std::swap(output.a, output.rgb.b);
+                                break;
+                            case 2:
+                                std::swap(output.a, output.rgb.g);
+                                break;
+                            case 3:
+                                std::swap(output.a, output.rgb.r);
+                                break;
+                        }
+
+                        auto out = reinterpret_cast<Color *>(dst + sizeof(Color) * x + dstPitch * y);
+                        *out = output;
+                    }
+                }
+            }
+
+            int subsetIndex(const Mode &mode, int partitionIdx, int texelIndex) const {
+                switch (mode.NS) {
+                    default:
+                        return 0;
+                    case 2:
+                        return PartitionTable2[partitionIdx][texelIndex];
+                    case 3:
+                        return PartitionTable3[partitionIdx][texelIndex];
+                }
+            }
+
+            int anchorIndex(const Mode &mode, int partitionIdx, int subsetIdx) const {
+                // ARB_texture_compression_bptc states:
+                // "In partition zero, the anchor index is always index zero.
+                // In other partitions, the anchor index is specified by tables
+                // Table.A2 and Table.A3.""
+                // Note: This is really confusing - I believe they meant subset instead
+                // of partition here.
+                switch (subsetIdx) {
+                    default:
+                        return 0;
+                    case 1:
+                        return mode.NS == 2 ? AnchorTable2[partitionIdx] : AnchorTable3a[partitionIdx];
+                    case 2:
+                        return AnchorTable3b[partitionIdx];
+                }
+            }
+
+            IndexInfo colorIndex(const Mode &mode, bool isAnchor,
+                                 int &indexBitOffset) const {
+                // ARB_texture_compression_bptc states:
+                // "The index value for interpolating color comes from the secondary
+                // index for the texel if the format has an index selection bit and its
+                // value is one and from the primary index otherwise.""
+                auto idx = Get(mode.IndexSelection());
+                assert(idx <= 1);
+                bool secondary = idx == 1;
+                auto numBits = secondary ? mode.IB2 : mode.IB;
+                auto numReadBits = numBits - (isAnchor ? 1 : 0);
+                auto index =
+                    Get(secondary ? mode.SecondaryIndex(indexBitOffset, numReadBits)
+                                  : mode.PrimaryIndex(indexBitOffset, numReadBits));
+                indexBitOffset += numReadBits;
+                return {index, numBits};
+            }
+
+            IndexInfo alphaIndex(const Mode &mode, bool isAnchor,
+                                 int &indexBitOffset) const {
+                // ARB_texture_compression_bptc states:
+                // "The alpha index comes from the secondary index if the block has a
+                // secondary index and the block either doesn't have an index selection
+                // bit or that bit is zero and the primary index otherwise."
+                auto idx = Get(mode.IndexSelection());
+                assert(idx <= 1);
+                bool secondary = (mode.IB2 != 0) && (idx == 0);
+                auto numBits = secondary ? mode.IB2 : mode.IB;
+                auto numReadBits = numBits - (isAnchor ? 1 : 0);
+                auto index =
+                    Get(secondary ? mode.SecondaryIndex(indexBitOffset, numReadBits)
+                                  : mode.PrimaryIndex(indexBitOffset, numReadBits));
+                indexBitOffset += numReadBits;
+                return {index, numBits};
+            }
+
+            // Assumes little-endian
+            uint64_t low;
+            uint64_t high;
+        };
+
+    }  // namespace BC7
+}  // anonymous namespace
+
+namespace bcn {
+    constexpr size_t R8Bpp{1}; //!< The amount of bytes per pixel in R8
+    constexpr size_t R8g8Bpp{2}; //!< The amount of bytes per pixel in R8G8
+    constexpr size_t R8g8b8a8Bpp{4}; //!< The amount of bytes per pixel in R8G8B8A8
+    constexpr size_t R16g16b16a16Bpp{8}; //!< The amount of bytes per pixel in R16G16B16
+
+    void DecodeBc1(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) {
+        const auto *color{reinterpret_cast<const BC_color *>(src)};
+        size_t pitch{R8g8b8a8Bpp * width};
+        color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, true, false);
+    }
+
+    void DecodeBc2(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) {
+        const auto *alpha{reinterpret_cast<const BC_alpha *>(src)};
+        const auto *color{reinterpret_cast<const BC_color *>(src + 8)};
+        size_t pitch{R8g8b8a8Bpp * width};
+        color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, false, true);
+        alpha->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp);
+    }
+
+    void DecodeBc3(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) {
+        const auto *alpha{reinterpret_cast<const BC_channel *>(src)};
+        const auto *color{reinterpret_cast<const BC_color *>(src + 8)};
+        size_t pitch{R8g8b8a8Bpp * width};
+        color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, false, true);
+        alpha->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, 3, false);
+    }
+
+    void DecodeBc4(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) {
+        const auto *red{reinterpret_cast<const BC_channel *>(src)};
+        size_t pitch{R8Bpp * width};
+        red->decode(dst, x, y, width, height, pitch, R8Bpp, 0, isSigned);
+    }
+
+    void DecodeBc5(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) {
+        const auto *red{reinterpret_cast<const BC_channel *>(src)};
+        const auto *green{reinterpret_cast<const BC_channel *>(src + 8)};
+        size_t pitch{R8g8Bpp * width};
+        red->decode(dst, x, y, width, height, pitch, R8g8Bpp, 0, isSigned);
+        green->decode(dst, x, y, width, height, pitch, R8g8Bpp, 1, isSigned);
+    }
+
+    void DecodeBc6(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) {
+        const auto *block{reinterpret_cast<const BC6H::Block *>(src)};
+        size_t pitch{R16g16b16a16Bpp * width};
+        block->decode(dst, x, y, width, height, pitch, R16g16b16a16Bpp, isSigned);
+    }
+
+    void DecodeBc7(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) {
+        const auto *block{reinterpret_cast<const BC7::Block *>(src)};
+        size_t pitch{R8g8b8a8Bpp * width};
+        block->decode(dst, x, y, width, height, pitch);
+    }
+}
diff --git a/externals/bc_decoder/bc_decoder.h b/externals/bc_decoder/bc_decoder.h
new file mode 100644
index 000000000..4f0ead7d3
--- /dev/null
+++ b/externals/bc_decoder/bc_decoder.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include <cstdint>
+
+namespace bcn {
+    /**
+     * @brief Decodes a BC1 encoded image to R8G8B8A8
+     */
+    void DecodeBc1(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height);
+
+    /**
+     * @brief Decodes a BC2 encoded image to R8G8B8A8
+     */
+    void DecodeBc2(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height);
+
+    /**
+     * @brief Decodes a BC3 encoded image to R8G8B8A8
+     */
+    void DecodeBc3(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height);
+
+    /**
+     * @brief Decodes a BC4 encoded image to R8
+     */
+    void DecodeBc4(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned);
+
+    /**
+     * @brief Decodes a BC5 encoded image to R8G8
+     */
+    void DecodeBc5(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned);
+
+    /**
+     * @brief Decodes a BC6 encoded image to R16G16B16A16
+     */
+    void DecodeBc6(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned);
+
+    /**
+     * @brief Decodes a BC7 encoded image to R8G8B8A8
+     */
+    void DecodeBc7(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height);
+}
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index e9e6f278d..3b2fe01da 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -220,8 +220,8 @@ add_library(video_core STATIC
     surface.h
     texture_cache/accelerated_swizzle.cpp
     texture_cache/accelerated_swizzle.h
-    texture_cache/decode_bc4.cpp
-    texture_cache/decode_bc4.h
+    texture_cache/decode_bc.cpp
+    texture_cache/decode_bc.h
     texture_cache/descriptor_table.h
     texture_cache/formatter.cpp
     texture_cache/formatter.h
@@ -279,7 +279,7 @@ add_library(video_core STATIC
 create_target_directory_groups(video_core)
 
 target_link_libraries(video_core PUBLIC common core)
-target_link_libraries(video_core PUBLIC glad shader_recompiler stb)
+target_link_libraries(video_core PUBLIC glad shader_recompiler stb bc_decoder)
 
 if (YUZU_USE_BUNDLED_FFMPEG AND NOT (WIN32 OR ANDROID))
     add_dependencies(video_core ffmpeg-build)
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 9a0b10568..a8540339d 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -259,6 +259,26 @@ FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with
             break;
         }
     }
+    // Transcode on hardware that doesn't support BCn natively
+    if (!device.IsOptimalBcnSupported() && VideoCore::Surface::IsPixelFormatBCn(pixel_format)) {
+        const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format);
+        if (pixel_format == PixelFormat::BC4_SNORM) {
+            tuple.format = VK_FORMAT_R8_SNORM;
+        } else if (pixel_format == PixelFormat::BC4_UNORM) {
+            tuple.format = VK_FORMAT_R8_UNORM;
+        } else if (pixel_format == PixelFormat::BC5_SNORM) {
+            tuple.format = VK_FORMAT_R8G8_SNORM;
+        } else if (pixel_format == PixelFormat::BC5_UNORM) {
+            tuple.format = VK_FORMAT_R8G8_UNORM;
+        } else if (pixel_format == PixelFormat::BC6H_SFLOAT ||
+                   pixel_format == PixelFormat::BC6H_UFLOAT) {
+            tuple.format = VK_FORMAT_R16G16B16A16_SFLOAT;
+        } else if (is_srgb) {
+            tuple.format = VK_FORMAT_A8B8G8R8_SRGB_PACK32;
+        } else {
+            tuple.format = VK_FORMAT_A8B8G8R8_UNORM_PACK32;
+        }
+    }
     const bool attachable = (tuple.usage & Attachable) != 0;
     const bool storage = (tuple.usage & Storage) != 0;
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 268b955fb..f7c0d939a 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -315,7 +315,14 @@ void RasterizerVulkan::Clear(u32 layer_count) {
     FlushWork();
     gpu_memory->FlushCaching();
 
+#if ANDROID
+    if (Settings::IsGPULevelHigh()) {
+        // This is problematic on Android, disable on GPU Normal.
+        query_cache.UpdateCounters();
+    }
+#else
     query_cache.UpdateCounters();
+#endif
 
     auto& regs = maxwell3d->regs;
     const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B ||
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index ce6acc30c..8385b5509 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -1279,6 +1279,10 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu
         flags |= VideoCommon::ImageFlagBits::Converted;
         flags |= VideoCommon::ImageFlagBits::CostlyLoad;
     }
+    if (IsPixelFormatBCn(info.format) && !runtime->device.IsOptimalBcnSupported()) {
+        flags |= VideoCommon::ImageFlagBits::Converted;
+        flags |= VideoCommon::ImageFlagBits::CostlyLoad;
+    }
     if (runtime->device.HasDebuggingToolAttached()) {
         original_image.SetObjectNameEXT(VideoCommon::Name(*this).c_str());
     }
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index cb51529e4..e16cd5e73 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -269,6 +269,28 @@ bool IsPixelFormatASTC(PixelFormat format) {
     }
 }
 
+bool IsPixelFormatBCn(PixelFormat format) {
+    switch (format) {
+    case PixelFormat::BC1_RGBA_UNORM:
+    case PixelFormat::BC2_UNORM:
+    case PixelFormat::BC3_UNORM:
+    case PixelFormat::BC4_UNORM:
+    case PixelFormat::BC4_SNORM:
+    case PixelFormat::BC5_UNORM:
+    case PixelFormat::BC5_SNORM:
+    case PixelFormat::BC1_RGBA_SRGB:
+    case PixelFormat::BC2_SRGB:
+    case PixelFormat::BC3_SRGB:
+    case PixelFormat::BC7_UNORM:
+    case PixelFormat::BC6H_UFLOAT:
+    case PixelFormat::BC6H_SFLOAT:
+    case PixelFormat::BC7_SRGB:
+        return true;
+    default:
+        return false;
+    }
+}
+
 bool IsPixelFormatSRGB(PixelFormat format) {
     switch (format) {
     case PixelFormat::A8B8G8R8_SRGB:
diff --git a/src/video_core/surface.h b/src/video_core/surface.h
index 0225d3287..9b9c4d9bc 100644
--- a/src/video_core/surface.h
+++ b/src/video_core/surface.h
@@ -501,6 +501,8 @@ SurfaceType GetFormatType(PixelFormat pixel_format);
 
 bool IsPixelFormatASTC(PixelFormat format);
 
+bool IsPixelFormatBCn(PixelFormat format);
+
 bool IsPixelFormatSRGB(PixelFormat format);
 
 bool IsPixelFormatInteger(PixelFormat format);
diff --git a/src/video_core/texture_cache/decode_bc.cpp b/src/video_core/texture_cache/decode_bc.cpp
new file mode 100644
index 000000000..3e26474a3
--- /dev/null
+++ b/src/video_core/texture_cache/decode_bc.cpp
@@ -0,0 +1,129 @@
+// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <algorithm>
+#include <array>
+#include <span>
+#include <bc_decoder.h>
+
+#include "common/common_types.h"
+#include "video_core/texture_cache/decode_bc.h"
+
+namespace VideoCommon {
+
+namespace {
+constexpr u32 BLOCK_SIZE = 4;
+
+using VideoCore::Surface::PixelFormat;
+
+constexpr bool IsSigned(PixelFormat pixel_format) {
+    switch (pixel_format) {
+    case PixelFormat::BC4_SNORM:
+    case PixelFormat::BC4_UNORM:
+    case PixelFormat::BC5_SNORM:
+    case PixelFormat::BC5_UNORM:
+    case PixelFormat::BC6H_SFLOAT:
+    case PixelFormat::BC6H_UFLOAT:
+        return true;
+    default:
+        return false;
+    }
+}
+
+constexpr u32 BlockSize(PixelFormat pixel_format) {
+    switch (pixel_format) {
+    case PixelFormat::BC1_RGBA_SRGB:
+    case PixelFormat::BC1_RGBA_UNORM:
+    case PixelFormat::BC4_SNORM:
+    case PixelFormat::BC4_UNORM:
+        return 8;
+    default:
+        return 16;
+    }
+}
+} // Anonymous namespace
+
+u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format) {
+    switch (pixel_format) {
+    case PixelFormat::BC4_SNORM:
+    case PixelFormat::BC4_UNORM:
+        return 1;
+    case PixelFormat::BC5_SNORM:
+    case PixelFormat::BC5_UNORM:
+        return 2;
+    case PixelFormat::BC6H_SFLOAT:
+    case PixelFormat::BC6H_UFLOAT:
+        return 8;
+    default:
+        return 4;
+    }
+}
+
+template <auto decompress, PixelFormat pixel_format>
+void DecompressBlocks(std::span<const u8> input, std::span<u8> output, Extent3D extent,
+                      bool is_signed = false) {
+    const u32 out_bpp = ConvertedBytesPerBlock(pixel_format);
+    const u32 block_width = std::min(extent.width, BLOCK_SIZE);
+    const u32 block_height = std::min(extent.height, BLOCK_SIZE);
+    const u32 pitch = extent.width * out_bpp;
+    size_t input_offset = 0;
+    size_t output_offset = 0;
+    for (u32 slice = 0; slice < extent.depth; ++slice) {
+        for (u32 y = 0; y < extent.height; y += block_height) {
+            size_t row_offset = 0;
+            for (u32 x = 0; x < extent.width;
+                 x += block_width, row_offset += block_width * out_bpp) {
+                const u8* src = input.data() + input_offset;
+                u8* const dst = output.data() + output_offset + row_offset;
+                if constexpr (IsSigned(pixel_format)) {
+                    decompress(src, dst, x, y, extent.width, extent.height, is_signed);
+                } else {
+                    decompress(src, dst, x, y, extent.width, extent.height);
+                }
+                input_offset += BlockSize(pixel_format);
+            }
+            output_offset += block_height * pitch;
+        }
+    }
+}
+
+void DecompressBCn(std::span<const u8> input, std::span<u8> output, Extent3D extent,
+                   VideoCore::Surface::PixelFormat pixel_format) {
+    switch (pixel_format) {
+    case PixelFormat::BC1_RGBA_UNORM:
+    case PixelFormat::BC1_RGBA_SRGB:
+        DecompressBlocks<bcn::DecodeBc1, PixelFormat::BC1_RGBA_UNORM>(input, output, extent);
+        break;
+    case PixelFormat::BC2_UNORM:
+    case PixelFormat::BC2_SRGB:
+        DecompressBlocks<bcn::DecodeBc2, PixelFormat::BC2_UNORM>(input, output, extent);
+        break;
+    case PixelFormat::BC3_UNORM:
+    case PixelFormat::BC3_SRGB:
+        DecompressBlocks<bcn::DecodeBc3, PixelFormat::BC3_UNORM>(input, output, extent);
+        break;
+    case PixelFormat::BC4_SNORM:
+    case PixelFormat::BC4_UNORM:
+        DecompressBlocks<bcn::DecodeBc4, PixelFormat::BC4_UNORM>(
+            input, output, extent, pixel_format == PixelFormat::BC4_SNORM);
+        break;
+    case PixelFormat::BC5_SNORM:
+    case PixelFormat::BC5_UNORM:
+        DecompressBlocks<bcn::DecodeBc5, PixelFormat::BC5_UNORM>(
+            input, output, extent, pixel_format == PixelFormat::BC5_SNORM);
+        break;
+    case PixelFormat::BC6H_SFLOAT:
+    case PixelFormat::BC6H_UFLOAT:
+        DecompressBlocks<bcn::DecodeBc6, PixelFormat::BC6H_UFLOAT>(
+            input, output, extent, pixel_format == PixelFormat::BC6H_SFLOAT);
+        break;
+    case PixelFormat::BC7_SRGB:
+    case PixelFormat::BC7_UNORM:
+        DecompressBlocks<bcn::DecodeBc7, PixelFormat::BC7_UNORM>(input, output, extent);
+        break;
+    default:
+        LOG_WARNING(HW_GPU, "Unimplemented BCn decompression {}", pixel_format);
+    }
+}
+
+} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/decode_bc.h b/src/video_core/texture_cache/decode_bc.h
new file mode 100644
index 000000000..41d1ec0a3
--- /dev/null
+++ b/src/video_core/texture_cache/decode_bc.h
@@ -0,0 +1,19 @@
+// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <span>
+
+#include "common/common_types.h"
+#include "video_core/surface.h"
+#include "video_core/texture_cache/types.h"
+
+namespace VideoCommon {
+
+[[nodiscard]] u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format);
+
+void DecompressBCn(std::span<const u8> input, std::span<u8> output, Extent3D extent,
+                   VideoCore::Surface::PixelFormat pixel_format);
+
+} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/decode_bc4.cpp b/src/video_core/texture_cache/decode_bc4.cpp
deleted file mode 100644
index ef98afdca..000000000
--- a/src/video_core/texture_cache/decode_bc4.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <algorithm>
-#include <array>
-#include <span>
-
-#include "common/assert.h"
-#include "common/common_types.h"
-#include "video_core/texture_cache/decode_bc4.h"
-#include "video_core/texture_cache/types.h"
-
-namespace VideoCommon {
-
-// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt
-[[nodiscard]] constexpr u32 DecompressBlock(u64 bits, u32 x, u32 y) {
-    const u32 code_offset = 16 + 3 * (4 * y + x);
-    const u32 code = (bits >> code_offset) & 7;
-    const u32 red0 = (bits >> 0) & 0xff;
-    const u32 red1 = (bits >> 8) & 0xff;
-    if (red0 > red1) {
-        switch (code) {
-        case 0:
-            return red0;
-        case 1:
-            return red1;
-        case 2:
-            return (6 * red0 + 1 * red1) / 7;
-        case 3:
-            return (5 * red0 + 2 * red1) / 7;
-        case 4:
-            return (4 * red0 + 3 * red1) / 7;
-        case 5:
-            return (3 * red0 + 4 * red1) / 7;
-        case 6:
-            return (2 * red0 + 5 * red1) / 7;
-        case 7:
-            return (1 * red0 + 6 * red1) / 7;
-        }
-    } else {
-        switch (code) {
-        case 0:
-            return red0;
-        case 1:
-            return red1;
-        case 2:
-            return (4 * red0 + 1 * red1) / 5;
-        case 3:
-            return (3 * red0 + 2 * red1) / 5;
-        case 4:
-            return (2 * red0 + 3 * red1) / 5;
-        case 5:
-            return (1 * red0 + 4 * red1) / 5;
-        case 6:
-            return 0;
-        case 7:
-            return 0xff;
-        }
-    }
-    return 0;
-}
-
-void DecompressBC4(std::span<const u8> input, Extent3D extent, std::span<u8> output) {
-    UNIMPLEMENTED_IF_MSG(extent.width % 4 != 0, "Unaligned width={}", extent.width);
-    UNIMPLEMENTED_IF_MSG(extent.height % 4 != 0, "Unaligned height={}", extent.height);
-    static constexpr u32 BLOCK_SIZE = 4;
-    size_t input_offset = 0;
-    for (u32 slice = 0; slice < extent.depth; ++slice) {
-        for (u32 block_y = 0; block_y < extent.height / 4; ++block_y) {
-            for (u32 block_x = 0; block_x < extent.width / 4; ++block_x) {
-                u64 bits;
-                std::memcpy(&bits, &input[input_offset], sizeof(bits));
-                input_offset += sizeof(bits);
-
-                for (u32 y = 0; y < BLOCK_SIZE; ++y) {
-                    for (u32 x = 0; x < BLOCK_SIZE; ++x) {
-                        const u32 linear_z = slice;
-                        const u32 linear_y = block_y * BLOCK_SIZE + y;
-                        const u32 linear_x = block_x * BLOCK_SIZE + x;
-                        const u32 offset_z = linear_z * extent.width * extent.height;
-                        const u32 offset_y = linear_y * extent.width;
-                        const u32 offset_x = linear_x;
-                        const u32 output_offset = (offset_z + offset_y + offset_x) * 4ULL;
-                        const u32 color = DecompressBlock(bits, x, y);
-                        output[output_offset + 0] = static_cast<u8>(color);
-                        output[output_offset + 1] = 0;
-                        output[output_offset + 2] = 0;
-                        output[output_offset + 3] = 0xff;
-                    }
-                }
-            }
-        }
-    }
-}
-
-} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/decode_bc4.h b/src/video_core/texture_cache/decode_bc4.h
deleted file mode 100644
index ab2f735be..000000000
--- a/src/video_core/texture_cache/decode_bc4.h
+++ /dev/null
@@ -1,15 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#pragma once
-
-#include <span>
-
-#include "common/common_types.h"
-#include "video_core/texture_cache/types.h"
-
-namespace VideoCommon {
-
-void DecompressBC4(std::span<const u8> data, Extent3D extent, std::span<u8> output);
-
-} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index f781cb7a0..9a618a57a 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -24,7 +24,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
 #include "video_core/surface.h"
-#include "video_core/texture_cache/decode_bc4.h"
+#include "video_core/texture_cache/decode_bc.h"
 #include "video_core/texture_cache/format_lookup_table.h"
 #include "video_core/texture_cache/formatter.h"
 #include "video_core/texture_cache/samples_helper.h"
@@ -61,8 +61,6 @@ using VideoCore::Surface::PixelFormatFromDepthFormat;
 using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
 using VideoCore::Surface::SurfaceType;
 
-constexpr u32 CONVERTED_BYTES_PER_BLOCK = BytesPerBlock(PixelFormat::A8B8G8R8_UNORM);
-
 struct LevelInfo {
     Extent3D size;
     Extent3D block;
@@ -612,7 +610,8 @@ u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept {
         }
         return output_size;
     }
-    return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * CONVERTED_BYTES_PER_BLOCK;
+    return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers *
+           ConvertedBytesPerBlock(info.format);
 }
 
 u32 CalculateLayerStride(const ImageInfo& info) noexcept {
@@ -945,7 +944,8 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
                 tile_size.height, output.subspan(output_offset));
 
             output_offset += copy.image_extent.width * copy.image_extent.height *
-                             copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK;
+                             copy.image_subresource.num_layers *
+                             BytesPerBlock(PixelFormat::A8B8G8R8_UNORM);
         } else if (astc) {
             // BC1 uses 0.5 bytes per texel
             // BC3 uses 1 byte per texel
@@ -956,7 +956,8 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
 
             const u32 plane_dim = copy.image_extent.width * copy.image_extent.height;
             const u32 level_size = plane_dim * copy.image_extent.depth *
-                                   copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK;
+                                   copy.image_subresource.num_layers *
+                                   BytesPerBlock(PixelFormat::A8B8G8R8_UNORM);
             decode_scratch.resize_destructive(level_size);
 
             Tegra::Texture::ASTC::Decompress(
@@ -976,10 +977,15 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
                 bpp_div;
             output_offset += static_cast<u32>(copy.buffer_size);
         } else {
-            DecompressBC4(input_offset, copy.image_extent, output.subspan(output_offset));
-
+            const Extent3D image_extent{
+                .width = copy.image_extent.width,
+                .height = copy.image_extent.height * copy.image_subresource.num_layers,
+                .depth = copy.image_extent.depth,
+            };
+            DecompressBCn(input_offset, output.subspan(output_offset), image_extent, info.format);
             output_offset += copy.image_extent.width * copy.image_extent.height *
-                             copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK;
+                             copy.image_subresource.num_layers *
+                             ConvertedBytesPerBlock(info.format);
         }
     }
 }
diff --git a/src/video_core/textures/bcn.cpp b/src/video_core/textures/bcn.cpp
index 671212a49..16ddbe320 100644
--- a/src/video_core/textures/bcn.cpp
+++ b/src/video_core/textures/bcn.cpp
@@ -3,7 +3,6 @@
 
 #include <stb_dxt.h>
 #include <string.h>
-
 #include "common/alignment.h"
 #include "video_core/textures/bcn.h"
 #include "video_core/textures/workers.h"
diff --git a/src/video_core/textures/bcn.h b/src/video_core/textures/bcn.h
index 6464af885..d5d2a16c9 100644
--- a/src/video_core/textures/bcn.h
+++ b/src/video_core/textures/bcn.h
@@ -4,14 +4,13 @@
 #pragma once
 
 #include <span>
-#include <stdint.h>
+
+#include "common/common_types.h"
 
 namespace Tegra::Texture::BCN {
 
-void CompressBC1(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
-                 std::span<uint8_t> output);
+void CompressBC1(std::span<const u8> data, u32 width, u32 height, u32 depth, std::span<u8> output);
 
-void CompressBC3(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
-                 std::span<uint8_t> output);
+void CompressBC3(std::span<const u8> data, u32 width, u32 height, u32 depth, std::span<u8> output);
 
 } // namespace Tegra::Texture::BCN
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index e05d04db3..1f17265d5 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -293,6 +293,11 @@ public:
         return features.features.textureCompressionASTC_LDR;
     }
 
+    /// Returns true if BCn is natively supported.
+    bool IsOptimalBcnSupported() const {
+        return features.features.textureCompressionBC;
+    }
+
     /// Returns true if descriptor aliasing is natively supported.
     bool IsDescriptorAliasingSupported() const {
         return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY;
@@ -423,6 +428,11 @@ public:
         return extensions.sampler_filter_minmax;
     }
 
+    /// Returns true if the device supports VK_EXT_shader_stencil_export.
+    bool IsExtShaderStencilExportSupported() const {
+        return extensions.shader_stencil_export;
+    }
+
     /// Returns true if the device supports VK_EXT_depth_range_unrestricted.
     bool IsExtDepthRangeUnrestrictedSupported() const {
         return extensions.depth_range_unrestricted;
@@ -492,11 +502,6 @@ public:
         return extensions.vertex_input_dynamic_state;
     }
 
-    /// Returns true if the device supports VK_EXT_shader_stencil_export.
-    bool IsExtShaderStencilExportSupported() const {
-        return extensions.shader_stencil_export;
-    }
-
     /// Returns true if the device supports VK_EXT_shader_demote_to_helper_invocation
     bool IsExtShaderDemoteToHelperInvocationSupported() const {
         return extensions.shader_demote_to_helper_invocation;
-- 
cgit v1.2.3


From b62121fd605663dc9aaaae72fe8e444312f9c5d5 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Thu, 29 Jun 2023 11:58:45 +0200
Subject: Texture cache: Fix YFC regression due to code testing

---
 src/video_core/texture_cache/texture_cache.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'src/video_core/texture_cache')

diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d3f03a995..485f6b6f3 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -598,14 +598,6 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
                             [&](ImageId id, Image&) { deleted_images.push_back(id); });
     for (const ImageId id : deleted_images) {
         Image& image = slot_images[id];
-        if (True(image.flags & ImageFlagBits::CpuModified)) {
-            return;
-        }
-        image.flags |= ImageFlagBits::CpuModified;
-        if (True(image.flags & ImageFlagBits::Tracked)) {
-            UntrackImage(image, id);
-        }
-        /*
         if (True(image.flags & ImageFlagBits::Remapped)) {
             continue;
         }
@@ -613,7 +605,6 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
         if (True(image.flags & ImageFlagBits::Tracked)) {
             UntrackImage(image, id);
         }
-        */
     }
 }
 
-- 
cgit v1.2.3


From 596a6132b974dd73935854d8f51842424e058be8 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Thu, 29 Jun 2023 17:23:29 +0200
Subject: AccelerateDMA: Don't accelerate 3D texture DMA operations

---
 src/video_core/texture_cache/texture_cache.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/video_core/texture_cache')

diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d3f03a995..0330415b7 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -879,6 +879,10 @@ ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, boo
         return NULL_IMAGE_ID;
     }
     auto& image = slot_images[image_id];
+    if (image.info.type == ImageType::e3D) {
+        // Don't accelerate 3D images.
+        return NULL_IMAGE_ID;
+    }
     if (!is_upload && !image.info.dma_downloaded) {
         // Force a full sync.
         image.info.dma_downloaded = true;
-- 
cgit v1.2.3