45 files changed, 427 insertions, 238 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 58a45ab67..6ed4b78f2 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -115,7 +115,34 @@ void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
 
 template <class P>
 void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
-    memory_tracker.CachedCpuWrite(cpu_addr, size);
+    // A range that overlaps no registered region needs no further handling here.
+    if (!IsRegionRegistered(cpu_addr, size)) {
+        return;
+    }
+    const VAddr page_begin = Common::AlignDown(cpu_addr, YUZU_PAGESIZE);
+    const u64 page_span = Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE) - page_begin;
+    if (!IsRegionGpuModified(page_begin, page_span)) {
+        WriteMemory(cpu_addr, size);
+        return;
+    }
+    // Page-aligned span is GPU-modified: read the range back and pass it to the inline path.
+    tmp_buffer.resize_destructive(size);
+    cpu_memory.ReadBlockUnsafe(cpu_addr, tmp_buffer.data(), size);
+
+    InlineMemoryImplementation(cpu_addr, size, tmp_buffer);
+}
+
+template <class P>
+bool BufferCache<P>::OnCPUWrite(VAddr cpu_addr, u64 size) {
+    if (!IsRegionRegistered(cpu_addr, size)) {
+        return false;
+    }
+    // A GPU-modified region is only reported to the caller; any other one goes to WriteMemory.
+    const bool gpu_modified = memory_tracker.IsRegionGpuModified(cpu_addr, size);
+    if (!gpu_modified) {
+        WriteMemory(cpu_addr, size);
+    }
+    return gpu_modified;
 }
 
 template <class P>
@@ -207,9 +234,10 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
     if (has_new_downloads) {
         memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount);
     }
-    tmp_buffer.resize_destructive(amount);
-    cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount);
-    cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount);
+
+    Core::Memory::CpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::UnsafeReadWrite> tmp(
+        cpu_memory, *cpu_src_address, amount, &tmp_buffer);
+    tmp.SetAddressAndSize(*cpu_dest_address, amount);
     return true;
 }
 
@@ -1553,6 +1581,14 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
         return false;
     }
 
+    InlineMemoryImplementation(dest_address, copy_size, inlined_buffer);
+
+    return true;
+}
+
+template <class P>
+void BufferCache<P>::InlineMemoryImplementation(VAddr dest_address, size_t copy_size,
+                                                std::span<const u8> inlined_buffer) {
     const IntervalType subtract_interval{dest_address, dest_address + copy_size};
     ClearDownload(subtract_interval);
     common_ranges.subtract(subtract_interval);
@@ -1574,8 +1610,6 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
     } else {
         buffer.ImmediateUpload(buffer.Offset(dest_address), inlined_buffer.first(copy_size));
     }
-
-    return true;
 }
 
 template <class P>
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
index fe6068cfe..460fc7551 100644
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -245,6 +245,8 @@ public:
 
     void CachedWriteMemory(VAddr cpu_addr, u64 size);
 
+    bool OnCPUWrite(VAddr cpu_addr, u64 size);
+
     void DownloadMemory(VAddr cpu_addr, u64 size);
 
     std::optional<VideoCore::RasterizerDownloadArea> GetFlushArea(VAddr cpu_addr, u64 size);
@@ -543,6 +545,9 @@ private:
 
     void ClearDownload(IntervalType subtract_interval);
 
+    void InlineMemoryImplementation(VAddr dest_address, size_t copy_size,
+                                    std::span<const u8> inlined_buffer);
+
     VideoCore::RasterizerInterface& rasterizer;
     Core::Memory::Memory& cpu_memory;
 
diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp
index ab4f4d407..87d69ebc5 100644
--- a/src/video_core/compatible_formats.cpp
+++ b/src/video_core/compatible_formats.cpp
@@ -272,6 +272,9 @@ constexpr Table MakeNonNativeBgrCopyTable() {
 
 bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views,
                       bool native_bgr) {
+    if (format_a == format_b) {
+        return true;
+    }
     if (broken_views) {
         // If format views are broken, only accept formats that are identical.
         return format_a == format_b;
@@ -282,6 +285,9 @@ bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_vi
 }
 
 bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b, bool native_bgr) {
+    if (format_a == format_b) {
+        return true;
+    }
     static constexpr Table BGR_TABLE = MakeNativeBgrCopyTable();
     static constexpr Table NO_BGR_TABLE = MakeNonNativeBgrCopyTable();
     return IsSupported(native_bgr ? BGR_TABLE : NO_BGR_TABLE, format_a, format_b);
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 551929824..9f1b340a9 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -5,6 +5,7 @@
 #include "common/microprofile.h"
 #include "common/settings.h"
 #include "core/core.h"
+#include "core/memory.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/gpu.h"
@@ -12,6 +13,8 @@
 
 namespace Tegra {
 
+constexpr u32 MacroRegistersStart = 0xE00;
+
 DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_, MemoryManager& memory_manager_,
                      Control::ChannelState& channel_state_)
     : gpu{gpu_}, system{system_}, memory_manager{memory_manager_}, puller{gpu_, memory_manager_,
@@ -74,25 +77,16 @@ bool DmaPusher::Step() {
         }
 
         // Push buffer non-empty, read a word
-        command_headers.resize_destructive(command_list_header.size);
-        constexpr u32 MacroRegistersStart = 0xE00;
-        if (dma_state.method < MacroRegistersStart) {
-            if (Settings::IsGPULevelHigh()) {
-                memory_manager.ReadBlock(dma_state.dma_get, command_headers.data(),
-                                         command_list_header.size * sizeof(u32));
-            } else {
-                memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(),
-                                               command_list_header.size * sizeof(u32));
-            }
-        } else {
-            const size_t copy_size = command_list_header.size * sizeof(u32);
+        if (dma_state.method >= MacroRegistersStart) {
             if (subchannels[dma_state.subchannel]) {
-                subchannels[dma_state.subchannel]->current_dirty =
-                    memory_manager.IsMemoryDirty(dma_state.dma_get, copy_size);
+                subchannels[dma_state.subchannel]->current_dirty = memory_manager.IsMemoryDirty(
+                    dma_state.dma_get, command_list_header.size * sizeof(u32));
             }
-            memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(), copy_size);
         }
-        ProcessCommands(command_headers);
+        Core::Memory::GpuGuestMemory<Tegra::CommandHeader,
+                                     Core::Memory::GuestMemoryFlags::UnsafeRead>
+            headers(memory_manager, dma_state.dma_get, command_list_header.size, &command_headers);
+        ProcessCommands(headers);
     }
 
     return true;
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index 7f5a0c29d..bc64d4486 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -5,6 +5,7 @@
 
 #include "common/algorithm.h"
 #include "common/assert.h"
+#include "core/memory.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
@@ -46,15 +47,11 @@ void State::ProcessData(const u32* data, size_t num_data) {
 void State::ProcessData(std::span<const u8> read_buffer) {
     const GPUVAddr address{regs.dest.Address()};
     if (is_linear) {
-        if (regs.line_count == 1) {
-            rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer);
-        } else {
-            for (size_t line = 0; line < regs.line_count; ++line) {
-                const GPUVAddr dest_line = address + line * regs.dest.pitch;
-                std::span<const u8> buffer(read_buffer.data() + line * regs.line_length_in,
-                                           regs.line_length_in);
-                rasterizer->AccelerateInlineToMemory(dest_line, regs.line_length_in, buffer);
-            }
+        for (size_t line = 0; line < regs.line_count; ++line) {
+            const GPUVAddr dest_line = address + line * regs.dest.pitch;
+            std::span<const u8> buffer(read_buffer.data() + line * regs.line_length_in,
+                                       regs.line_length_in);
+            rasterizer->AccelerateInlineToMemory(dest_line, regs.line_length_in, buffer);
         }
     } else {
         u32 width = regs.dest.width;
@@ -70,13 +67,14 @@ void State::ProcessData(std::span<const u8> read_buffer) {
         const std::size_t dst_size = Tegra::Texture::CalculateSize(
             true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth,
             regs.dest.BlockHeight(), regs.dest.BlockDepth());
-        tmp_buffer.resize_destructive(dst_size);
-        memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
-        Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width,
-                                       regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
-                                       x_elements, regs.line_count, regs.dest.BlockHeight(),
+
+        Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite>
+            tmp(memory_manager, address, dst_size, &tmp_buffer);
+
+        Tegra::Texture::SwizzleSubrect(tmp, read_buffer, bytes_per_pixel, width, regs.dest.height,
+                                       regs.dest.depth, x_offset, regs.dest.y, x_elements,
+                                       regs.line_count, regs.dest.BlockHeight(),
                                        regs.dest.BlockDepth(), regs.line_length_in);
-        memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
     }
 }
 
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index 601095f03..a38d9528a 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -84,7 +84,6 @@ Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const {
 
     Texture::TICEntry tic_entry;
     memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
-
     return tic_entry;
 }
 
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 62d70e9f3..c3696096d 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -9,6 +9,7 @@
 #include "common/settings.h"
 #include "core/core.h"
 #include "core/core_timing.h"
+#include "core/memory.h"
 #include "video_core/dirty_flags.h"
 #include "video_core/engines/draw_manager.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -679,17 +680,14 @@ void Maxwell3D::ProcessCBData(u32 value) {
 Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
     const GPUVAddr tic_address_gpu{regs.tex_header.Address() +
                                    tic_index * sizeof(Texture::TICEntry)};
-
     Texture::TICEntry tic_entry;
     memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
-
     return tic_entry;
 }
 
 Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
     const GPUVAddr tsc_address_gpu{regs.tex_sampler.Address() +
                                    tsc_index * sizeof(Texture::TSCEntry)};
-
     Texture::TSCEntry tsc_entry;
     memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
     return tsc_entry;
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index a290d6ea7..cd8e24b0b 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -7,6 +7,7 @@
 #include "common/microprofile.h"
 #include "common/settings.h"
 #include "core/core.h"
+#include "core/memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/memory_manager.h"
@@ -130,11 +131,12 @@ void MaxwellDMA::Launch() {
                 UNIMPLEMENTED_IF(regs.offset_out % 16 != 0);
                 read_buffer.resize_destructive(16);
                 for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
-                    memory_manager.ReadBlock(
-                        convert_linear_2_blocklinear_addr(regs.offset_in + offset),
-                        read_buffer.data(), read_buffer.size());
-                    memory_manager.WriteBlockCached(regs.offset_out + offset, read_buffer.data(),
-                                                    read_buffer.size());
+                    Core::Memory::GpuGuestMemoryScoped<
+                        u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite>
+                        tmp_write_buffer(memory_manager,
+                                         convert_linear_2_blocklinear_addr(regs.offset_in + offset),
+                                         16, &read_buffer);
+                    tmp_write_buffer.SetAddressAndSize(regs.offset_out + offset, 16);
                 }
             } else if (is_src_pitch && !is_dst_pitch) {
                 UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
@@ -142,20 +144,19 @@ void MaxwellDMA::Launch() {
                 UNIMPLEMENTED_IF(regs.offset_out % 16 != 0);
                 read_buffer.resize_destructive(16);
                 for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
-                    memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(),
-                                             read_buffer.size());
-                    memory_manager.WriteBlockCached(
-                        convert_linear_2_blocklinear_addr(regs.offset_out + offset),
-                        read_buffer.data(), read_buffer.size());
+                    Core::Memory::GpuGuestMemoryScoped<
+                        u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite>
+                        tmp_write_buffer(memory_manager, regs.offset_in + offset, 16, &read_buffer);
+                    tmp_write_buffer.SetAddressAndSize(
+                        convert_linear_2_blocklinear_addr(regs.offset_out + offset), 16);
                 }
             } else {
                 if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
-                    read_buffer.resize_destructive(regs.line_length_in);
-                    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(),
-                                             regs.line_length_in,
-                                             VideoCommon::CacheType::NoBufferCache);
-                    memory_manager.WriteBlockCached(regs.offset_out, read_buffer.data(),
-                                                    regs.line_length_in);
+                    Core::Memory::GpuGuestMemoryScoped<
+                        u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite>
+                        tmp_write_buffer(memory_manager, regs.offset_in, regs.line_length_in,
+                                         &read_buffer);
+                    tmp_write_buffer.SetAddressAndSize(regs.offset_out, regs.line_length_in);
                 }
             }
         }
@@ -174,8 +175,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
     src_operand.address = regs.offset_in;
 
     DMA::BufferOperand dst_operand;
-    u32 abs_pitch_out = std::abs(static_cast<s32>(regs.pitch_out));
-    dst_operand.pitch = abs_pitch_out;
+    dst_operand.pitch = static_cast<u32>(std::abs(regs.pitch_out));
     dst_operand.width = regs.line_length_in;
     dst_operand.height = regs.line_count;
     dst_operand.address = regs.offset_out;
@@ -222,18 +222,16 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
     const size_t src_size =
         CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
 
-    const size_t dst_size = static_cast<size_t>(abs_pitch_out) * regs.line_count;
-    read_buffer.resize_destructive(src_size);
-    write_buffer.resize_destructive(dst_size);
+    const size_t dst_size = static_cast<size_t>(dst_operand.pitch) * regs.line_count;
 
-    memory_manager.ReadBlock(src_operand.address, read_buffer.data(), src_size);
-    memory_manager.ReadBlock(dst_operand.address, write_buffer.data(), dst_size);
+    Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::SafeRead> tmp_read_buffer(
+        memory_manager, src_operand.address, src_size, &read_buffer);
+    Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite>
+        tmp_write_buffer(memory_manager, dst_operand.address, dst_size, &write_buffer);
 
-    UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
-                     src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
-                     abs_pitch_out);
-
-    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
+    UnswizzleSubrect(tmp_write_buffer, tmp_read_buffer, bytes_per_pixel, width, height, depth,
+                     x_offset, src_params.origin.y, x_elements, regs.line_count, block_height,
+                     block_depth, dst_operand.pitch);
 }
 
 void MaxwellDMA::CopyPitchToBlockLinear() {
@@ -288,18 +286,17 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
         CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
     const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
 
-    read_buffer.resize_destructive(src_size);
-    write_buffer.resize_destructive(dst_size);
-
-    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
-    memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
-
-    // If the input is linear and the output is tiled, swizzle the input and copy it over.
-    SwizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
-                   dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
-                   regs.pitch_in);
-
-    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
+    GPUVAddr src_addr = regs.offset_in;
+    GPUVAddr dst_addr = regs.offset_out;
+    Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::SafeRead> tmp_read_buffer(
+        memory_manager, src_addr, src_size, &read_buffer);
+    Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite>
+        tmp_write_buffer(memory_manager, dst_addr, dst_size, &write_buffer);
+
+    // If the input is linear and the output is tiled, swizzle the input and copy it over.
+    SwizzleSubrect(tmp_write_buffer, tmp_read_buffer, bytes_per_pixel, width, height, depth,
+                   x_offset, dst_params.origin.y, x_elements, regs.line_count, block_height,
+                   block_depth, regs.pitch_in);
 }
 
 void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@@ -343,23 +340,20 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
     const u32 pitch = x_elements * bytes_per_pixel;
     const size_t mid_buffer_size = pitch * regs.line_count;
 
-    read_buffer.resize_destructive(src_size);
-    write_buffer.resize_destructive(dst_size);
-
     intermediate_buffer.resize_destructive(mid_buffer_size);
 
-    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
-    memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+    Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::SafeRead> tmp_read_buffer(
+        memory_manager, regs.offset_in, src_size, &read_buffer);
+    Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadCachedWrite>
+        tmp_write_buffer(memory_manager, regs.offset_out, dst_size, &write_buffer);
 
-    UnswizzleSubrect(intermediate_buffer, read_buffer, bytes_per_pixel, src_width, src.height,
+    UnswizzleSubrect(intermediate_buffer, tmp_read_buffer, bytes_per_pixel, src_width, src.height,
                      src.depth, src_x_offset, src.origin.y, x_elements, regs.line_count,
                      src.block_size.height, src.block_size.depth, pitch);
 
-    SwizzleSubrect(write_buffer, intermediate_buffer, bytes_per_pixel, dst_width, dst.height,
+    SwizzleSubrect(tmp_write_buffer, intermediate_buffer, bytes_per_pixel, dst_width, dst.height,
                    dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
                    dst.block_size.height, dst.block_size.depth, pitch);
-
-    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::ReleaseSemaphore() {
diff --git a/src/video_core/engines/sw_blitter/blitter.cpp b/src/video_core/engines/sw_blitter/blitter.cpp
index ff88cd03d..3a599f466 100644
--- a/src/video_core/engines/sw_blitter/blitter.cpp
+++ b/src/video_core/engines/sw_blitter/blitter.cpp
@@ -159,11 +159,11 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
     const auto src_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
     const auto dst_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(dst.format));
     const size_t src_size = get_surface_size(src, src_bytes_per_pixel);
-    impl->tmp_buffer.resize_destructive(src_size);
-    memory_manager.ReadBlock(src.Address(), impl->tmp_buffer.data(), src_size);
 
-    const size_t src_copy_size = src_extent_x * src_extent_y * src_bytes_per_pixel;
+    Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::SafeRead> tmp_buffer(
+        memory_manager, src.Address(), src_size, &impl->tmp_buffer);
 
+    const size_t src_copy_size = src_extent_x * src_extent_y * src_bytes_per_pixel;
     const size_t dst_copy_size = dst_extent_x * dst_extent_y * dst_bytes_per_pixel;
 
     impl->src_buffer.resize_destructive(src_copy_size);
@@ -200,12 +200,11 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
 
     impl->dst_buffer.resize_destructive(dst_copy_size);
     if (src.linear == Fermi2D::MemoryLayout::BlockLinear) {
-        UnswizzleSubrect(impl->src_buffer, impl->tmp_buffer, src_bytes_per_pixel, src.width,
-                         src.height, src.depth, config.src_x0, config.src_y0, src_extent_x,
-                         src_extent_y, src.block_height, src.block_depth,
-                         src_extent_x * src_bytes_per_pixel);
+        UnswizzleSubrect(impl->src_buffer, tmp_buffer, src_bytes_per_pixel, src.width, src.height,
+                         src.depth, config.src_x0, config.src_y0, src_extent_x, src_extent_y,
+                         src.block_height, src.block_depth, src_extent_x * src_bytes_per_pixel);
     } else {
-        process_pitch_linear(false, impl->tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
+        process_pitch_linear(false, tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
                              src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel);
     }
 
@@ -221,20 +220,18 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
     }
 
     const size_t dst_size = get_surface_size(dst, dst_bytes_per_pixel);
-    impl->tmp_buffer.resize_destructive(dst_size);
-    memory_manager.ReadBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
+    Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeReadWrite>
+        tmp_buffer2(memory_manager, dst.Address(), dst_size, &impl->tmp_buffer);
 
     if (dst.linear == Fermi2D::MemoryLayout::BlockLinear) {
-        SwizzleSubrect(impl->tmp_buffer, impl->dst_buffer, dst_bytes_per_pixel, dst.width,
-                       dst.height, dst.depth, config.dst_x0, config.dst_y0, dst_extent_x,
-                       dst_extent_y, dst.block_height, dst.block_depth,
-                       dst_extent_x * dst_bytes_per_pixel);
+        SwizzleSubrect(tmp_buffer2, impl->dst_buffer, dst_bytes_per_pixel, dst.width, dst.height,
+                       dst.depth, config.dst_x0, config.dst_y0, dst_extent_x, dst_extent_y,
+                       dst.block_height, dst.block_depth, dst_extent_x * dst_bytes_per_pixel);
     } else {
-        process_pitch_linear(true, impl->dst_buffer, impl->tmp_buffer, dst_extent_x, dst_extent_y,
+        process_pitch_linear(true, impl->dst_buffer, tmp_buffer2, dst_extent_x, dst_extent_y,
                              dst.pitch, config.dst_x0, config.dst_y0,
                              static_cast<size_t>(dst_bytes_per_pixel));
     }
-    memory_manager.WriteBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
     return true;
 }
 
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h
index 35d699bbf..ab20ff30f 100644
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -69,7 +69,6 @@ public:
     }
 
     void SignalFence(std::function<void()>&& func) {
-        rasterizer.InvalidateGPUCache();
         bool delay_fence = Settings::IsGPULevelHigh();
         if constexpr (!can_async_check) {
             TryReleasePendingFences<false>();
@@ -96,6 +95,7 @@ public:
             guard.unlock();
             cv.notify_all();
         }
+        rasterizer.InvalidateGPUCache();
     }
 
     void SignalSyncPoint(u32 value) {
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index db385076d..c192e33b2 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -95,7 +95,9 @@ struct GPU::Impl {
 
     /// Synchronizes CPU writes with Host GPU memory.
     void InvalidateGPUCache() {
-        rasterizer->InvalidateGPUCache();
+        std::function<void(VAddr, size_t)> callback_writes(
+            [this](VAddr address, size_t size) { rasterizer->OnCacheInvalidation(address, size); });
+        system.GatherGPUDirtyMemory(callback_writes);
     }
 
     /// Signal the ending of command list.
@@ -299,6 +301,10 @@ struct GPU::Impl {
         gpu_thread.InvalidateRegion(addr, size);
     }
 
+    bool OnCPUWrite(VAddr addr, u64 size) {
+        return rasterizer->OnCPUWrite(addr, size);
+    }
+
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
     void FlushAndInvalidateRegion(VAddr addr, u64 size) {
         gpu_thread.FlushAndInvalidateRegion(addr, size);
@@ -561,6 +567,10 @@ void GPU::InvalidateRegion(VAddr addr, u64 size) {
     impl->InvalidateRegion(addr, size);
 }
 
+bool GPU::OnCPUWrite(VAddr addr, u64 size) {
+    return impl->OnCPUWrite(addr, size);
+}
+
 void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     impl->FlushAndInvalidateRegion(addr, size);
 }
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index e49c40cf2..ba2838b89 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -250,6 +250,10 @@ public:
     /// Notify rasterizer that any caches of the specified region should be invalidated
     void InvalidateRegion(VAddr addr, u64 size);
 
+    /// Notify the rasterizer that the CPU is trying to write this area. Returns true if the area
+    /// is sensitive, false otherwise.
+    bool OnCPUWrite(VAddr addr, u64 size);
+
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
     void FlushAndInvalidateRegion(VAddr addr, u64 size);
 
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 889144f38..2f0f9f593 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -47,7 +47,7 @@ static void RunThread(std::stop_token stop_token, Core::System& system,
         } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
             rasterizer->FlushRegion(flush->addr, flush->size);
         } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
-            rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
+            rasterizer->OnCacheInvalidation(invalidate->addr, invalidate->size);
         } else {
             ASSERT(false);
         }
@@ -102,12 +102,12 @@ void ThreadManager::TickGPU() {
 }
 
 void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
-    rasterizer->OnCPUWrite(addr, size);
+    rasterizer->OnCacheInvalidation(addr, size);
 }
 
 void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important
-    rasterizer->OnCPUWrite(addr, size);
+    rasterizer->OnCacheInvalidation(addr, size);
 }
 
 u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
diff --git a/src/video_core/host1x/codecs/codec.cpp b/src/video_core/host1x/codecs/codec.cpp
index cd6a3a9b8..da07a556f 100644
--- a/src/video_core/host1x/codecs/codec.cpp
+++ b/src/video_core/host1x/codecs/codec.cpp
@@ -290,7 +290,7 @@ void Codec::Decode() {
             return vp9_decoder->GetFrameBytes();
         default:
             ASSERT(false);
-            return std::vector<u8>{};
+            return std::span<const u8>{};
         }
     }();
     AVPacketPtr packet{av_packet_alloc(), AVPacketDeleter};
diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp
index ce827eb6c..862904e39 100644
--- a/src/video_core/host1x/codecs/h264.cpp
+++ b/src/video_core/host1x/codecs/h264.cpp
@@ -29,15 +29,15 @@ H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {}
 
 H264::~H264() = default;
 
-const std::vector<u8>& H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
-                                          bool is_first_frame) {
+std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
+                                       bool is_first_frame) {
     H264DecoderContext context;
     host1x.MemoryManager().ReadBlock(state.picture_info_offset, &context,
                                      sizeof(H264DecoderContext));
 
     const s64 frame_number = context.h264_parameter_set.frame_number.Value();
     if (!is_first_frame && frame_number != 0) {
-        frame.resize(context.stream_len);
+        frame.resize_destructive(context.stream_len);
         host1x.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
         return frame;
     }
@@ -135,14 +135,14 @@ const std::vector<u8>& H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegist
     for (s32 index = 0; index < 6; index++) {
         writer.WriteBit(true);
         std::span<const u8> matrix{context.weight_scale};
-        writer.WriteScalingList(matrix, index * 16, 16);
+        writer.WriteScalingList(scan, matrix, index * 16, 16);
     }
 
     if (context.h264_parameter_set.transform_8x8_mode_flag) {
         for (s32 index = 0; index < 2; index++) {
             writer.WriteBit(true);
             std::span<const u8> matrix{context.weight_scale_8x8};
-            writer.WriteScalingList(matrix, index * 64, 64);
+            writer.WriteScalingList(scan, matrix, index * 64, 64);
         }
     }
 
@@ -188,8 +188,8 @@ void H264BitWriter::WriteBit(bool state) {
     WriteBits(state ? 1 : 0, 1);
 }
 
-void H264BitWriter::WriteScalingList(std::span<const u8> list, s32 start, s32 count) {
-    static Common::ScratchBuffer<u8> scan{};
+void H264BitWriter::WriteScalingList(Common::ScratchBuffer<u8>& scan, std::span<const u8> list,
+                                     s32 start, s32 count) {
     scan.resize_destructive(count);
     if (count == 16) {
         std::memcpy(scan.data(), zig_zag_scan.data(), scan.size());
diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h
index 5cc86454e..d6b556322 100644
--- a/src/video_core/host1x/codecs/h264.h
+++ b/src/video_core/host1x/codecs/h264.h
@@ -5,9 +5,11 @@
 
 #include <span>
 #include <vector>
+
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 #include "video_core/host1x/nvdec_common.h"
 
 namespace Tegra {
@@ -37,7 +39,8 @@ public:
 
     /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification
     /// Writes the scaling matrices of the sream
-    void WriteScalingList(std::span<const u8> list, s32 start, s32 count);
+    void WriteScalingList(Common::ScratchBuffer<u8>& scan, std::span<const u8> list, s32 start,
+                          s32 count);
 
     /// Return the bitstream as a vector.
     [[nodiscard]] std::vector<u8>& GetByteArray();
@@ -63,11 +66,12 @@ public:
     ~H264();
 
     /// Compose the H264 frame for FFmpeg decoding
-    [[nodiscard]] const std::vector<u8>& ComposeFrame(
-        const Host1x::NvdecCommon::NvdecRegisters& state, bool is_first_frame = false);
+    [[nodiscard]] std::span<const u8> ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
+                                                   bool is_first_frame = false);
 
 private:
-    std::vector<u8> frame;
+    Common::ScratchBuffer<u8> frame;
+    Common::ScratchBuffer<u8> scan;
     Host1x::Host1x& host1x;
 
     struct H264ParameterSet {
diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp
index 28fb12cb8..ee6392ff9 100644
--- a/src/video_core/host1x/codecs/vp8.cpp
+++ b/src/video_core/host1x/codecs/vp8.cpp
@@ -12,7 +12,7 @@ VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {}
 
 VP8::~VP8() = default;
 
-const std::vector<u8>& VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
+std::span<const u8> VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
     VP8PictureInfo info;
     host1x.MemoryManager().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo));
 
diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h
index 5bf07ecab..7926b73f3 100644
--- a/src/video_core/host1x/codecs/vp8.h
+++ b/src/video_core/host1x/codecs/vp8.h
@@ -4,10 +4,11 @@
 #pragma once
 
 #include <array>
-#include <vector>
+#include <span>
 
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 #include "video_core/host1x/nvdec_common.h"
 
 namespace Tegra {
@@ -24,11 +25,11 @@ public:
     ~VP8();
 
     /// Compose the VP8 frame for FFmpeg decoding
-    [[nodiscard]] const std::vector<u8>& ComposeFrame(
+    [[nodiscard]] std::span<const u8> ComposeFrame(
         const Host1x::NvdecCommon::NvdecRegisters& state);
 
 private:
-    std::vector<u8> frame;
+    Common::ScratchBuffer<u8> frame;
     Host1x::Host1x& host1x;
 
     struct VP8PictureInfo {
diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp
index cf40c9012..306c3d0e8 100644
--- a/src/video_core/host1x/codecs/vp9.cpp
+++ b/src/video_core/host1x/codecs/vp9.cpp
@@ -3,6 +3,7 @@
 
 #include <algorithm> // for std::copy
 #include <numeric>
+
 #include "common/assert.h"
 #include "video_core/host1x/codecs/vp9.h"
 #include "video_core/host1x/host1x.h"
diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h
index d4083e8d3..f1ed19508 100644
--- a/src/video_core/host1x/codecs/vp9.h
+++ b/src/video_core/host1x/codecs/vp9.h
@@ -4,9 +4,11 @@
 #pragma once
 
 #include <array>
+#include <span>
 #include <vector>
 
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 #include "common/stream.h"
 #include "video_core/host1x/codecs/vp9_types.h"
 #include "video_core/host1x/nvdec_common.h"
@@ -128,8 +130,8 @@ public:
         return !current_frame_info.show_frame;
     }
 
-    /// Returns a const reference to the composed frame data.
-    [[nodiscard]] const std::vector<u8>& GetFrameBytes() const {
+    /// Returns a const span to the composed frame data.
+    [[nodiscard]] std::span<const u8> GetFrameBytes() const {
         return frame;
     }
 
@@ -181,7 +183,7 @@ private:
     [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
 
     Host1x::Host1x& host1x;
-    std::vector<u8> frame;
+    Common::ScratchBuffer<u8> frame;
 
     std::array<s8, 4> loop_filter_ref_deltas{};
     std::array<s8, 2> loop_filter_mode_deltas{};
diff --git a/src/video_core/host1x/codecs/vp9_types.h b/src/video_core/host1x/codecs/vp9_types.h
index adad8ed7e..cc9b25690 100644
--- a/src/video_core/host1x/codecs/vp9_types.h
+++ b/src/video_core/host1x/codecs/vp9_types.h
@@ -5,6 +5,7 @@
 
 #include <array>
 #include <vector>
+
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 45141e488..d16040613 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -10,13 +10,13 @@
 #include "core/device_memory.h"
 #include "core/hle/kernel/k_page_table.h"
 #include "core/hle/kernel/k_process.h"
-#include "core/memory.h"
 #include "video_core/invalidation_accumulator.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
 
 namespace Tegra {
+using Core::Memory::GuestMemoryFlags;
 
 std::atomic<size_t> MemoryManager::unique_identifier_generator{};
 
@@ -587,13 +587,10 @@ void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size,
 
 void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size,
                               VideoCommon::CacheType which) {
-    tmp_buffer.resize_destructive(size);
-    ReadBlock(gpu_src_addr, tmp_buffer.data(), size, which);
-
-    // The output block must be flushed in case it has data modified from the GPU.
-    // Fixes NPC geometry in Zombie Panic in Wonderland DX
+    Core::Memory::GpuGuestMemoryScoped<u8, GuestMemoryFlags::SafeReadWrite> data(
+        *this, gpu_src_addr, size);
+    data.SetAddressAndSize(gpu_dest_addr, size);
     FlushRegion(gpu_dest_addr, size, which);
-    WriteBlock(gpu_dest_addr, tmp_buffer.data(), size, which);
 }
 
 bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const {
@@ -758,4 +755,23 @@ void MemoryManager::FlushCaching() {
     accumulator->Clear();
 }
 
+// Both overloads return nullptr unless IsContinuousRange accepts [src_addr, src_addr + size)
+// and GpuToCpuAddress yields a CPU address; otherwise they forward to memory.GetSpan.
+// The const overload previously skipped the range check that the mutable one performs.
+const u8* MemoryManager::GetSpan(const GPUVAddr src_addr, const std::size_t size) const {
+    if (!IsContinuousRange(src_addr, size)) {
+        return nullptr;
+    }
+    const auto cpu_addr = GpuToCpuAddress(src_addr);
+    return cpu_addr ? memory.GetSpan(*cpu_addr, size) : nullptr;
+}
+
+u8* MemoryManager::GetSpan(const GPUVAddr src_addr, const std::size_t size) {
+    if (!IsContinuousRange(src_addr, size)) {
+        return nullptr;
+    }
+    const auto cpu_addr = GpuToCpuAddress(src_addr);
+    return cpu_addr ? memory.GetSpan(*cpu_addr, size) : nullptr;
+}
+
 } // namespace Tegra
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 4202c26ff..9b311b9e5 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -15,6 +15,7 @@
 #include "common/range_map.h"
 #include "common/scratch_buffer.h"
 #include "common/virtual_buffer.h"
+#include "core/memory.h"
 #include "video_core/cache_types.h"
 #include "video_core/pte_kind.h"
 
@@ -62,6 +63,20 @@ public:
     [[nodiscard]] u8* GetPointer(GPUVAddr addr);
     [[nodiscard]] const u8* GetPointer(GPUVAddr addr) const;
 
+    /// Typed host pointer for `addr`; nullptr when GpuToCpuAddress yields no CPU address.
+    template <typename T>
+    [[nodiscard]] T* GetPointer(GPUVAddr addr) {
+        const auto address{GpuToCpuAddress(addr)};
+        return address ? reinterpret_cast<T*>(memory.GetPointer(*address)) : nullptr;
+    }
+
+    /// Read-only counterpart of the typed lookup; follows the same nullptr rule.
+    template <typename T>
+    [[nodiscard]] const T* GetPointer(GPUVAddr addr) const {
+        const auto address{GpuToCpuAddress(addr)};
+        return address ? reinterpret_cast<const T*>(memory.GetPointer(*address)) : nullptr;
+    }
+
     /**
      * ReadBlock and WriteBlock are full read and write operations over virtual
      * GPU Memory. It's important to use these when GPU memory may not be continuous
@@ -139,6 +154,9 @@ public:
 
     void FlushCaching();
 
+    const u8* GetSpan(const GPUVAddr src_addr, const std::size_t size) const;
+    u8* GetSpan(const GPUVAddr src_addr, const std::size_t size);
+
 private:
     template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
     inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 7566a8c4e..cb8029a4f 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -109,7 +109,9 @@ public:
     }
 
     /// Notify rasterizer that any caches of the specified region are desync with guest
-    virtual void OnCPUWrite(VAddr addr, u64 size) = 0;
+    virtual void OnCacheInvalidation(VAddr addr, u64 size) = 0;
+
+    virtual bool OnCPUWrite(VAddr addr, u64 size) = 0;
 
     /// Sync memory between guest and host.
     virtual void InvalidateGPUCache() = 0;
diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp
index bf2ce4c49..92ecf6682 100644
--- a/src/video_core/renderer_null/null_rasterizer.cpp
+++ b/src/video_core/renderer_null/null_rasterizer.cpp
@@ -47,7 +47,10 @@ bool RasterizerNull::MustFlushRegion(VAddr addr, u64 size, VideoCommon::CacheTyp
     return false;
 }
 void RasterizerNull::InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {}
-void RasterizerNull::OnCPUWrite(VAddr addr, u64 size) {}
+bool RasterizerNull::OnCPUWrite(VAddr addr, u64 size) {
+    return false;
+}
+void RasterizerNull::OnCacheInvalidation(VAddr addr, u64 size) {}
 VideoCore::RasterizerDownloadArea RasterizerNull::GetFlushArea(VAddr addr, u64 size) {
     VideoCore::RasterizerDownloadArea new_area{
         .start_address = Common::AlignDown(addr, Core::Memory::YUZU_PAGESIZE),
diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h
index a8d35d2c1..93b9a6971 100644
--- a/src/video_core/renderer_null/null_rasterizer.h
+++ b/src/video_core/renderer_null/null_rasterizer.h
@@ -53,7 +53,8 @@ public:
                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
     void InvalidateRegion(VAddr addr, u64 size,
                           VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
-    void OnCPUWrite(VAddr addr, u64 size) override;
+    void OnCacheInvalidation(VAddr addr, u64 size) override;
+    bool OnCPUWrite(VAddr addr, u64 size) override;
     VideoCore::RasterizerDownloadArea GetFlushArea(VAddr addr, u64 size) override;
     void InvalidateGPUCache() override;
     void UnmapMemory(VAddr addr, u64 size) override;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index edf527f2d..aadd6967c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -485,12 +485,33 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
     }
 }
 
-void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
+/// Forwards a CPU write to the caches. Returns true as soon as buffer_cache.OnCPUWrite returns
+/// true, leaving the texture and shader caches untouched; returns false in every other case.
+bool RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
+    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
+    if (addr == 0 || size == 0) {
+        return false;
+    }
+    const bool buffer_claimed = [&] {
+        std::scoped_lock lock{buffer_cache.mutex};
+        return buffer_cache.OnCPUWrite(addr, size);
+    }();
+    if (buffer_claimed) {
+        return true;
+    }
+    {
+        std::scoped_lock lock{texture_cache.mutex};
+        texture_cache.WriteMemory(addr, size);
+    }
+    shader_cache.InvalidateRegion(addr, size);
+    return false;
+}
+
+void RasterizerOpenGL::OnCacheInvalidation(VAddr addr, u64 size) {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
     if (addr == 0 || size == 0) {
         return;
     }
-    shader_cache.OnCPUWrite(addr, size);
     {
         std::scoped_lock lock{texture_cache.mutex};
         texture_cache.WriteMemory(addr, size);
@@ -499,15 +520,11 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
         std::scoped_lock lock{buffer_cache.mutex};
         buffer_cache.CachedWriteMemory(addr, size);
     }
+    shader_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerOpenGL::InvalidateGPUCache() {
-    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-    shader_cache.SyncGuestHost();
-    {
-        std::scoped_lock lock{buffer_cache.mutex};
-        buffer_cache.FlushCachedWrites();
-    }
+    gpu.InvalidateGPUCache();
 }
 
 void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) {
@@ -519,7 +536,7 @@ void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) {
         std::scoped_lock lock{buffer_cache.mutex};
         buffer_cache.WriteMemory(addr, size);
     }
-    shader_cache.OnCPUWrite(addr, size);
+    shader_cache.OnCacheInvalidation(addr, size);
 }
 
 void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index a73ad15c1..8eda2ddba 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -98,7 +98,8 @@ public:
     VideoCore::RasterizerDownloadArea GetFlushArea(VAddr addr, u64 size) override;
     void InvalidateRegion(VAddr addr, u64 size,
                           VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
-    void OnCPUWrite(VAddr addr, u64 size) override;
+    void OnCacheInvalidation(VAddr addr, u64 size) override;
+    bool OnCPUWrite(VAddr addr, u64 size) override;
     void InvalidateGPUCache() override;
     void UnmapMemory(VAddr addr, u64 size) override;
     void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index b72f95235..51df18ec3 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -591,7 +591,7 @@ void BufferCacheRuntime::ReserveNullBuffer() {
         .flags = 0,
         .size = 4,
         .usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
-                 VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+                 VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT,
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
         .queueFamilyIndexCount = 0,
         .pQueueFamilyIndices = nullptr,
@@ -599,7 +599,6 @@ void BufferCacheRuntime::ReserveNullBuffer() {
     if (device.IsExtTransformFeedbackSupported()) {
         create_info.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT;
     }
-    create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
     null_buffer = memory_allocator.CreateBuffer(create_info, MemoryUsage::DeviceLocal);
     if (device.HasDebuggingToolAttached()) {
         null_buffer.SetObjectNameEXT("Null buffer");
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index f7c0d939a..456bb040e 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -566,11 +566,32 @@ void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::s
     }
 }
 
-void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
+/// Forwards a CPU write to the caches. Returns true as soon as buffer_cache.OnCPUWrite returns
+/// true, leaving the texture and pipeline caches untouched; returns false in every other case.
+bool RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
+    if (addr == 0 || size == 0) {
+        return false;
+    }
+    const bool buffer_claimed = [&] {
+        std::scoped_lock lock{buffer_cache.mutex};
+        return buffer_cache.OnCPUWrite(addr, size);
+    }();
+    if (buffer_claimed) {
+        return true;
+    }
+    {
+        std::scoped_lock lock{texture_cache.mutex};
+        texture_cache.WriteMemory(addr, size);
+    }
+    pipeline_cache.InvalidateRegion(addr, size);
+    return false;
+}
+
+void RasterizerVulkan::OnCacheInvalidation(VAddr addr, u64 size) {
     if (addr == 0 || size == 0) {
         return;
     }
-    pipeline_cache.OnCPUWrite(addr, size);
+
     {
         std::scoped_lock lock{texture_cache.mutex};
         texture_cache.WriteMemory(addr, size);
@@ -579,14 +600,11 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
         std::scoped_lock lock{buffer_cache.mutex};
         buffer_cache.CachedWriteMemory(addr, size);
     }
+    pipeline_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerVulkan::InvalidateGPUCache() {
-    pipeline_cache.SyncGuestHost();
-    {
-        std::scoped_lock lock{buffer_cache.mutex};
-        buffer_cache.FlushCachedWrites();
-    }
+    gpu.InvalidateGPUCache();
 }
 
 void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) {
@@ -598,7 +616,7 @@ void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) {
         std::scoped_lock lock{buffer_cache.mutex};
         buffer_cache.WriteMemory(addr, size);
     }
-    pipeline_cache.OnCPUWrite(addr, size);
+    pipeline_cache.OnCacheInvalidation(addr, size);
 }
 
 void RasterizerVulkan::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index b39710b3c..73257d964 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -96,7 +96,8 @@ public:
     void InvalidateRegion(VAddr addr, u64 size,
                           VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
     void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;
-    void OnCPUWrite(VAddr addr, u64 size) override;
+    void OnCacheInvalidation(VAddr addr, u64 size) override;
+    bool OnCPUWrite(VAddr addr, u64 size) override;
     void InvalidateGPUCache() override;
     void UnmapMemory(VAddr addr, u64 size) override;
     void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override;
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 8385b5509..3aac3cfab 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -36,8 +36,10 @@ using VideoCommon::ImageFlagBits;
 using VideoCommon::ImageInfo;
 using VideoCommon::ImageType;
 using VideoCommon::SubresourceRange;
+using VideoCore::Surface::BytesPerBlock;
 using VideoCore::Surface::IsPixelFormatASTC;
 using VideoCore::Surface::IsPixelFormatInteger;
+using VideoCore::Surface::SurfaceType;
 
 namespace {
 constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
@@ -130,7 +132,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
 [[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info) {
     const PixelFormat format = StorageFormat(info.format);
     const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format);
-    VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
+    VkImageCreateFlags flags{};
     if (info.type == ImageType::e2D && info.resources.layers >= 6 &&
         info.size.width == info.size.height && !device.HasBrokenCubeImageCompability()) {
         flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
@@ -163,11 +165,24 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
 }
 
 [[nodiscard]] vk::Image MakeImage(const Device& device, const MemoryAllocator& allocator,
-                                  const ImageInfo& info) {
+                                  const ImageInfo& info, std::span<const VkFormat> view_formats) {
     if (info.type == ImageType::Buffer) {
         return vk::Image{};
     }
-    return allocator.CreateImage(MakeImageCreateInfo(device, info));
+    VkImageCreateInfo image_ci = MakeImageCreateInfo(device, info);
+    const VkImageFormatListCreateInfo image_format_list = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO,
+        .pNext = nullptr,
+        .viewFormatCount = static_cast<u32>(view_formats.size()),
+        .pViewFormats = view_formats.data(),
+    };
+    if (view_formats.size() > 1) {
+        image_ci.flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
+        if (device.IsKhrImageFormatListSupported()) {
+            image_ci.pNext = &image_format_list;
+        }
+    }
+    return allocator.CreateImage(image_ci);
 }
 
 [[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) {
@@ -806,6 +821,23 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
         astc_decoder_pass.emplace(device, scheduler, descriptor_pool, staging_buffer_pool,
                                   compute_pass_descriptor_queue, memory_allocator);
     }
+    if (!device.IsKhrImageFormatListSupported()) {
+        return;
+    }
+    for (size_t index_a = 0; index_a < VideoCore::Surface::MaxPixelFormat; index_a++) {
+        const auto image_format = static_cast<PixelFormat>(index_a);
+        if (IsPixelFormatASTC(image_format) && !device.IsOptimalAstcSupported()) {
+            view_formats[index_a].push_back(VK_FORMAT_A8B8G8R8_UNORM_PACK32);
+        }
+        for (size_t index_b = 0; index_b < VideoCore::Surface::MaxPixelFormat; index_b++) {
+            const auto view_format = static_cast<PixelFormat>(index_b);
+            if (VideoCore::Surface::IsViewCompatible(image_format, view_format, false, true)) {
+                const auto view_info =
+                    MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, true, view_format);
+                view_formats[index_a].push_back(view_info.format);
+            }
+        }
+    }
 }
 
 void TextureCacheRuntime::Finish() {
@@ -1265,8 +1297,8 @@ void TextureCacheRuntime::TickFrame() {}
 Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_,
              VAddr cpu_addr_)
     : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime_.scheduler},
-      runtime{&runtime_},
-      original_image(MakeImage(runtime_.device, runtime_.memory_allocator, info)),
+      runtime{&runtime_}, original_image(MakeImage(runtime_.device, runtime_.memory_allocator, info,
+                                                   runtime->ViewFormats(info.format))),
       aspect_mask(ImageAspectMask(info.format)) {
     if (IsPixelFormatASTC(info.format) && !runtime->device.IsOptimalAstcSupported()) {
         if (Settings::values.async_astc.GetValue()) {
@@ -1471,7 +1503,8 @@ bool Image::ScaleUp(bool ignore) {
         auto scaled_info = info;
         scaled_info.size.width = scaled_width;
         scaled_info.size.height = scaled_height;
-        scaled_image = MakeImage(runtime->device, runtime->memory_allocator, scaled_info);
+        scaled_image = MakeImage(runtime->device, runtime->memory_allocator, scaled_info,
+                                 runtime->ViewFormats(info.format));
         ignore = false;
     }
     current_image = *scaled_image;
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 220943116..6621210ea 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -103,6 +103,10 @@ public:
 
     [[nodiscard]] VkBuffer GetTemporaryBuffer(size_t needed_size);
 
+    std::span<const VkFormat> ViewFormats(PixelFormat format) {
+        return view_formats[static_cast<std::size_t>(format)];
+    }
+
     void BarrierFeedbackLoop();
 
     const Device& device;
@@ -113,6 +117,7 @@ public:
     RenderPassCache& render_pass_cache;
     std::optional<ASTCDecoderPass> astc_decoder_pass;
     const Settings::ResolutionScalingInfo& resolution;
+    std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
 
     static constexpr size_t indexing_slots = 8 * sizeof(size_t);
     std::array<vk::Buffer, indexing_slots> buffers{};
diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp
index 4db948b6d..01701201d 100644
--- a/src/video_core/shader_cache.cpp
+++ b/src/video_core/shader_cache.cpp
@@ -24,7 +24,7 @@ void ShaderCache::InvalidateRegion(VAddr addr, size_t size) {
     RemovePendingShaders();
 }
 
-void ShaderCache::OnCPUWrite(VAddr addr, size_t size) {
+void ShaderCache::OnCacheInvalidation(VAddr addr, size_t size) {
     std::scoped_lock lock{invalidation_mutex};
     InvalidatePagesInRegion(addr, size);
 }
diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h
index f3cc4c70b..de8e08002 100644
--- a/src/video_core/shader_cache.h
+++ b/src/video_core/shader_cache.h
@@ -62,7 +62,7 @@ public:
     /// @brief Unmarks a memory region as cached and marks it for removal
     /// @param addr Start address of the CPU write operation
     /// @param size Number of bytes of the CPU write operation
-    void OnCPUWrite(VAddr addr, size_t size);
+    void OnCacheInvalidation(VAddr addr, size_t size);
 
     /// @brief Flushes delayed removal operations
     void SyncGuestHost();
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 8190f3ba1..4457b366f 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -8,6 +8,7 @@
 
 #include "common/alignment.h"
 #include "common/settings.h"
+#include "core/memory.h"
 #include "video_core/control/channel_state.h"
 #include "video_core/dirty_flags.h"
 #include "video_core/engines/kepler_compute.h"
@@ -598,6 +599,10 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
                             [&](ImageId id, Image&) { deleted_images.push_back(id); });
     for (const ImageId id : deleted_images) {
         Image& image = slot_images[id];
+        if (True(image.flags & ImageFlagBits::CpuModified)) {
+            continue;
+        }
+        image.flags |= ImageFlagBits::CpuModified;
         if (True(image.flags & ImageFlagBits::Remapped)) {
             continue;
         }
@@ -865,11 +870,15 @@ void TextureCache<P>::PopAsyncFlushes() {
 template <class P>
 ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, bool is_upload) {
     const ImageInfo dst_info(operand);
-    const ImageId image_id = FindDMAImage(dst_info, operand.address);
-    if (!image_id) {
+    const ImageId dst_id = FindDMAImage(dst_info, operand.address);
+    if (!dst_id) {
+        return NULL_IMAGE_ID;
+    }
+    auto& image = slot_images[dst_id];
+    if (False(image.flags & ImageFlagBits::GpuModified)) {
+        // No need to waste time on an image that's synced with guest
         return NULL_IMAGE_ID;
     }
-    auto& image = slot_images[image_id];
     if (image.info.type == ImageType::e3D) {
         // Don't accelerate 3D images.
         return NULL_IMAGE_ID;
@@ -883,7 +892,7 @@ ImageId TextureCache<P>::DmaImageId(const Tegra::DMA::ImageOperand& operand, boo
     if (!base) {
         return NULL_IMAGE_ID;
     }
-    return image_id;
+    return dst_id;
 }
 
 template <class P>
@@ -1018,19 +1027,19 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging)
         runtime.AccelerateImageUpload(image, staging, uploads);
         return;
     }
-    const size_t guest_size_bytes = image.guest_size_bytes;
-    swizzle_data_buffer.resize_destructive(guest_size_bytes);
-    gpu_memory->ReadBlockUnsafe(gpu_addr, swizzle_data_buffer.data(), guest_size_bytes);
+
+    Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::UnsafeRead> swizzle_data(
+        *gpu_memory, gpu_addr, image.guest_size_bytes, &swizzle_data_buffer);
 
     if (True(image.flags & ImageFlagBits::Converted)) {
         unswizzle_data_buffer.resize_destructive(image.unswizzled_size_bytes);
-        auto copies = UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data_buffer,
-                                     unswizzle_data_buffer);
+        auto copies =
+            UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data, unswizzle_data_buffer);
         ConvertImage(unswizzle_data_buffer, image.info, mapped_span, copies);
         image.UploadMemory(staging, copies);
     } else {
         const auto copies =
-            UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data_buffer, mapped_span);
+            UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data, mapped_span);
         image.UploadMemory(staging, copies);
     }
 }
@@ -1223,11 +1232,12 @@ void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
     decode->image_id = image_id;
     async_decodes.push_back(std::move(decode));
 
-    Common::ScratchBuffer<u8> local_unswizzle_data_buffer(image.unswizzled_size_bytes);
-    const size_t guest_size_bytes = image.guest_size_bytes;
-    swizzle_data_buffer.resize_destructive(guest_size_bytes);
-    gpu_memory->ReadBlockUnsafe(image.gpu_addr, swizzle_data_buffer.data(), guest_size_bytes);
-    auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data_buffer,
+    static Common::ScratchBuffer<u8> local_unswizzle_data_buffer;
+    local_unswizzle_data_buffer.resize_destructive(image.unswizzled_size_bytes);
+    Core::Memory::GpuGuestMemory<u8, Core::Memory::GuestMemoryFlags::UnsafeRead> swizzle_data(
+        *gpu_memory, image.gpu_addr, image.guest_size_bytes, &swizzle_data_buffer);
+
+    auto copies = UnswizzleImage(*gpu_memory, image.gpu_addr, image.info, swizzle_data,
                                  local_unswizzle_data_buffer);
     const size_t out_size = MapSizeBytes(image);
 
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index a0e10643f..0453456b4 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -54,7 +54,6 @@ enum class RelaxedOptions : u32 {
     Format = 1 << 1,
     Samples = 1 << 2,
     ForceBrokenViews = 1 << 3,
-    FormatBpp = 1 << 4,
 };
 DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions)
 
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 9a618a57a..a83f5d41c 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -20,6 +20,7 @@
 #include "common/div_ceil.h"
 #include "common/scratch_buffer.h"
 #include "common/settings.h"
+#include "core/memory.h"
 #include "video_core/compatible_formats.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
@@ -544,17 +545,15 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
                        tile_size.height, info.tile_width_spacing);
     const size_t subresource_size = sizes[level];
 
-    tmp_buffer.resize_destructive(subresource_size);
-    const std::span<u8> dst(tmp_buffer);
-
     for (s32 layer = 0; layer < info.resources.layers; ++layer) {
         const std::span<const u8> src = input.subspan(host_offset);
-        gpu_memory.ReadBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes());
-
-        SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height,
-                       num_tiles.depth, block.height, block.depth);
+        {
+            Core::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::UnsafeReadWrite>
+                dst(gpu_memory, gpu_addr + guest_offset, subresource_size, &tmp_buffer);
 
-        gpu_memory.WriteBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes());
+            SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height,
+                           num_tiles.depth, block.height, block.depth);
+        }
 
         host_offset += host_bytes_per_layer;
         guest_offset += layer_stride;
@@ -837,6 +836,7 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory
     const Extent3D size = info.size;
 
     if (info.type == ImageType::Linear) {
+        ASSERT(output.size_bytes() >= guest_size_bytes);
         gpu_memory.ReadBlockUnsafe(gpu_addr, output.data(), guest_size_bytes);
 
         ASSERT((info.pitch >> bpp_log2) << bpp_log2 == info.pitch);
@@ -904,16 +904,6 @@ boost::container::small_vector<BufferImageCopy, 16> UnswizzleImage(Tegra::Memory
     return copies;
 }
 
-BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
-                            const ImageBase& image, std::span<u8> output) {
-    gpu_memory.ReadBlockUnsafe(gpu_addr, output.data(), image.guest_size_bytes);
-    return BufferCopy{
-        .src_offset = 0,
-        .dst_offset = 0,
-        .size = image.guest_size_bytes,
-    };
-}
-
 void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
                   std::span<BufferImageCopy> copies) {
     u32 output_offset = 0;
@@ -1201,8 +1191,7 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
         // Format checking is relaxed, but we still have to check for matching bytes per block.
         // This avoids creating a view for blits on UE4 titles where formats with different bytes
         // per block are aliased.
-        if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format) &&
-            False(options & RelaxedOptions::FormatBpp)) {
+        if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) {
             return std::nullopt;
         }
     } else {
@@ -1233,11 +1222,7 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
     }
     const bool strict_size = False(options & RelaxedOptions::Size);
     if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) {
-        if (False(options & RelaxedOptions::FormatBpp)) {
-            return std::nullopt;
-        } else if (!IsBlockLinearSizeCompatibleBPPRelaxed(existing, candidate, base->level, 0)) {
-            return std::nullopt;
-        }
+        return std::nullopt;
     }
     // TODO: compare block sizes
     return base;
diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h
index ab45a43c4..5a0649d24 100644
--- a/src/video_core/texture_cache/util.h
+++ b/src/video_core/texture_cache/util.h
@@ -66,9 +66,6 @@ struct OverlapResult {
     Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
     std::span<const u8> input, std::span<u8> output);
 
-[[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
-                                          const ImageBase& image, std::span<u8> output);
-
 void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output,
                   std::span<BufferImageCopy> copies);
 
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 421e71e5a..e04852e01 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -485,7 +485,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME);
         }
     }
-    if (extensions.extended_dynamic_state2 && (is_radv || is_qualcomm)) {
+    if (extensions.extended_dynamic_state2 && is_radv) {
         const u32 version = (properties.properties.driverVersion << 3) >> 3;
         if (version < VK_MAKE_API_VERSION(0, 22, 3, 1)) {
             LOG_WARNING(
@@ -498,6 +498,20 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME);
         }
     }
+    if (extensions.extended_dynamic_state2 && is_qualcomm) {
+        const u32 version = (properties.properties.driverVersion << 3) >> 3;
+        if (version >= VK_MAKE_API_VERSION(0, 0, 676, 0) &&
+            version < VK_MAKE_API_VERSION(0, 0, 680, 0)) {
+            // Qualcomm Adreno 7xx drivers do not properly support extended_dynamic_state2.
+            LOG_WARNING(Render_Vulkan,
+                        "Qualcomm Adreno 7xx drivers have broken VK_EXT_extended_dynamic_state2");
+            features.extended_dynamic_state2.extendedDynamicState2 = false;
+            features.extended_dynamic_state2.extendedDynamicState2LogicOp = false;
+            features.extended_dynamic_state2.extendedDynamicState2PatchControlPoints = false;
+            extensions.extended_dynamic_state2 = false;
+            loaded_extensions.erase(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME);
+        }
+    }
     if (extensions.extended_dynamic_state3 && is_radv) {
         LOG_WARNING(Render_Vulkan, "RADV has broken extendedDynamicState3ColorBlendEquation");
         features.extended_dynamic_state3.extendedDynamicState3ColorBlendEnable = false;
@@ -512,8 +526,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             dynamic_state3_enables = false;
         }
     }
-    if (extensions.vertex_input_dynamic_state && (is_radv || is_qualcomm)) {
-        // Qualcomm S8gen2 drivers do not properly support vertex_input_dynamic_state.
+    if (extensions.vertex_input_dynamic_state && is_radv) {
         // TODO(ameerj): Blacklist only offending driver versions
         // TODO(ameerj): Confirm if RDNA1 is affected
         const bool is_rdna2 =
@@ -526,6 +539,19 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             loaded_extensions.erase(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME);
         }
     }
+    if (extensions.vertex_input_dynamic_state && is_qualcomm) {
+        const u32 version = (properties.properties.driverVersion << 3) >> 3;
+        if (version >= VK_MAKE_API_VERSION(0, 0, 676, 0) &&
+            version < VK_MAKE_API_VERSION(0, 0, 680, 0)) {
+            // Qualcomm Adreno 7xx drivers do not properly support vertex_input_dynamic_state.
+            LOG_WARNING(
+                Render_Vulkan,
+                "Qualcomm Adreno 7xx drivers have broken VK_EXT_vertex_input_dynamic_state");
+            features.vertex_input_dynamic_state.vertexInputDynamicState = false;
+            extensions.vertex_input_dynamic_state = false;
+            loaded_extensions.erase(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME);
+        }
+    }
 
     sets_per_pool = 64;
     if (extensions.extended_dynamic_state3 && is_amd_driver &&
@@ -774,6 +800,17 @@ bool Device::ShouldBoostClocks() const {
     return validated_driver && !is_steam_deck && !is_debugging;
 }
 
+/// Returns whether timeline semaphores may be used; always false on Qualcomm/Turnip drivers.
+bool Device::HasTimelineSemaphore() const {
+    // Timeline semaphores do not work properly on all Qualcomm drivers.
+    // They generally work properly with Turnip drivers, but are problematic on some devices
+    // (e.g. ZTE handsets with Snapdragon 870).
+    const auto driver_id = GetDriverID();
+    const bool is_denylisted = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY ||
+                               driver_id == VK_DRIVER_ID_MESA_TURNIP;
+    return !is_denylisted && features.timeline_semaphore.timelineSemaphore;
+}
+
 bool Device::GetSuitability(bool requires_swapchain) {
     // Assume we will be suitable.
     bool suitable = true;
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 1f17265d5..be3ed45ff 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -77,6 +77,7 @@ VK_DEFINE_HANDLE(VmaAllocator)
     EXTENSION(KHR, SPIRV_1_4, spirv_1_4)                                                           \
     EXTENSION(KHR, SWAPCHAIN, swapchain)                                                           \
     EXTENSION(KHR, SWAPCHAIN_MUTABLE_FORMAT, swapchain_mutable_format)                             \
+    EXTENSION(KHR, IMAGE_FORMAT_LIST, image_format_list)                                           \
     EXTENSION(NV, DEVICE_DIAGNOSTICS_CONFIG, device_diagnostics_config)                            \
     EXTENSION(NV, GEOMETRY_SHADER_PASSTHROUGH, geometry_shader_passthrough)                        \
     EXTENSION(NV, VIEWPORT_ARRAY2, viewport_array2)                                                \
@@ -408,6 +409,11 @@ public:
         return extensions.workgroup_memory_explicit_layout;
     }
 
+    /// True when VK_KHR_image_format_list is enabled or instance_version is at least Vulkan 1.2.
+    bool IsKhrImageFormatListSupported() const {
+        return extensions.image_format_list || instance_version >= VK_API_VERSION_1_2;
+    }
+
     /// Returns true if the device supports VK_EXT_primitive_topology_list_restart.
     bool IsTopologyListPrimitiveRestartSupported() const {
         return features.primitive_topology_list_restart.primitiveTopologyListRestart;
@@ -522,13 +528,7 @@ public:
         return extensions.shader_atomic_int64;
     }
 
-    bool HasTimelineSemaphore() const {
-        if (GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY) {
-            // Timeline semaphores do not work properly on all Qualcomm drivers.
-            return false;
-        }
-        return features.timeline_semaphore.timelineSemaphore;
-    }
+    bool HasTimelineSemaphore() const;
 
     /// Returns the minimum supported version of SPIR-V.
     u32 SupportedSpirvVersion() const {
diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp
index 7624a9b32..6a294c1da 100644
--- a/src/video_core/vulkan_common/vulkan_instance.cpp
+++ b/src/video_core/vulkan_common/vulkan_instance.cpp
@@ -19,11 +19,9 @@
 #include <windows.h>
 // ensure include order
 #include <vulkan/vulkan_win32.h>
-#elif defined(__APPLE__)
-#include <vulkan/vulkan_macos.h>
 #elif defined(__ANDROID__)
 #include <vulkan/vulkan_android.h>
-#else
+#elif !defined(__APPLE__)
 #include <X11/Xlib.h>
 #include <vulkan/vulkan_wayland.h>
 #include <vulkan/vulkan_xlib.h>
@@ -68,7 +66,7 @@ namespace {
         break;
 #elif defined(__APPLE__)
     case Core::Frontend::WindowSystemType::Cocoa:
-        extensions.push_back(VK_MVK_MACOS_SURFACE_EXTENSION_NAME);
+        extensions.push_back(VK_EXT_METAL_SURFACE_EXTENSION_NAME);
         break;
 #elif defined(__ANDROID__)
     case Core::Frontend::WindowSystemType::Android:
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
index a2ef0efa4..42f3ee0b4 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
@@ -221,8 +221,8 @@ vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const {
     const VmaAllocationCreateInfo alloc_ci = {
         .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT,
         .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
-        .requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
-        .preferredFlags = 0,
+        .requiredFlags = 0,
+        .preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
         .memoryTypeBits = 0,
         .pool = VK_NULL_HANDLE,
         .pUserData = nullptr,
diff --git a/src/video_core/vulkan_common/vulkan_surface.cpp b/src/video_core/vulkan_common/vulkan_surface.cpp
index c34599365..cfea4cd7b 100644
--- a/src/video_core/vulkan_common/vulkan_surface.cpp
+++ b/src/video_core/vulkan_common/vulkan_surface.cpp
@@ -11,11 +11,9 @@
 #include <windows.h>
 // ensure include order
 #include <vulkan/vulkan_win32.h>
-#elif defined(__APPLE__)
-#include <vulkan/vulkan_macos.h>
 #elif defined(__ANDROID__)
 #include <vulkan/vulkan_android.h>
-#else
+#elif !defined(__APPLE__)
 #include <X11/Xlib.h>
 #include <vulkan/vulkan_wayland.h>
 #include <vulkan/vulkan_xlib.h>
@@ -44,12 +42,16 @@ vk::SurfaceKHR CreateSurface(
     }
 #elif defined(__APPLE__)
     if (window_info.type == Core::Frontend::WindowSystemType::Cocoa) {
-        const VkMacOSSurfaceCreateInfoMVK mvk_ci{VK_STRUCTURE_TYPE_MACOS_SURFACE_CREATE_INFO_MVK,
-                                                 nullptr, 0, window_info.render_surface};
-        const auto vkCreateMacOSSurfaceMVK = reinterpret_cast<PFN_vkCreateMacOSSurfaceMVK>(
-            dld.vkGetInstanceProcAddr(*instance, "vkCreateMacOSSurfaceMVK"));
-        if (!vkCreateMacOSSurfaceMVK ||
-            vkCreateMacOSSurfaceMVK(*instance, &mvk_ci, nullptr, &unsafe_surface) != VK_SUCCESS) {
+        // sType has to be spelled out: members omitted from a designated initializer are
+        // zero-initialized, and zero is not the structure type of this create info.
+        const VkMetalSurfaceCreateInfoEXT macos_ci = {
+            .sType = VK_STRUCTURE_TYPE_METAL_SURFACE_CREATE_INFO_EXT,
+            .pLayer = static_cast<const CAMetalLayer*>(window_info.render_surface),
+        };
+        const auto vkCreateMetalSurfaceEXT = reinterpret_cast<PFN_vkCreateMetalSurfaceEXT>(
+            dld.vkGetInstanceProcAddr(*instance, "vkCreateMetalSurfaceEXT"));
+        if (!vkCreateMetalSurfaceEXT ||
+            vkCreateMetalSurfaceEXT(*instance, &macos_ci, nullptr, &unsafe_surface) != VK_SUCCESS) {
             LOG_ERROR(Render_Vulkan, "Failed to initialize Metal surface");
             throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED);
         }
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index b5e70fcd4..32bd75ad8 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -15,6 +15,8 @@
 #define VK_NO_PROTOTYPES
 #ifdef _WIN32
 #define VK_USE_PLATFORM_WIN32_KHR
+#elif defined(__APPLE__)
+#define VK_USE_PLATFORM_METAL_EXT
 #endif
 #include <vulkan/vulkan.h>