41 files changed, 472 insertions, 413 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 2f6cdd216..269db21a5 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -231,6 +231,7 @@ endif()
 
 target_include_directories(video_core PRIVATE ${FFmpeg_INCLUDE_DIR})
 target_link_libraries(video_core PRIVATE ${FFmpeg_LIBRARIES})
+target_link_options(video_core PRIVATE ${FFmpeg_LDFLAGS})
 
 add_dependencies(video_core host_shaders)
 target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 7bfd57369..d350c9b36 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -570,13 +570,12 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
     ForEachWrittenRange(*cpu_src_address, amount, mirror);
     // This subtraction in this order is important for overlapping copies.
     common_ranges.subtract(subtract_interval);
-    bool atleast_1_download = tmp_intervals.size() != 0;
-    for (const IntervalType add_interval : tmp_intervals) {
+    const bool has_new_downloads = tmp_intervals.size() != 0;
+    for (const IntervalType& add_interval : tmp_intervals) {
         common_ranges.add(add_interval);
     }
-
     runtime.CopyBuffer(dest_buffer, src_buffer, copies);
-    if (atleast_1_download) {
+    if (has_new_downloads) {
         dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount);
     }
     std::vector<u8> tmp_buffer(amount);
diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp
index f798a0053..61966cbfe 100644
--- a/src/video_core/command_classes/codecs/codec.cpp
+++ b/src/video_core/command_classes/codecs/codec.cpp
@@ -5,6 +5,7 @@
 #include <fstream>
 #include <vector>
 #include "common/assert.h"
+#include "common/settings.h"
 #include "video_core/command_classes/codecs/codec.h"
 #include "video_core/command_classes/codecs/h264.h"
 #include "video_core/command_classes/codecs/vp9.h"
@@ -16,108 +17,153 @@ extern "C" {
 }
 
 namespace Tegra {
-#if defined(LIBVA_FOUND)
-// Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c originally under MIT license
 namespace {
-constexpr std::array<const char*, 2> VAAPI_DRIVERS = {
-    "i915",
-    "amdgpu",
-};
+constexpr AVPixelFormat PREFERRED_GPU_FMT = AV_PIX_FMT_NV12;
+constexpr AVPixelFormat PREFERRED_CPU_FMT = AV_PIX_FMT_YUV420P;
+
+void AVPacketDeleter(AVPacket* ptr) {
+    av_packet_free(&ptr);
+}
 
-AVPixelFormat GetHwFormat(AVCodecContext*, const AVPixelFormat* pix_fmts) {
+using AVPacketPtr = std::unique_ptr<AVPacket, decltype(&AVPacketDeleter)>;
+
+AVPixelFormat GetGpuFormat(AVCodecContext* av_codec_ctx, const AVPixelFormat* pix_fmts) {
     for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
-        if (*p == AV_PIX_FMT_VAAPI) {
-            return AV_PIX_FMT_VAAPI;
+        if (*p == av_codec_ctx->pix_fmt) {
+            return av_codec_ctx->pix_fmt;
         }
     }
     LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU");
-    return *pix_fmts;
+    av_buffer_unref(&av_codec_ctx->hw_device_ctx);
+    av_codec_ctx->pix_fmt = PREFERRED_CPU_FMT;
+    return PREFERRED_CPU_FMT;
+}
+} // namespace
+
+void AVFrameDeleter(AVFrame* ptr) {
+    av_frame_free(&ptr);
 }
 
-bool CreateVaapiHwdevice(AVBufferRef** av_hw_device) {
+Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs)
+    : gpu(gpu_), state{regs}, h264_decoder(std::make_unique<Decoder::H264>(gpu)),
+      vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
+
+Codec::~Codec() {
+    if (!initialized) {
+        return;
+    }
+    // Free libav memory
+    avcodec_free_context(&av_codec_ctx);
+    av_buffer_unref(&av_gpu_decoder);
+}
+
+bool Codec::CreateGpuAvDevice() {
+#if defined(LIBVA_FOUND)
+    static constexpr std::array<const char*, 3> VAAPI_DRIVERS = {
+        "i915",
+        "iHD",
+        "amdgpu",
+    };
     AVDictionary* hwdevice_options = nullptr;
     av_dict_set(&hwdevice_options, "connection_type", "drm", 0);
     for (const auto& driver : VAAPI_DRIVERS) {
         av_dict_set(&hwdevice_options, "kernel_driver", driver, 0);
-        const int hwdevice_error = av_hwdevice_ctx_create(av_hw_device, AV_HWDEVICE_TYPE_VAAPI,
+        const int hwdevice_error = av_hwdevice_ctx_create(&av_gpu_decoder, AV_HWDEVICE_TYPE_VAAPI,
                                                           nullptr, hwdevice_options, 0);
         if (hwdevice_error >= 0) {
             LOG_INFO(Service_NVDRV, "Using VA-API with {}", driver);
             av_dict_free(&hwdevice_options);
+            av_codec_ctx->pix_fmt = AV_PIX_FMT_VAAPI;
             return true;
         }
         LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed {}", hwdevice_error);
     }
     LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed for all drivers");
     av_dict_free(&hwdevice_options);
-    return false;
-}
-} // namespace
 #endif
-
-void AVFrameDeleter(AVFrame* ptr) {
-    av_frame_free(&ptr);
+    static constexpr auto HW_CONFIG_METHOD = AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX;
+    static constexpr std::array GPU_DECODER_TYPES{
+        AV_HWDEVICE_TYPE_CUDA,
+#ifdef _WIN32
+        AV_HWDEVICE_TYPE_D3D11VA,
+#else
+        AV_HWDEVICE_TYPE_VDPAU,
+#endif
+    };
+    for (const auto& type : GPU_DECODER_TYPES) {
+        const int hwdevice_res = av_hwdevice_ctx_create(&av_gpu_decoder, type, nullptr, nullptr, 0);
+        if (hwdevice_res < 0) {
+            LOG_DEBUG(Service_NVDRV, "{} av_hwdevice_ctx_create failed {}",
+                      av_hwdevice_get_type_name(type), hwdevice_res);
+            continue;
+        }
+        for (int i = 0;; i++) {
+            const AVCodecHWConfig* config = avcodec_get_hw_config(av_codec, i);
+            if (!config) {
+                LOG_DEBUG(Service_NVDRV, "{} decoder does not support device type {}.",
+                          av_codec->name, av_hwdevice_get_type_name(type));
+                // Release the unusable device so the next attempt does not leak it
+                av_buffer_unref(&av_gpu_decoder);
+                break;
+            }
+            if (config->methods & HW_CONFIG_METHOD && config->device_type == type) {
+                av_codec_ctx->pix_fmt = config->pix_fmt;
+                LOG_INFO(Service_NVDRV, "Using {} GPU decoder", av_hwdevice_get_type_name(type));
+                return true;
+            }
+        }
+    }
+    return false;
 }
 
-Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs)
-    : gpu(gpu_), state{regs}, h264_decoder(std::make_unique<Decoder::H264>(gpu)),
-      vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
-
-Codec::~Codec() {
-    if (!initialized) {
-        return;
-    }
-    // Free libav memory
-    avcodec_send_packet(av_codec_ctx, nullptr);
-    AVFrame* av_frame = av_frame_alloc();
-    avcodec_receive_frame(av_codec_ctx, av_frame);
-    avcodec_flush_buffers(av_codec_ctx);
-    av_frame_free(&av_frame);
-    avcodec_close(av_codec_ctx);
-    av_buffer_unref(&av_hw_device);
+void Codec::InitializeAvCodecContext() {
+    av_codec_ctx = avcodec_alloc_context3(av_codec);
+    av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
 }
 
-void Codec::InitializeHwdec() {
-    // Prioritize integrated GPU to mitigate bandwidth bottlenecks
-#if defined(LIBVA_FOUND)
-    if (CreateVaapiHwdevice(&av_hw_device)) {
-        const auto hw_device_ctx = av_buffer_ref(av_hw_device);
-        ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed");
-        av_codec_ctx->hw_device_ctx = hw_device_ctx;
-        av_codec_ctx->get_format = GetHwFormat;
+void Codec::InitializeGpuDecoder() {
+    if (!CreateGpuAvDevice()) {
+        av_buffer_unref(&av_gpu_decoder);
         return;
     }
-#endif
-    // TODO more GPU accelerated decoders
+    auto* hw_device_ctx = av_buffer_ref(av_gpu_decoder);
+    ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed");
+    av_codec_ctx->hw_device_ctx = hw_device_ctx;
+    av_codec_ctx->get_format = GetGpuFormat;
 }
 
 void Codec::Initialize() {
-    AVCodecID codec;
-    switch (current_codec) {
-    case NvdecCommon::VideoCodec::H264:
-        codec = AV_CODEC_ID_H264;
-        break;
-    case NvdecCommon::VideoCodec::Vp9:
-        codec = AV_CODEC_ID_VP9;
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Unknown codec {}", current_codec);
+    const AVCodecID codec = [&] {
+        switch (current_codec) {
+        case NvdecCommon::VideoCodec::H264:
+            return AV_CODEC_ID_H264;
+        case NvdecCommon::VideoCodec::Vp9:
+            return AV_CODEC_ID_VP9;
+        default:
+            UNIMPLEMENTED_MSG("Unknown codec {}", current_codec);
+            return AV_CODEC_ID_NONE;
+        }
+    }();
+    av_codec = avcodec_find_decoder(codec);
+    if (av_codec == nullptr) {
+        // Unknown or unavailable codec: GPU probing and avcodec_open2() both require a decoder
+        LOG_ERROR(Service_NVDRV, "No FFmpeg decoder found for codec {}", GetCurrentCodecName());
+        return;
+    }
+
+    InitializeAvCodecContext();
+    if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::GPU) {
+        InitializeGpuDecoder();
+    }
+    if (const int res = avcodec_open2(av_codec_ctx, av_codec, nullptr); res < 0) {
+        LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed with result {}", res);
+        avcodec_free_context(&av_codec_ctx);
+        av_buffer_unref(&av_gpu_decoder);
         return;
     }
-    av_codec = avcodec_find_decoder(codec);
-    av_codec_ctx = avcodec_alloc_context3(av_codec);
-    av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
-    InitializeHwdec();
     if (!av_codec_ctx->hw_device_ctx) {
         LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding");
     }
-    const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
-    if (av_error < 0) {
-        LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
-        avcodec_close(av_codec_ctx);
-        av_buffer_unref(&av_hw_device);
-        return;
-    }
     initialized = true;
 }
 
@@ -133,6 +179,9 @@ void Codec::Decode() {
     if (is_first_frame) {
         Initialize();
     }
+    if (!initialized) {
+        return;
+    }
     bool vp9_hidden_frame = false;
     std::vector<u8> frame_data;
     if (current_codec == NvdecCommon::VideoCodec::H264) {
@@ -141,50 +190,48 @@ void Codec::Decode() {
         frame_data = vp9_decoder->ComposeFrameHeader(state);
         vp9_hidden_frame = vp9_decoder->WasFrameHidden();
     }
-    AVPacket packet{};
-    av_init_packet(&packet);
-    packet.data = frame_data.data();
-    packet.size = static_cast<s32>(frame_data.size());
-    if (const int ret = avcodec_send_packet(av_codec_ctx, &packet); ret) {
-        LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", ret);
+    AVPacketPtr packet{av_packet_alloc(), AVPacketDeleter};
+    if (!packet) {
+        LOG_ERROR(Service_NVDRV, "av_packet_alloc failed");
+        return;
+    }
+    packet->data = frame_data.data();
+    packet->size = static_cast<s32>(frame_data.size());
+    if (const int res = avcodec_send_packet(av_codec_ctx, packet.get()); res != 0) {
+        LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", res);
         return;
     }
     // Only receive/store visible frames
     if (vp9_hidden_frame) {
         return;
     }
-    AVFrame* hw_frame = av_frame_alloc();
-    AVFrame* sw_frame = hw_frame;
-    ASSERT_MSG(hw_frame, "av_frame_alloc hw_frame failed");
-    if (const int ret = avcodec_receive_frame(av_codec_ctx, hw_frame); ret) {
+    AVFramePtr initial_frame{av_frame_alloc(), AVFrameDeleter};
+    AVFramePtr final_frame{nullptr, AVFrameDeleter};
+    ASSERT_MSG(initial_frame, "av_frame_alloc initial_frame failed");
+    if (const int ret = avcodec_receive_frame(av_codec_ctx, initial_frame.get()); ret) {
         LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret);
-        av_frame_free(&hw_frame);
         return;
     }
-    if (!hw_frame->width || !hw_frame->height) {
+    if (initial_frame->width == 0 || initial_frame->height == 0) {
         LOG_WARNING(Service_NVDRV, "Zero width or height in frame");
-        av_frame_free(&hw_frame);
         return;
     }
-#if defined(LIBVA_FOUND)
-    // Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c under MIT license
-    if (hw_frame->format == AV_PIX_FMT_VAAPI) {
-        sw_frame = av_frame_alloc();
-        ASSERT_MSG(sw_frame, "av_frame_alloc sw_frame failed");
+    if (av_codec_ctx->hw_device_ctx) {
+        final_frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};
+        ASSERT_MSG(final_frame, "av_frame_alloc final_frame failed");
         // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp
         // because Intel drivers crash unless using AV_PIX_FMT_NV12
-        sw_frame->format = AV_PIX_FMT_NV12;
-        const int transfer_data_ret = av_hwframe_transfer_data(sw_frame, hw_frame, 0);
-        ASSERT_MSG(!transfer_data_ret, "av_hwframe_transfer_data error {}", transfer_data_ret);
-        av_frame_free(&hw_frame);
+        final_frame->format = PREFERRED_GPU_FMT;
+        const int ret = av_hwframe_transfer_data(final_frame.get(), initial_frame.get(), 0);
+        ASSERT_MSG(!ret, "av_hwframe_transfer_data error {}", ret);
+    } else {
+        final_frame = std::move(initial_frame);
     }
-#endif
-    if (sw_frame->format != AV_PIX_FMT_YUV420P && sw_frame->format != AV_PIX_FMT_NV12) {
-        UNIMPLEMENTED_MSG("Unexpected video format from host graphics: {}", sw_frame->format);
-        av_frame_free(&sw_frame);
+    if (final_frame->format != PREFERRED_CPU_FMT && final_frame->format != PREFERRED_GPU_FMT) {
+        UNIMPLEMENTED_MSG("Unexpected video format: {}", final_frame->format);
         return;
     }
-    av_frames.push(AVFramePtr{sw_frame, AVFrameDeleter});
+    av_frames.push(std::move(final_frame));
     if (av_frames.size() > 10) {
         LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame");
         av_frames.pop();
diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h
index 71936203f..f9a80886f 100644
--- a/src/video_core/command_classes/codecs/codec.h
+++ b/src/video_core/command_classes/codecs/codec.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <memory>
+#include <string_view>
 #include <queue>
 #include "common/common_types.h"
 #include "video_core/command_classes/nvdec_common.h"
@@ -50,18 +51,23 @@ public:
 
     /// Returns the value of current_codec
     [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const;
+
     /// Return name of the current codec
     [[nodiscard]] std::string_view GetCurrentCodecName() const;
 
 private:
-    void InitializeHwdec();
+    void InitializeAvCodecContext();
+
+    void InitializeGpuDecoder();
+
+    bool CreateGpuAvDevice();
 
     bool initialized{};
     NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
 
     AVCodec* av_codec{nullptr};
-    AVBufferRef* av_hw_device{nullptr};
     AVCodecContext* av_codec_ctx{nullptr};
+    AVBufferRef* av_gpu_decoder{nullptr};
 
     GPU& gpu;
     const NvdecCommon::NvdecRegisters& state;
diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp
index 5fb6d45ee..51ee14c13 100644
--- a/src/video_core/command_classes/codecs/h264.cpp
+++ b/src/video_core/command_classes/codecs/h264.cpp
@@ -95,7 +95,8 @@ const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegister
     const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units /
                            (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
 
-    writer.WriteUe(16);
+    // TODO (ameerj): Where do we get this number? It seems to be particular to each stream
+    writer.WriteUe(6); // Max number of reference frames
     writer.WriteBit(false);
     writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
     writer.WriteUe(pic_height - 1);
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 1aa43523a..7f4ca6282 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -475,10 +475,10 @@ public:
 
                 // These values are used by Nouveau and some games.
                 AddGL = 0x8006,
-                SubtractGL = 0x8007,
-                ReverseSubtractGL = 0x8008,
-                MinGL = 0x800a,
-                MaxGL = 0x800b
+                MinGL = 0x8007,
+                MaxGL = 0x8008,
+                SubtractGL = 0x800a,
+                ReverseSubtractGL = 0x800b
             };
 
             enum class Factor : u32 {
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index c7ec1eac9..67388d980 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -82,41 +82,41 @@ void MaxwellDMA::Launch() {
 }
 
 void MaxwellDMA::CopyPitchToPitch() {
-    // When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D
-    // buffer of length `line_length_in`.
-    // Otherwise we copy a 2D image of dimensions (line_length_in, line_count).
-    auto& accelerate = rasterizer->AccessAccelerateDMA();
-    if (!regs.launch_dma.multi_line_enable) {
-        const bool is_buffer_clear = regs.launch_dma.remap_enable != 0 &&
-                                     regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A;
-        // TODO: allow multisized components.
-        if (is_buffer_clear) {
-            ASSERT(regs.remap_const.component_size_minus_one == 3);
-            accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value);
-            std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value);
-            memory_manager.WriteBlockUnsafe(regs.offset_out,
-                                            reinterpret_cast<u8*>(tmp_buffer.data()),
-                                            regs.line_length_in * sizeof(u32));
-            return;
-        }
-        UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
-        if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
-            std::vector<u8> tmp_buffer(regs.line_length_in);
-            memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in);
-            memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in);
+    // When `multi_line_enable` bit is enabled we copy a 2D image of dimensions
+    // (line_length_in, line_count).
+    // Otherwise the copy is performed as if we were copying a 1D buffer of length line_length_in.
+    const bool remap_enabled = regs.launch_dma.remap_enable != 0;
+    if (regs.launch_dma.multi_line_enable) {
+        UNIMPLEMENTED_IF(remap_enabled);
+
+        // Perform a line-by-line copy.
+        // We're going to take a subrect of size (line_length_in, line_count) from the source
+        // rectangle. There is no need to manually flush/invalidate the regions because CopyBlock
+        // does that for us.
+        for (u32 line = 0; line < regs.line_count; ++line) {
+            const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in;
+            const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out;
+            memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in);
         }
         return;
     }
-
-    UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
-
-    // Perform a line-by-line copy.
-    // We're going to take a subrect of size (line_length_in, line_count) from the source rectangle.
-    // There is no need to manually flush/invalidate the regions because CopyBlock does that for us.
-    for (u32 line = 0; line < regs.line_count; ++line) {
-        const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in;
-        const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out;
-        memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in);
+    // TODO: allow multisized components.
+    auto& accelerate = rasterizer->AccessAccelerateDMA();
+    const bool is_const_a_dst = regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A;
+    const bool is_buffer_clear = remap_enabled && is_const_a_dst;
+    if (is_buffer_clear) {
+        ASSERT(regs.remap_const.component_size_minus_one == 3);
+        accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value);
+        std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value);
+        memory_manager.WriteBlockUnsafe(regs.offset_out, reinterpret_cast<u8*>(tmp_buffer.data()),
+                                        regs.line_length_in * sizeof(u32));
+        return;
+    }
+    UNIMPLEMENTED_IF(remap_enabled);
+    if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
+        std::vector<u8> tmp_buffer(regs.line_length_in);
+        memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in);
+        memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in);
     }
 }
 
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 9e457ae16..a04514425 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -175,7 +175,7 @@ public:
     static_assert(sizeof(LaunchDMA) == 4);
 
     struct RemapConst {
-        enum Swizzle : u32 {
+        enum class Swizzle : u32 {
             SRC_X = 0,
             SRC_Y = 1,
             SRC_Z = 2,
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index ff024f530..2ae3639b5 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -531,14 +531,6 @@ void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
     interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
 }
 
-void GPU::ShutDown() {
-    // Signal that threads should no longer block on syncpoint fences
-    shutting_down.store(true, std::memory_order_relaxed);
-    sync_cv.notify_all();
-
-    gpu_thread.ShutDown();
-}
-
 void GPU::OnCommandListEnd() {
     if (is_async) {
         // This command only applies to asynchronous GPU mode
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a8e98e51b..e6a02a71b 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -219,9 +219,6 @@ public:
         return *shader_notify;
     }
 
-    // Stops the GPU execution and waits for the GPU to finish working
-    void ShutDown();
-
     /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
     void WaitFence(u32 syncpoint_id, u32 value);
 
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 46f642b19..9547f277a 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -17,9 +17,9 @@
 namespace VideoCommon::GPUThread {
 
 /// Runs the GPU thread
-static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
-                      Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
-                      SynchState& state) {
+static void RunThread(std::stop_token stop_token, Core::System& system,
+                      VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
+                      Tegra::DmaPusher& dma_pusher, SynchState& state) {
     std::string name = "yuzu:GPU";
     MicroProfileOnThreadCreate(name.c_str());
     SCOPE_EXIT({ MicroProfileOnThreadExit(); });
@@ -28,20 +28,14 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
     Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
     system.RegisterHostThread();
 
-    // Wait for first GPU command before acquiring the window context
-    state.queue.Wait();
-
-    // If emulation was stopped during disk shader loading, abort before trying to acquire context
-    if (!state.is_running) {
-        return;
-    }
-
     auto current_context = context.Acquire();
     VideoCore::RasterizerInterface* const rasterizer = renderer.ReadRasterizer();
 
-    CommandDataContainer next;
-    while (state.is_running) {
-        next = state.queue.PopWait();
+    while (!stop_token.stop_requested()) {
+        CommandDataContainer next = state.queue.PopWait(stop_token);
+        if (stop_token.stop_requested()) {
+            break;
+        }
         if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) {
             dma_pusher.Push(std::move(submit_list->entries));
             dma_pusher.DispatchCalls();
@@ -55,8 +49,6 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
             rasterizer->FlushRegion(flush->addr, flush->size);
         } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
             rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
-        } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
-            ASSERT(state.is_running == false);
         } else {
             UNREACHABLE();
         }
@@ -73,16 +65,14 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
 ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
     : system{system_}, is_async{is_async_} {}
 
-ThreadManager::~ThreadManager() {
-    ShutDown();
-}
+ThreadManager::~ThreadManager() = default;
 
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
                                 Core::Frontend::GraphicsContext& context,
                                 Tegra::DmaPusher& dma_pusher) {
     rasterizer = renderer.ReadRasterizer();
-    thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
-                         std::ref(dma_pusher), std::ref(state));
+    thread = std::jthread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
+                          std::ref(dma_pusher), std::ref(state));
 }
 
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
@@ -117,26 +107,6 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     rasterizer->OnCPUWrite(addr, size);
 }
 
-void ThreadManager::ShutDown() {
-    if (!state.is_running) {
-        return;
-    }
-
-    {
-        std::lock_guard lk(state.write_lock);
-        state.is_running = false;
-        state.cv.notify_all();
-    }
-
-    if (!thread.joinable()) {
-        return;
-    }
-
-    // Notify GPU thread that a shutdown is pending
-    PushCommand(EndProcessingCommand());
-    thread.join();
-}
-
 void ThreadManager::OnCommandListEnd() {
     PushCommand(OnCommandListEndCommand());
 }
@@ -152,9 +122,8 @@ u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
     state.queue.Push(CommandDataContainer(std::move(command_data), fence, block));
 
     if (block) {
-        state.cv.wait(lk, [this, fence] {
-            return fence <= state.signaled_fence.load(std::memory_order_relaxed) ||
-                   !state.is_running;
+        state.cv.wait(lk, thread.get_stop_token(), [this, fence] {
+            return fence <= state.signaled_fence.load(std::memory_order_relaxed);
         });
     }
 
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 11a648f38..91bada925 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -33,9 +33,6 @@ class RendererBase;
 
 namespace VideoCommon::GPUThread {
 
-/// Command to signal to the GPU thread that processing has ended
-struct EndProcessingCommand final {};
-
 /// Command to signal to the GPU thread that a command list is ready for processing
 struct SubmitListCommand final {
     explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {}
@@ -83,7 +80,7 @@ struct OnCommandListEndCommand final {};
 struct GPUTickCommand final {};
 
 using CommandData =
-    std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
+    std::variant<std::monostate, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
                  InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
                  GPUTickCommand>;
 
@@ -100,14 +97,12 @@ struct CommandDataContainer {
 
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
-    std::atomic_bool is_running{true};
-
-    using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
+    using CommandQueue = Common::SPSCQueue<CommandDataContainer, true>;
     std::mutex write_lock;
     CommandQueue queue;
     u64 last_fence{};
     std::atomic<u64> signaled_fence{};
-    std::condition_variable cv;
+    std::condition_variable_any cv;
 };
 
 /// Class used to manage the GPU thread
@@ -149,7 +144,7 @@ private:
     VideoCore::RasterizerInterface* rasterizer = nullptr;
 
     SynchState state;
-    std::thread thread;
+    std::jthread thread;
 };
 
 } // namespace VideoCommon::GPUThread
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index c9cff7450..20d748c12 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -6,7 +6,6 @@ set(SHADER_FILES
     convert_float_to_depth.frag
     full_screen_triangle.vert
     opengl_copy_bc4.comp
-    opengl_copy_bgra.comp
     opengl_present.frag
     opengl_present.vert
     pitch_unswizzle.comp
diff --git a/src/video_core/host_shaders/opengl_copy_bgra.comp b/src/video_core/host_shaders/opengl_copy_bgra.comp
deleted file mode 100644
index 2571a4abf..000000000
--- a/src/video_core/host_shaders/opengl_copy_bgra.comp
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2021 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#version 430 core
-
-layout (local_size_x = 4, local_size_y = 4) in;
-
-layout(binding = 0, rgba8) readonly uniform image2DArray bgr_input;
-layout(binding = 1, rgba8) writeonly uniform image2DArray bgr_output;
-
-void main() {
-    vec4 color = imageLoad(bgr_input, ivec3(gl_GlobalInvocationID));
-    imageStore(bgr_output, ivec3(gl_GlobalInvocationID), color.bgra);
-}
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index c60ed6453..dce00e829 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 07a995f7d..187a28e4d 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -147,8 +147,7 @@ void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
 
 void BufferCacheRuntime::ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value) {
     glClearNamedBufferSubData(dest_buffer.Handle(), GL_R32UI, static_cast<GLintptr>(offset),
-                              static_cast<GLsizeiptr>(size / sizeof(u32)), GL_RED, GL_UNSIGNED_INT,
-                              &value);
+                              static_cast<GLsizeiptr>(size), GL_RED, GL_UNSIGNED_INT, &value);
 }
 
 void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) {
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index b0e14182e..02682bd76 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -293,6 +293,8 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading,
     }};
     LoadPipelines(stop_loading, shader_cache_filename, CACHE_VERSION, load_compute, load_graphics);
 
+    LOG_INFO(Render_OpenGL, "Total Pipeline Count: {}", state.total);
+
     std::unique_lock lock{state.mutex};
     callback(VideoCore::LoadCallbackStage::Build, 0, state.total);
     state.has_loaded = true;
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index b0aee6cc1..54dae2c41 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -461,7 +461,7 @@ bool TextureCacheRuntime::CanImageBeCopied(const Image& dst, const Image& src) {
     if (dst.info.type == ImageType::e3D && dst.info.format == PixelFormat::BC4_UNORM) {
         return false;
     }
-    if (IsPixelFormatBGR(dst.info.format) || IsPixelFormatBGR(src.info.format)) {
+    if (IsPixelFormatBGR(dst.info.format) != IsPixelFormatBGR(src.info.format)) {
         return false;
     }
     return true;
@@ -473,7 +473,7 @@ void TextureCacheRuntime::EmulateCopyImage(Image& dst, Image& src,
         ASSERT(src.info.type == ImageType::e3D);
         util_shaders.CopyBC4(dst, src, copies);
     } else if (IsPixelFormatBGR(dst.info.format) || IsPixelFormatBGR(src.info.format)) {
-        util_shaders.CopyBGR(dst, src, copies);
+        bgr_copy_pass.CopyBGR(dst, src, copies);
     } else {
         UNREACHABLE();
     }
@@ -1112,4 +1112,37 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
     framebuffer.handle = handle;
 }
 
+void BGRCopyPass::CopyBGR(Image& dst_image, Image& src_image,
+                          std::span<const VideoCommon::ImageCopy> copies) {
+    static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0};
+    const u32 requested_pbo_size =
+        std::max(src_image.unswizzled_size_bytes, dst_image.unswizzled_size_bytes);
+
+    if (bgr_pbo_size < requested_pbo_size) {
+        bgr_pbo.Create();
+        bgr_pbo_size = requested_pbo_size;
+        glNamedBufferData(bgr_pbo.handle, bgr_pbo_size, nullptr, GL_STREAM_COPY);
+    }
+    for (const ImageCopy& copy : copies) {
+        ASSERT(copy.src_offset == zero_offset);
+        ASSERT(copy.dst_offset == zero_offset);
+
+        // Copy from source to PBO
+        glPixelStorei(GL_PACK_ALIGNMENT, 1);
+        glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width);
+        glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr_pbo.handle);
+        glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
+                             copy.src_subresource.num_layers, src_image.GlFormat(),
+                             src_image.GlType(), static_cast<GLsizei>(bgr_pbo_size), nullptr);
+
+        // Copy from PBO to destination in desired GL format
+        glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+        glPixelStorei(GL_UNPACK_ROW_LENGTH, copy.extent.width);
+        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bgr_pbo.handle);
+        glTextureSubImage3D(dst_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
+                            copy.dst_subresource.num_layers, dst_image.GlFormat(),
+                            dst_image.GlType(), nullptr);
+    }
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 4a4f6301c..c498a8a8f 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -47,6 +47,19 @@ struct FormatProperties {
     bool is_compressed;
 };
 
+class BGRCopyPass {
+public:
+    BGRCopyPass() = default;
+    ~BGRCopyPass() = default;
+
+    void CopyBGR(Image& dst_image, Image& src_image,
+                 std::span<const VideoCommon::ImageCopy> copies);
+
+private:
+    OGLBuffer bgr_pbo;
+    size_t bgr_pbo_size{};
+};
+
 class TextureCacheRuntime {
     friend Framebuffer;
     friend Image;
@@ -118,6 +131,7 @@ private:
     const Device& device;
     StateTracker& state_tracker;
     UtilShaders util_shaders;
+    BGRCopyPass bgr_copy_pass;
 
     std::array<std::unordered_map<GLenum, FormatProperties>, 3> format_properties;
     bool has_broken_texture_view_formats = false;
@@ -162,6 +176,14 @@ public:
         return texture.handle;
     }
 
+    GLuint GlFormat() const noexcept {
+        return gl_format;
+    }
+
+    GLuint GlType() const noexcept {
+        return gl_type;
+    }
+
 private:
     void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);
 
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 672f94bfc..39158aa3e 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -52,7 +52,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
     {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT},                          // BC6H_UFLOAT
     {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT},                            // BC6H_SFLOAT
     {GL_COMPRESSED_RGBA_ASTC_4x4_KHR},                                // ASTC_2D_4X4_UNORM
-    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE},                            // B8G8R8A8_UNORM
+    {GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV},                 // B8G8R8A8_UNORM
     {GL_RGBA32F, GL_RGBA, GL_FLOAT},                                  // R32G32B32A32_FLOAT
     {GL_RGBA32I, GL_RGBA_INTEGER, GL_INT},                            // R32G32B32A32_SINT
     {GL_RG32F, GL_RG, GL_FLOAT},                                      // R32G32_FLOAT
@@ -81,7 +81,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
     {GL_COMPRESSED_RGBA_ASTC_8x8_KHR},                                // ASTC_2D_8X8_UNORM
     {GL_COMPRESSED_RGBA_ASTC_8x5_KHR},                                // ASTC_2D_8X5_UNORM
     {GL_COMPRESSED_RGBA_ASTC_5x4_KHR},                                // ASTC_2D_5X4_UNORM
-    {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE},                     // B8G8R8A8_SRGB
+    {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV},          // B8G8R8A8_SRGB
     {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT},                         // BC1_RGBA_SRGB
     {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT},                         // BC2_SRGB
     {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT},                         // BC3_SRGB
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 333f35a1c..897c380b3 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -14,7 +14,6 @@
 #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
 #include "video_core/host_shaders/opengl_copy_bc4_comp.h"
-#include "video_core/host_shaders/opengl_copy_bgra_comp.h"
 #include "video_core/host_shaders/pitch_unswizzle_comp.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_shader_util.h"
@@ -44,11 +43,6 @@ namespace {
 OGLProgram MakeProgram(std::string_view source) {
     return CreateProgram(source, GL_COMPUTE_SHADER);
 }
-
-size_t NumPixelsInCopy(const VideoCommon::ImageCopy& copy) {
-    return static_cast<size_t>(copy.extent.width * copy.extent.height *
-                               copy.src_subresource.num_layers);
-}
 } // Anonymous namespace
 
 UtilShaders::UtilShaders(ProgramManager& program_manager_)
@@ -56,7 +50,6 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
       block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)),
       block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
       pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
-      copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)),
       copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
     const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
     swizzle_table_buffer.Create();
@@ -255,43 +248,6 @@ void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const Im
     program_manager.RestoreGuestCompute();
 }
 
-void UtilShaders::CopyBGR(Image& dst_image, Image& src_image,
-                          std::span<const VideoCommon::ImageCopy> copies) {
-    static constexpr GLuint BINDING_INPUT_IMAGE = 0;
-    static constexpr GLuint BINDING_OUTPUT_IMAGE = 1;
-    static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0};
-    const u32 bytes_per_block = BytesPerBlock(dst_image.info.format);
-    switch (bytes_per_block) {
-    case 2:
-        // BGR565 copy
-        for (const ImageCopy& copy : copies) {
-            ASSERT(copy.src_offset == zero_offset);
-            ASSERT(copy.dst_offset == zero_offset);
-            bgr_copy_pass.Execute(dst_image, src_image, copy);
-        }
-        break;
-    case 4: {
-        // BGRA8 copy
-        program_manager.BindComputeProgram(copy_bgra_program.handle);
-        constexpr GLenum FORMAT = GL_RGBA8;
-        for (const ImageCopy& copy : copies) {
-            ASSERT(copy.src_offset == zero_offset);
-            ASSERT(copy.dst_offset == zero_offset);
-            glBindImageTexture(BINDING_INPUT_IMAGE, src_image.StorageHandle(),
-                               copy.src_subresource.base_level, GL_FALSE, 0, GL_READ_ONLY, FORMAT);
-            glBindImageTexture(BINDING_OUTPUT_IMAGE, dst_image.StorageHandle(),
-                               copy.dst_subresource.base_level, GL_FALSE, 0, GL_WRITE_ONLY, FORMAT);
-            glDispatchCompute(copy.extent.width, copy.extent.height, copy.extent.depth);
-        }
-        program_manager.RestoreGuestCompute();
-        break;
-    }
-    default:
-        UNREACHABLE();
-        break;
-    }
-}
-
 GLenum StoreFormat(u32 bytes_per_block) {
     switch (bytes_per_block) {
     case 1:
@@ -309,36 +265,4 @@ GLenum StoreFormat(u32 bytes_per_block) {
     return GL_R8UI;
 }
 
-void Bgr565CopyPass::Execute(const Image& dst_image, const Image& src_image,
-                             const ImageCopy& copy) {
-    if (CopyBufferCreationNeeded(copy)) {
-        CreateNewCopyBuffer(copy, GL_TEXTURE_2D_ARRAY, GL_RGB565);
-    }
-    // Copy from source to PBO
-    glPixelStorei(GL_PACK_ALIGNMENT, 1);
-    glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width);
-    glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr16_pbo.handle);
-    glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
-                         copy.src_subresource.num_layers, GL_RGB, GL_UNSIGNED_SHORT_5_6_5,
-                         static_cast<GLsizei>(bgr16_pbo_size), nullptr);
-
-    // Copy from PBO to destination in reverse order
-    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
-    glPixelStorei(GL_UNPACK_ROW_LENGTH, copy.extent.width);
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bgr16_pbo.handle);
-    glTextureSubImage3D(dst_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
-                        copy.dst_subresource.num_layers, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV,
-                        nullptr);
-}
-
-bool Bgr565CopyPass::CopyBufferCreationNeeded(const ImageCopy& copy) {
-    return bgr16_pbo_size < NumPixelsInCopy(copy) * sizeof(u16);
-}
-
-void Bgr565CopyPass::CreateNewCopyBuffer(const ImageCopy& copy, GLenum target, GLuint format) {
-    bgr16_pbo.Create();
-    bgr16_pbo_size = NumPixelsInCopy(copy) * sizeof(u16);
-    glNamedBufferData(bgr16_pbo.handle, bgr16_pbo_size, nullptr, GL_STREAM_COPY);
-}
-
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index ef881e35f..5de95ea7a 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -19,22 +19,6 @@ class ProgramManager;
 
 struct ImageBufferMap;
 
-class Bgr565CopyPass {
-public:
-    Bgr565CopyPass() = default;
-    ~Bgr565CopyPass() = default;
-
-    void Execute(const Image& dst_image, const Image& src_image,
-                 const VideoCommon::ImageCopy& copy);
-
-private:
-    [[nodiscard]] bool CopyBufferCreationNeeded(const VideoCommon::ImageCopy& copy);
-    void CreateNewCopyBuffer(const VideoCommon::ImageCopy& copy, GLenum target, GLuint format);
-
-    OGLBuffer bgr16_pbo;
-    size_t bgr16_pbo_size{};
-};
-
 class UtilShaders {
 public:
     explicit UtilShaders(ProgramManager& program_manager);
@@ -55,9 +39,6 @@ public:
     void CopyBC4(Image& dst_image, Image& src_image,
                  std::span<const VideoCommon::ImageCopy> copies);
 
-    void CopyBGR(Image& dst_image, Image& src_image,
-                 std::span<const VideoCommon::ImageCopy> copies);
-
 private:
     ProgramManager& program_manager;
 
@@ -67,10 +48,7 @@ private:
     OGLProgram block_linear_unswizzle_2d_program;
     OGLProgram block_linear_unswizzle_3d_program;
     OGLProgram pitch_unswizzle_program;
-    OGLProgram copy_bgra_program;
     OGLProgram copy_bc4_program;
-
-    Bgr565CopyPass bgr_copy_pass;
 };
 
 GLenum StoreFormat(u32 bytes_per_block);
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index 7c9b0d6db..74822814d 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -97,19 +97,14 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
                                Core::Frontend::EmuWindow& emu_window,
                                Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
                                std::unique_ptr<Core::Frontend::GraphicsContext> context_) try
-    : RendererBase(emu_window, std::move(context_)),
-      telemetry_session(telemetry_session_),
-      cpu_memory(cpu_memory_),
-      gpu(gpu_),
-      library(OpenLibrary()),
+    : RendererBase(emu_window, std::move(context_)), telemetry_session(telemetry_session_),
+      cpu_memory(cpu_memory_), gpu(gpu_), library(OpenLibrary()),
       instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type,
                               true, Settings::values.renderer_debug.GetValue())),
       debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr),
       surface(CreateSurface(instance, render_window)),
-      device(CreateDevice(instance, dld, *surface)),
-      memory_allocator(device, false),
-      state_tracker(gpu),
-      scheduler(device, state_tracker),
+      device(CreateDevice(instance, dld, *surface)), memory_allocator(device, false),
+      state_tracker(gpu), scheduler(device, state_tracker),
       swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width,
                 render_window.GetFramebufferLayout().height, false),
       blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler,
@@ -149,7 +144,7 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
         const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout();
         swapchain.Create(layout.width, layout.height, is_srgb);
     };
-    if (swapchain.IsSubOptimal() || swapchain.HasColorSpaceChanged(is_srgb)) {
+    if (swapchain.NeedsRecreation(is_srgb)) {
         recreate_swapchain();
     }
     bool is_outdated;
@@ -164,7 +159,8 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
         blit_screen.Recreate();
     }
     const VkSemaphore render_semaphore = blit_screen.DrawToSwapchain(*framebuffer, use_accelerated);
-    scheduler.Flush(render_semaphore);
+    const VkSemaphore present_semaphore = swapchain.CurrentPresentSemaphore();
+    scheduler.Flush(render_semaphore, present_semaphore);
     scheduler.WaitWorker();
     swapchain.Present(render_semaphore);
 
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
index cb0580182..888bc7392 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -358,7 +358,7 @@ void VKBlitScreen::CreateDescriptorPool() {
 void VKBlitScreen::CreateRenderPass() {
     const VkAttachmentDescription color_attachment{
         .flags = 0,
-        .format = swapchain.GetImageFormat(),
+        .format = swapchain.GetImageViewFormat(),
         .samples = VK_SAMPLE_COUNT_1_BIT,
         .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
         .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
index 8e77e4796..d87da2a34 100644
--- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
 #include <mutex>
 #include <span>
 #include <vector>
@@ -18,7 +19,6 @@ namespace Vulkan {
 // Prefer small grow rates to avoid saturating the descriptor pool with barely used pipelines
 constexpr size_t SETS_GROW_RATE = 16;
 constexpr s32 SCORE_THRESHOLD = 3;
-constexpr u32 SETS_PER_POOL = 64;
 
 struct DescriptorBank {
     DescriptorBankInfo info;
@@ -58,11 +58,12 @@ static DescriptorBankInfo MakeBankInfo(std::span<const Shader::Info> infos) {
 static void AllocatePool(const Device& device, DescriptorBank& bank) {
     std::array<VkDescriptorPoolSize, 6> pool_sizes;
     size_t pool_cursor{};
+    const u32 sets_per_pool = device.GetSetsPerPool();
     const auto add = [&](VkDescriptorType type, u32 count) {
         if (count > 0) {
             pool_sizes[pool_cursor++] = {
                 .type = type,
-                .descriptorCount = count * SETS_PER_POOL,
+                .descriptorCount = count * sets_per_pool,
             };
         }
     };
@@ -77,7 +78,7 @@ static void AllocatePool(const Device& device, DescriptorBank& bank) {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
         .pNext = nullptr,
         .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
-        .maxSets = SETS_PER_POOL,
+        .maxSets = sets_per_pool,
         .poolSizeCount = static_cast<u32>(pool_cursor),
         .pPoolSizes = std::data(pool_sizes),
     }));
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 7c0f91007..11cd41ad7 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -507,8 +507,9 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
             vertex_attributes.push_back({
                 .location = static_cast<u32>(index),
                 .binding = 0,
-                .format = type == 1 ? VK_FORMAT_R32_SFLOAT
-                                    : type == 2 ? VK_FORMAT_R32_SINT : VK_FORMAT_R32_UINT,
+                .format = type == 1   ? VK_FORMAT_R32_SFLOAT
+                          : type == 2 ? VK_FORMAT_R32_SINT
+                                      : VK_FORMAT_R32_UINT,
                 .offset = 0,
             });
         }
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 31bfbcb06..eb8b4e08b 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -447,6 +447,8 @@ void PipelineCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading
     VideoCommon::LoadPipelines(stop_loading, pipeline_cache_filename, CACHE_VERSION, load_compute,
                                load_graphics);
 
+    LOG_INFO(Render_Vulkan, "Total Pipeline Count: {}", state.total);
+
     std::unique_lock lock{state.mutex};
     callback(VideoCore::LoadCallbackStage::Build, 0, state.total);
     state.has_loaded = true;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 3ac18ea54..3bcd6d6cc 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -228,9 +228,7 @@ void RasterizerVulkan::Clear() {
     };
 
     const u32 color_attachment = regs.clear_buffers.RT;
-    const auto attachment_aspect_mask = framebuffer->ImageRanges()[color_attachment].aspectMask;
-    const bool is_color_rt = (attachment_aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) != 0;
-    if (use_color && is_color_rt) {
+    if (use_color && framebuffer->HasAspectColorBit(color_attachment)) {
         VkClearValue clear_value;
         std::memcpy(clear_value.color.float32, regs.clear_color, sizeof(regs.clear_color));
 
@@ -248,12 +246,15 @@ void RasterizerVulkan::Clear() {
         return;
     }
     VkImageAspectFlags aspect_flags = 0;
-    if (use_depth) {
+    if (use_depth && framebuffer->HasAspectDepthBit()) {
         aspect_flags |= VK_IMAGE_ASPECT_DEPTH_BIT;
     }
-    if (use_stencil) {
+    if (use_stencil && framebuffer->HasAspectStencilBit()) {
         aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT;
     }
+    if (aspect_flags == 0) {
+        return;
+    }
     scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil,
                       clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) {
         VkClearAttachment attachment;
@@ -764,12 +765,7 @@ void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) {
     const Maxwell::StencilOp zpass = regs.stencil_front_op_zpass;
     const Maxwell::ComparisonOp compare = regs.stencil_front_func_func;
     if (regs.stencil_two_side_enable) {
-        scheduler.Record([fail, zfail, zpass, compare](vk::CommandBuffer cmdbuf) {
-            cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_AND_BACK, MaxwellToVK::StencilOp(fail),
-                                   MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail),
-                                   MaxwellToVK::ComparisonOp(compare));
-        });
-    } else {
+        // Separate stencil op per face
         const Maxwell::StencilOp back_fail = regs.stencil_back_op_fail;
         const Maxwell::StencilOp back_zfail = regs.stencil_back_op_zfail;
         const Maxwell::StencilOp back_zpass = regs.stencil_back_op_zpass;
@@ -784,6 +780,13 @@ void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) {
                                    MaxwellToVK::StencilOp(back_zfail),
                                    MaxwellToVK::ComparisonOp(back_compare));
         });
+    } else {
+        // Front face defines the stencil op of both faces
+        scheduler.Record([fail, zfail, zpass, compare](vk::CommandBuffer cmdbuf) {
+            cmdbuf.SetStencilOpEXT(VK_STENCIL_FACE_FRONT_AND_BACK, MaxwellToVK::StencilOp(fail),
+                                   MaxwellToVK::StencilOp(zpass), MaxwellToVK::StencilOp(zfail),
+                                   MaxwellToVK::ComparisonOp(compare));
+        });
     }
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 4840962de..0c11c814f 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -43,26 +43,19 @@ VKScheduler::VKScheduler(const Device& device_, StateTracker& state_tracker_)
       command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
     AcquireNewChunk();
     AllocateWorkerCommandBuffer();
-    worker_thread = std::thread(&VKScheduler::WorkerThread, this);
+    worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });
 }
 
-VKScheduler::~VKScheduler() {
-    {
-        std::lock_guard lock{work_mutex};
-        quit = true;
-    }
-    work_cv.notify_all();
-    worker_thread.join();
-}
+VKScheduler::~VKScheduler() = default;
 
-void VKScheduler::Flush(VkSemaphore semaphore) {
-    SubmitExecution(semaphore);
+void VKScheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
+    SubmitExecution(signal_semaphore, wait_semaphore);
     AllocateNewContext();
 }
 
-void VKScheduler::Finish(VkSemaphore semaphore) {
+void VKScheduler::Finish(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
     const u64 presubmit_tick = CurrentTick();
-    SubmitExecution(semaphore);
+    SubmitExecution(signal_semaphore, wait_semaphore);
     WaitWorker();
     Wait(presubmit_tick);
     AllocateNewContext();
@@ -135,7 +128,7 @@ bool VKScheduler::UpdateGraphicsPipeline(GraphicsPipeline* pipeline) {
     return true;
 }
 
-void VKScheduler::WorkerThread() {
+void VKScheduler::WorkerThread(std::stop_token stop_token) {
     Common::SetCurrentThreadName("yuzu:VulkanWorker");
     do {
         if (work_queue.empty()) {
@@ -144,8 +137,8 @@ void VKScheduler::WorkerThread() {
         std::unique_ptr<CommandChunk> work;
         {
             std::unique_lock lock{work_mutex};
-            work_cv.wait(lock, [this] { return !work_queue.empty() || quit; });
-            if (quit) {
+            work_cv.wait(lock, stop_token, [this] { return !work_queue.empty(); });
+            if (stop_token.stop_requested()) {
                 continue;
             }
             work = std::move(work_queue.front());
@@ -158,7 +151,7 @@ void VKScheduler::WorkerThread() {
         }
         std::lock_guard reserve_lock{reserve_mutex};
         chunk_reserve.push_back(std::move(work));
-    } while (!quit);
+    } while (!stop_token.stop_requested());
 }
 
 void VKScheduler::AllocateWorkerCommandBuffer() {
@@ -171,37 +164,41 @@ void VKScheduler::AllocateWorkerCommandBuffer() {
     });
 }
 
-void VKScheduler::SubmitExecution(VkSemaphore semaphore) {
+void VKScheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
     EndPendingOperations();
     InvalidateState();
 
     const u64 signal_value = master_semaphore->NextTick();
-    Record([semaphore, signal_value, this](vk::CommandBuffer cmdbuf) {
+    Record([signal_semaphore, wait_semaphore, signal_value, this](vk::CommandBuffer cmdbuf) {
         cmdbuf.End();
-
-        const u32 num_signal_semaphores = semaphore ? 2U : 1U;
-
-        const u64 wait_value = signal_value - 1;
-        const VkPipelineStageFlags wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
-
         const VkSemaphore timeline_semaphore = master_semaphore->Handle();
+
+        const u32 num_signal_semaphores = signal_semaphore ? 2U : 1U;
         const std::array signal_values{signal_value, u64(0)};
-        const std::array signal_semaphores{timeline_semaphore, semaphore};
+        const std::array signal_semaphores{timeline_semaphore, signal_semaphore};
+
+        const u32 num_wait_semaphores = wait_semaphore ? 2U : 1U;
+        const std::array wait_values{signal_value - 1, u64(1)};
+        const std::array wait_semaphores{timeline_semaphore, wait_semaphore};
+        static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{
+            VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+            VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+        };
 
         const VkTimelineSemaphoreSubmitInfoKHR timeline_si{
             .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
             .pNext = nullptr,
-            .waitSemaphoreValueCount = 1,
-            .pWaitSemaphoreValues = &wait_value,
+            .waitSemaphoreValueCount = num_wait_semaphores,
+            .pWaitSemaphoreValues = wait_values.data(),
             .signalSemaphoreValueCount = num_signal_semaphores,
             .pSignalSemaphoreValues = signal_values.data(),
         };
         const VkSubmitInfo submit_info{
             .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
             .pNext = &timeline_si,
-            .waitSemaphoreCount = 1,
-            .pWaitSemaphores = &timeline_semaphore,
-            .pWaitDstStageMask = &wait_stage_mask,
+            .waitSemaphoreCount = num_wait_semaphores,
+            .pWaitSemaphores = wait_semaphores.data(),
+            .pWaitDstStageMask = wait_stage_masks.data(),
             .commandBufferCount = 1,
             .pCommandBuffers = cmdbuf.address(),
             .signalSemaphoreCount = num_signal_semaphores,
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index cf39a2363..85fc1712f 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -34,10 +34,10 @@ public:
     ~VKScheduler();
 
     /// Sends the current execution context to the GPU.
-    void Flush(VkSemaphore semaphore = nullptr);
+    void Flush(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr);
 
     /// Sends the current execution context to the GPU and waits for it to complete.
-    void Finish(VkSemaphore semaphore = nullptr);
+    void Finish(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr);
 
     /// Waits for the worker thread to finish executing everything. After this function returns it's
     /// safe to touch worker resources.
@@ -187,11 +187,11 @@ private:
         GraphicsPipeline* graphics_pipeline = nullptr;
     };
 
-    void WorkerThread();
+    void WorkerThread(std::stop_token stop_token);
 
     void AllocateWorkerCommandBuffer();
 
-    void SubmitExecution(VkSemaphore semaphore);
+    void SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore);
 
     void AllocateNewContext();
 
@@ -212,7 +212,6 @@ private:
     vk::CommandBuffer current_cmdbuf;
 
     std::unique_ptr<CommandChunk> chunk;
-    std::thread worker_thread;
 
     State state;
 
@@ -224,9 +223,9 @@ private:
     std::vector<std::unique_ptr<CommandChunk>> chunk_reserve;
     std::mutex reserve_mutex;
     std::mutex work_mutex;
-    std::condition_variable work_cv;
+    std::condition_variable_any work_cv;
     std::condition_variable wait_cv;
-    std::atomic_bool quit{};
+    std::jthread worker_thread;
 };
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h
index 5f78f6950..d90935f52 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.h
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.h
@@ -110,10 +110,6 @@ public:
         return Exchange(Dirty::DepthTestEnable, false);
     }
 
-    bool TouchDepthBoundsEnable() {
-        return Exchange(Dirty::DepthBoundsEnable, false);
-    }
-
     bool TouchDepthWriteEnable() {
         return Exchange(Dirty::DepthWriteEnable, false);
     }
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp
index d990eefba..8972a6921 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -9,6 +9,7 @@
 
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "core/core.h"
 #include "core/frontend/framebuffer_layout.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -20,16 +21,15 @@ namespace Vulkan {
 
 namespace {
 
-VkSurfaceFormatKHR ChooseSwapSurfaceFormat(vk::Span<VkSurfaceFormatKHR> formats, bool srgb) {
+VkSurfaceFormatKHR ChooseSwapSurfaceFormat(vk::Span<VkSurfaceFormatKHR> formats) {
     if (formats.size() == 1 && formats[0].format == VK_FORMAT_UNDEFINED) {
         VkSurfaceFormatKHR format;
         format.format = VK_FORMAT_B8G8R8A8_UNORM;
         format.colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR;
         return format;
     }
-    const auto& found = std::find_if(formats.begin(), formats.end(), [srgb](const auto& format) {
-        const auto request_format = srgb ? VK_FORMAT_B8G8R8A8_SRGB : VK_FORMAT_B8G8R8A8_UNORM;
-        return format.format == request_format &&
+    const auto& found = std::find_if(formats.begin(), formats.end(), [](const auto& format) {
+        return format.format == VK_FORMAT_B8G8R8A8_UNORM &&
                format.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR;
     });
     return found != formats.end() ? *found : formats[0];
@@ -37,8 +37,19 @@ VkSurfaceFormatKHR ChooseSwapSurfaceFormat(vk::Span<VkSurfaceFormatKHR> formats,
 
 VkPresentModeKHR ChooseSwapPresentMode(vk::Span<VkPresentModeKHR> modes) {
     // Mailbox doesn't lock the application like fifo (vsync), prefer it
-    const auto found = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_MAILBOX_KHR);
-    return found != modes.end() ? *found : VK_PRESENT_MODE_FIFO_KHR;
+    const auto found_mailbox = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_MAILBOX_KHR);
+    if (found_mailbox != modes.end()) {
+        return VK_PRESENT_MODE_MAILBOX_KHR;
+    }
+    if (Settings::values.disable_fps_limit.GetValue()) {
+        // FIFO present mode locks the framerate to the monitor's refresh rate.
+        // Find an alternative to surpass this limitation if FPS is unlocked.
+        const auto found_imm = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_IMMEDIATE_KHR);
+        if (found_imm != modes.end()) {
+            return VK_PRESENT_MODE_IMMEDIATE_KHR;
+        }
+    }
+    return VK_PRESENT_MODE_FIFO_KHR;
 }
 
 VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 width, u32 height) {
@@ -107,14 +118,12 @@ void VKSwapchain::AcquireNextImage() {
 }
 
 void VKSwapchain::Present(VkSemaphore render_semaphore) {
-    const VkSemaphore present_semaphore{*present_semaphores[frame_index]};
-    const std::array<VkSemaphore, 2> semaphores{present_semaphore, render_semaphore};
     const auto present_queue{device.GetPresentQueue()};
     const VkPresentInfoKHR present_info{
         .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
         .pNext = nullptr,
-        .waitSemaphoreCount = render_semaphore ? 2U : 1U,
-        .pWaitSemaphores = semaphores.data(),
+        .waitSemaphoreCount = render_semaphore ? 1U : 0U,
+        .pWaitSemaphores = &render_semaphore,
         .swapchainCount = 1,
         .pSwapchains = swapchain.address(),
         .pImageIndices = &image_index,
@@ -145,8 +154,8 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities,
     const auto formats{physical_device.GetSurfaceFormatsKHR(surface)};
     const auto present_modes{physical_device.GetSurfacePresentModesKHR(surface)};
 
-    const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats, srgb)};
-    const VkPresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)};
+    const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)};
+    present_mode = ChooseSwapPresentMode(present_modes);
 
     u32 requested_image_count{capabilities.minImageCount + 1};
     if (capabilities.maxImageCount > 0 && requested_image_count > capabilities.maxImageCount) {
@@ -180,6 +189,17 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities,
         swapchain_ci.queueFamilyIndexCount = static_cast<u32>(queue_indices.size());
         swapchain_ci.pQueueFamilyIndices = queue_indices.data();
     }
+    static constexpr std::array view_formats{VK_FORMAT_B8G8R8A8_UNORM, VK_FORMAT_B8G8R8A8_SRGB};
+    VkImageFormatListCreateInfo format_list{
+        .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO_KHR,
+        .pNext = nullptr,
+        .viewFormatCount = static_cast<u32>(view_formats.size()),
+        .pViewFormats = view_formats.data(),
+    };
+    if (device.IsKhrSwapchainMutableFormatEnabled()) {
+        format_list.pNext = std::exchange(swapchain_ci.pNext, &format_list);
+        swapchain_ci.flags |= VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR;
+    }
     // Request the size again to reduce the possibility of a TOCTOU race condition.
     const auto updated_capabilities = physical_device.GetSurfaceCapabilitiesKHR(surface);
     swapchain_ci.imageExtent = ChooseSwapExtent(updated_capabilities, width, height);
@@ -188,10 +208,11 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities,
 
     extent = swapchain_ci.imageExtent;
     current_srgb = srgb;
+    current_fps_unlocked = Settings::values.disable_fps_limit.GetValue();
 
     images = swapchain.GetImages();
     image_count = static_cast<u32>(images.size());
-    image_format = surface_format.format;
+    image_view_format = srgb ? VK_FORMAT_B8G8R8A8_SRGB : VK_FORMAT_B8G8R8A8_UNORM;
 }
 
 void VKSwapchain::CreateSemaphores() {
@@ -207,7 +228,7 @@ void VKSwapchain::CreateImageViews() {
         .flags = 0,
         .image = {},
         .viewType = VK_IMAGE_VIEW_TYPE_2D,
-        .format = image_format,
+        .format = image_view_format,
         .components =
             {
                 .r = VK_COMPONENT_SWIZZLE_IDENTITY,
@@ -240,4 +261,14 @@ void VKSwapchain::Destroy() {
     swapchain.reset();
 }
 
+bool VKSwapchain::HasFpsUnlockChanged() const {
+    return current_fps_unlocked != Settings::values.disable_fps_limit.GetValue();
+}
+
+bool VKSwapchain::NeedsPresentModeUpdate() const {
+    // Mailbox present mode is ideal for all scenarios. If it is not available,
+    // a different present mode is needed to support unlocked FPS above the monitor's refresh rate.
+    return present_mode != VK_PRESENT_MODE_MAILBOX_KHR && HasFpsUnlockChanged();
+}
+
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h
index 35c2cdc14..61a6d959e 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.h
+++ b/src/video_core/renderer_vulkan/vk_swapchain.h
@@ -33,6 +33,11 @@ public:
     /// Presents the rendered image to the swapchain.
     void Present(VkSemaphore render_semaphore);
 
+    /// Returns true when the swapchain needs to be recreated.
+    bool NeedsRecreation(bool is_srgb) const {
+        return HasColorSpaceChanged(is_srgb) || IsSubOptimal() || NeedsPresentModeUpdate();
+    }
+
     /// Returns true when the color space has changed.
     bool HasColorSpaceChanged(bool is_srgb) const {
         return current_srgb != is_srgb;
@@ -68,8 +73,12 @@ public:
         return *image_views[index];
     }
 
-    VkFormat GetImageFormat() const {
-        return image_format;
+    VkFormat GetImageViewFormat() const {
+        return image_view_format;
+    }
+
+    VkSemaphore CurrentPresentSemaphore() const {
+        return *present_semaphores[frame_index];
     }
 
 private:
@@ -80,6 +89,10 @@ private:
 
     void Destroy();
 
+    bool HasFpsUnlockChanged() const;
+
+    bool NeedsPresentModeUpdate() const;
+
     const VkSurfaceKHR surface;
     const Device& device;
     VKScheduler& scheduler;
@@ -96,10 +109,12 @@ private:
     u32 image_index{};
     u32 frame_index{};
 
-    VkFormat image_format{};
+    VkFormat image_view_format{};
     VkExtent2D extent{};
+    VkPresentModeKHR present_mode{};
 
     bool current_srgb{};
+    bool current_fps_unlocked{};
     bool is_outdated{};
     bool is_suboptimal{};
 };
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 8f4df7122..3b87640b5 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -127,7 +127,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
     const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format);
     VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
     if (info.type == ImageType::e2D && info.resources.layers >= 6 &&
-        info.size.width == info.size.height) {
+        info.size.width == info.size.height && !device.HasBrokenCubeImageCompability()) {
         flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
     }
     if (info.type == ImageType::e3D) {
@@ -1186,9 +1186,12 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
         renderpass_key.depth_format = depth_buffer->format;
         num_layers = std::max(num_layers, depth_buffer->range.extent.layers);
         images[num_images] = depth_buffer->ImageHandle();
-        image_ranges[num_images] = MakeSubresourceRange(depth_buffer);
+        const VkImageSubresourceRange subresource_range = MakeSubresourceRange(depth_buffer);
+        image_ranges[num_images] = subresource_range;
         samples = depth_buffer->Samples();
         ++num_images;
+        has_depth = (subresource_range.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) != 0;
+        has_stencil = (subresource_range.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) != 0;
     } else {
         renderpass_key.depth_format = PixelFormat::Invalid;
     }
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 5fe6b7ba3..6d5a68bfe 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -232,6 +232,18 @@ public:
         return image_ranges;
     }
 
+    [[nodiscard]] bool HasAspectColorBit(size_t index) const noexcept {
+        return (image_ranges.at(index).aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) != 0;
+    }
+
+    [[nodiscard]] bool HasAspectDepthBit() const noexcept {
+        return has_depth;
+    }
+
+    [[nodiscard]] bool HasAspectStencilBit() const noexcept {
+        return has_stencil;
+    }
+
 private:
     vk::Framebuffer framebuffer;
     VkRenderPass renderpass{};
@@ -241,6 +253,8 @@ private:
     u32 num_images = 0;
     std::array<VkImage, 9> images{};
     std::array<VkImageSubresourceRange, 9> image_ranges{};
+    bool has_depth{};
+    bool has_stencil{};
 };
 
 struct TextureCacheParams {
diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp
index 8a4581c19..81a878bb2 100644
--- a/src/video_core/shader_environment.cpp
+++ b/src/video_core/shader_environment.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
 #include <filesystem>
 #include <fstream>
 #include <memory>
diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h
index 6180b8c0e..50df06409 100644
--- a/src/video_core/texture_cache/slot_vector.h
+++ b/src/video_core/texture_cache/slot_vector.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <algorithm>
 #include <array>
 #include <bit>
 #include <concepts>
@@ -30,8 +31,8 @@ struct SlotId {
 };
 
 template <class T>
-requires std::is_nothrow_move_assignable_v<T>&&
-    std::is_nothrow_move_constructible_v<T> class SlotVector {
+requires std::is_nothrow_move_assignable_v<T> && std::is_nothrow_move_constructible_v<T>
+class SlotVector {
 public:
     class Iterator {
         friend SlotVector<T>;
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index 3b575db4d..cae543a51 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -37,7 +37,8 @@ std::unique_ptr<VideoCore::RendererBase> CreateRenderer(
 namespace VideoCore {
 
 std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) {
-    const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue();
+    const auto nvdec_value = Settings::values.nvdec_emulation.GetValue();
+    const bool use_nvdec = nvdec_value != Settings::NvdecEmulation::Off;
     const bool use_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
     auto gpu = std::make_unique<Tegra::GPU>(system, use_async, use_nvdec);
     auto context = emu_window.CreateSharedContext();
diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp
index 0f60765bb..cf94e1d39 100644
--- a/src/video_core/vulkan_common/vulkan_debug_callback.cpp
+++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp
@@ -16,6 +16,7 @@ VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity,
     switch (static_cast<u32>(data->messageIdNumber)) {
     case 0x682a878au: // VUID-vkCmdBindVertexBuffers2EXT-pBuffers-parameter
     case 0x99fb7dfdu: // UNASSIGNED-RequiredParameter (vkCmdBindVertexBuffers2EXT pBuffers[0])
+    case 0xe8616bf2u: // Bound VkDescriptorSet 0x0[] was destroyed. Likely push_descriptor related
         return VK_FALSE;
     default:
         break;
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 86ca4be54..6388ed2eb 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -368,8 +368,9 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
     };
     SetNext(next, demote);
 
+    VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8;
     if (is_int8_supported || is_float16_supported) {
-        VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8{
+        float16_int8 = {
             .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR,
             .pNext = nullptr,
             .shaderFloat16 = is_float16_supported,
@@ -587,6 +588,31 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             ext_extended_dynamic_state = false;
         }
     }
+    sets_per_pool = 64;
+
+    const bool is_amd =
+        driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
+    if (is_amd) {
+        // AMD drivers need a higher amount of Sets per Pool in certain circumstances like in XC2.
+        sets_per_pool = 96;
+        // Disable VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT on AMD GCN4 and lower as it is broken.
+        if (!is_float16_supported) {
+            LOG_WARNING(
+                Render_Vulkan,
+                "AMD GCN4 and earlier do not properly support VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT");
+            has_broken_cube_compatibility = true;
+        }
+    }
+    const bool is_amd_or_radv = is_amd || driver_id == VK_DRIVER_ID_MESA_RADV;
+    if (ext_sampler_filter_minmax && is_amd_or_radv) {
+        // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken.
+        if (!is_float16_supported) {
+            LOG_WARNING(Render_Vulkan,
+                        "Blacklisting AMD GCN4 and earlier for VK_EXT_sampler_filter_minmax");
+            ext_sampler_filter_minmax = false;
+        }
+    }
+
     if (ext_vertex_input_dynamic_state && driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) {
         LOG_WARNING(Render_Vulkan, "Blacklisting Intel for VK_EXT_vertex_input_dynamic_state");
         ext_vertex_input_dynamic_state = false;
@@ -839,6 +865,8 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
     bool has_khr_shader_float16_int8{};
     bool has_khr_workgroup_memory_explicit_layout{};
     bool has_khr_pipeline_executable_properties{};
+    bool has_khr_image_format_list{};
+    bool has_khr_swapchain_mutable_format{};
     bool has_ext_subgroup_size_control{};
     bool has_ext_transform_feedback{};
     bool has_ext_custom_border_color{};
@@ -888,6 +916,9 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
         test(has_ext_shader_atomic_int64, VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME, false);
         test(has_khr_workgroup_memory_explicit_layout,
              VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME, false);
+        test(has_khr_image_format_list, VK_KHR_IMAGE_FORMAT_LIST_EXTENSION_NAME, false);
+        test(has_khr_swapchain_mutable_format, VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME,
+             false);
         test(has_ext_line_rasterization, VK_EXT_LINE_RASTERIZATION_EXTENSION_NAME, false);
         if (Settings::values.enable_nsight_aftermath) {
             test(nv_device_diagnostics_config, VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME,
@@ -1066,6 +1097,11 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
             khr_pipeline_executable_properties = true;
         }
     }
+    if (has_khr_image_format_list && has_khr_swapchain_mutable_format) {
+        extensions.push_back(VK_KHR_IMAGE_FORMAT_LIST_EXTENSION_NAME);
+        extensions.push_back(VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME);
+        khr_swapchain_mutable_format = true;
+    }
     if (khr_push_descriptor) {
         VkPhysicalDevicePushDescriptorPropertiesKHR push_descriptor;
         push_descriptor.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR;
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 234d74129..d9e74f1aa 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -224,6 +224,11 @@ public:
         return khr_pipeline_executable_properties;
     }
 
+    /// Returns true if VK_KHR_swapchain_mutable_format is enabled.
+    bool IsKhrSwapchainMutableFormatEnabled() const {
+        return khr_swapchain_mutable_format;
+    }
+
     /// Returns true if the device supports VK_KHR_workgroup_memory_explicit_layout.
     bool IsKhrWorkgroupMemoryExplicitLayoutSupported() const {
         return khr_workgroup_memory_explicit_layout;
@@ -304,6 +309,11 @@ public:
         return has_renderdoc || has_nsight_graphics;
     }
 
+    /// Returns true when the device does not properly support cube compatibility.
+    bool HasBrokenCubeImageCompability() const {
+        return has_broken_cube_compatibility;
+    }
+
     /// Returns the vendor name reported from Vulkan.
     std::string_view GetVendorName() const {
         return vendor_name;
@@ -318,6 +328,10 @@ public:
         return device_access_memory;
     }
 
+    u32 GetSetsPerPool() const {
+        return sets_per_pool;
+    }
+
 private:
     /// Checks if the physical device is suitable.
     void CheckSuitability(bool requires_swapchain) const;
@@ -371,6 +385,7 @@ private:
     VkShaderStageFlags guest_warp_stages{};     ///< Stages where the guest warp size can be forced.
     u64 device_access_memory{};                 ///< Total size of device local memory in bytes.
     u32 max_push_descriptors{};                 ///< Maximum number of push descriptors
+    u32 sets_per_pool{};                        ///< Sets per Descriptor Pool
     bool is_optimal_astc_supported{};           ///< Support for native ASTC.
     bool is_float16_supported{};                ///< Support for float16 arithmetic.
     bool is_int8_supported{};                   ///< Support for int8 arithmetic.
@@ -390,6 +405,7 @@ private:
     bool khr_workgroup_memory_explicit_layout{}; ///< Support for explicit workgroup layouts.
     bool khr_push_descriptor{};                  ///< Support for VK_KHR_push_descritor.
     bool khr_pipeline_executable_properties{};   ///< Support for executable properties.
+    bool khr_swapchain_mutable_format{};         ///< Support for VK_KHR_swapchain_mutable_format.
     bool ext_index_type_uint8{};                 ///< Support for VK_EXT_index_type_uint8.
     bool ext_sampler_filter_minmax{};            ///< Support for VK_EXT_sampler_filter_minmax.
     bool ext_depth_range_unrestricted{};         ///< Support for VK_EXT_depth_range_unrestricted.
@@ -406,6 +422,7 @@ private:
     bool ext_conservative_rasterization{};  ///< Support for VK_EXT_conservative_rasterization.
     bool ext_provoking_vertex{};            ///< Support for VK_EXT_provoking_vertex.
     bool nv_device_diagnostics_config{};    ///< Support for VK_NV_device_diagnostics_config.
+    bool has_broken_cube_compatibility{};   ///< Has broken cube compatibility bit
     bool has_renderdoc{};                   ///< Has RenderDoc attached
     bool has_nsight_graphics{};             ///< Has Nsight Graphics attached