Diffstat (limited to 'src/video_core')
46 files changed, 2308 insertions, 278 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 47190c464..e31eb30c0 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -237,6 +237,7 @@ add_library(video_core STATIC texture_cache/util.cpp texture_cache/util.h textures/astc.h + textures/astc.cpp textures/decoders.cpp textures/decoders.h textures/texture.cpp @@ -292,6 +293,7 @@ endif() if (MSVC) target_compile_options(video_core PRIVATE /we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data + /we4244 # 'var' : conversion from integer to 'type', possible loss of data /we4456 # Declaration of 'identifier' hides previous local declaration /we4457 # Declaration of 'identifier' hides function parameter /we4458 # Declaration of 'identifier' hides class member diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index a39505903..b121d36a3 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -256,6 +256,16 @@ public: stream_score += score; } + /// Sets the new frame tick + void SetFrameTick(u64 new_frame_tick) noexcept { + frame_tick = new_frame_tick; + } + + /// Returns the new frame tick + [[nodiscard]] u64 FrameTick() const noexcept { + return frame_tick; + } + /// Returns the likeliness of this being a stream buffer [[nodiscard]] int StreamScore() const noexcept { return stream_score; @@ -586,6 +596,7 @@ private: RasterizerInterface* rasterizer = nullptr; VAddr cpu_addr = 0; Words words; + u64 frame_tick = 0; BufferFlagBits flags{}; int stream_score = 0; }; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d371b842f..cad7f902d 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -18,6 +18,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" +#include "common/literals.h" #include "common/microprofile.h" #include "common/scope_exit.h" #include "common/settings.h" @@ -47,8 +48,11 @@ constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; constexpr u32 NUM_STORAGE_BUFFERS = 16; constexpr u32 NUM_STAGES = 5; +using namespace Common::Literals; + template <typename P> class BufferCache { + // Page size for caching purposes. // This is unrelated to the CPU page size and it can be changed as it seems optimal. 
static constexpr u32 PAGE_BITS = 16; @@ -65,6 +69,9 @@ class BufferCache { static constexpr BufferId NULL_BUFFER_ID{0}; + static constexpr u64 EXPECTED_MEMORY = 512_MiB; + static constexpr u64 CRITICAL_MEMORY = 1_GiB; + using Maxwell = Tegra::Engines::Maxwell3D::Regs; using Runtime = typename P::Runtime; @@ -92,7 +99,7 @@ class BufferCache { }; public: - static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = 4096; + static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB); explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_, @@ -188,6 +195,8 @@ private: ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); } + void RunGarbageCollector(); + void BindHostIndexBuffer(); void BindHostVertexBuffers(); @@ -243,6 +252,8 @@ private: template <bool insert> void ChangeRegister(BufferId buffer_id); + void TouchBuffer(Buffer& buffer) const noexcept; + bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); @@ -255,6 +266,10 @@ private: void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); + void DownloadBufferMemory(Buffer& buffer_id); + + void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); + void DeleteBuffer(BufferId buffer_id); void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); @@ -319,6 +334,10 @@ private: size_t immediate_buffer_capacity = 0; std::unique_ptr<u8[]> immediate_buffer_alloc; + typename SlotVector<Buffer>::Iterator deletion_iterator; + u64 frame_tick = 0; + u64 total_used_memory = 0; + std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table; }; @@ -332,6 +351,28 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); + deletion_iterator = slot_buffers.end(); +} + +template <class P> +void BufferCache<P>::RunGarbageCollector() { + const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY; + const u64 ticks_to_destroy = aggressive_gc ? 60 : 120; + int num_iterations = aggressive_gc ? 64 : 32; + for (; num_iterations > 0; --num_iterations) { + if (deletion_iterator == slot_buffers.end()) { + deletion_iterator = slot_buffers.begin(); + } + ++deletion_iterator; + if (deletion_iterator == slot_buffers.end()) { + break; + } + const auto [buffer_id, buffer] = *deletion_iterator; + if (buffer->FrameTick() + ticks_to_destroy < frame_tick) { + DownloadBufferMemory(*buffer); + DeleteBuffer(buffer_id); + } + } } template <class P> @@ -349,6 +390,10 @@ void BufferCache<P>::TickFrame() { const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? 
DEFAULT_SKIP_CACHE_SIZE : 0; + if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) { + RunGarbageCollector(); + } + ++frame_tick; delayed_destruction_ring.Tick(); } @@ -372,48 +417,7 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { template <class P> void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { - boost::container::small_vector<BufferCopy, 1> copies; - u64 total_size_bytes = 0; - u64 largest_copy = 0; - buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { - copies.push_back(BufferCopy{ - .src_offset = range_offset, - .dst_offset = total_size_bytes, - .size = range_size, - }); - total_size_bytes += range_size; - largest_copy = std::max(largest_copy, range_size); - }); - if (total_size_bytes == 0) { - return; - } - MICROPROFILE_SCOPE(GPU_DownloadMemory); - - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - const u8* const mapped_memory = download_staging.mapped_span.data(); - const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); - for (BufferCopy& copy : copies) { - // Modify copies to have the staging offset in mind - copy.dst_offset += download_staging.offset; - } - runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); - runtime.Finish(); - for (const BufferCopy& copy : copies) { - const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* copy_mapped_memory = mapped_memory + dst_offset; - cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); - } - } else { - const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); - for (const BufferCopy& copy : copies) { - buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); - const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; - cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); - } - } + DownloadBufferMemory(buffer, cpu_addr, size); }); } @@ -640,6 +644,7 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { template <class P> void BufferCache<P>::BindHostIndexBuffer() { Buffer& buffer = slot_buffers[index_buffer.buffer_id]; + TouchBuffer(buffer); const u32 offset = buffer.Offset(index_buffer.cpu_addr); const u32 size = index_buffer.size; SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); @@ -658,6 +663,7 @@ void BufferCache<P>::BindHostVertexBuffers() { for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { const Binding& binding = vertex_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); if (!flags[Dirty::VertexBuffer0 + index]) { continue; @@ -693,6 +699,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 const VAddr cpu_addr = binding.cpu_addr; const u32 size = binding.size; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && !buffer.IsRegionGpuModified(cpu_addr, size); @@ -744,6 +751,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { const Binding& binding = storage_buffers[stage][index]; 
Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -766,6 +774,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() { for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { const Binding& binding = transform_feedback_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -784,6 +793,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() { ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { const Binding& binding = compute_uniform_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -803,6 +813,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() { ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { const Binding& binding = compute_storage_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -1101,6 +1112,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); const u32 size = static_cast<u32>(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); + TouchBuffer(slot_buffers[new_buffer_id]); for (const BufferId overlap_id : overlap.ids) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } @@ -1122,8 +1134,14 @@ template <class P> template <bool insert> void BufferCache<P>::ChangeRegister(BufferId buffer_id) { const Buffer& buffer = slot_buffers[buffer_id]; + const auto size = buffer.SizeBytes(); + if (insert) { + total_used_memory += Common::AlignUp(size, 1024); + } else { + total_used_memory -= Common::AlignUp(size, 1024); + } const VAddr cpu_addr_begin = buffer.CpuAddr(); - const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); + const VAddr cpu_addr_end = cpu_addr_begin + size; const u64 page_begin = cpu_addr_begin / PAGE_SIZE; const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); for (u64 page = page_begin; page != page_end; ++page) { @@ -1136,6 +1154,11 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { } template <class P> +void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept { + buffer.SetFrameTick(frame_tick); +} + +template <class P> bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { if (buffer.CpuAddr() == 0) { return true; @@ -1212,6 +1235,57 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, } template <class P> +void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) { + DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes()); +} + +template <class P> +void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 size) { + boost::container::small_vector<BufferCopy, 1> copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + 
return; + } + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + const u8* const mapped_memory = download_staging.mapped_span.data(); + const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); + for (BufferCopy& copy : copies) { + // Modify copies to have the staging offset in mind + copy.dst_offset += download_staging.offset; + } + runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); + runtime.Finish(); + for (const BufferCopy& copy : copies) { + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* copy_mapped_memory = mapped_memory + dst_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); + } + } else { + const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); + for (const BufferCopy& copy : copies) { + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); + } + } +} + +template <class P> void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { const auto scalar_replace = [buffer_id](Binding& binding) { if (binding.buffer_id == buffer_id) { @@ -1236,6 +1310,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { Unregister(buffer_id); delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); + slot_buffers.erase(buffer_id); NotifyBufferDeletion(); } diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h index 8a2a6c360..3e135a2a6 100644 --- a/src/video_core/command_classes/codecs/codec.h +++ b/src/video_core/command_classes/codecs/codec.h @@ -14,10 +14,18 @@ extern "C" { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #endif +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4242) // conversion from 'type' to 'type', possible loss of data +#pragma warning(disable : 4244) // conversion from 'type' to 'type', possible loss of data +#endif #include <libavcodec/avcodec.h> #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif } namespace Tegra { diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index 0a8b82f2b..5faf8c0f1 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp @@ -3,7 +3,28 @@ // Refer to the license.txt file included. 
#include <array> + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#ifdef _MSC_VER +#pragma warning(disable : 4244) // conversion from 'type' to 'type', possible loss of data +#pragma warning(push) +#endif +#include <libswscale/swscale.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif +} + #include "common/assert.h" +#include "common/logging/log.h" + #include "video_core/command_classes/nvdec.h" #include "video_core/command_classes/vic.h" #include "video_core/engines/maxwell_3d.h" @@ -11,10 +32,6 @@ #include "video_core/memory_manager.h" #include "video_core/textures/decoders.h" -extern "C" { -#include <libswscale/swscale.h> -} - namespace Tegra { Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index ffed42a29..335383955 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -242,6 +242,7 @@ public: return 4; default: UNREACHABLE(); + return 1; } } diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 703e34587..c37f15bfd 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -11,12 +11,8 @@ #define UNIFORM(n) #define BINDING_INPUT_BUFFER 0 #define BINDING_ENC_BUFFER 1 -#define BINDING_6_TO_8_BUFFER 2 -#define BINDING_7_TO_8_BUFFER 3 -#define BINDING_8_TO_8_BUFFER 4 -#define BINDING_BYTE_TO_16_BUFFER 5 -#define BINDING_SWIZZLE_BUFFER 6 -#define BINDING_OUTPUT_IMAGE 7 +#define BINDING_SWIZZLE_BUFFER 2 +#define BINDING_OUTPUT_IMAGE 3 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv @@ -26,10 +22,6 @@ #define BINDING_SWIZZLE_BUFFER 0 #define BINDING_INPUT_BUFFER 1 #define BINDING_ENC_BUFFER 2 -#define BINDING_6_TO_8_BUFFER 3 -#define BINDING_7_TO_8_BUFFER 4 -#define BINDING_8_TO_8_BUFFER 5 -#define BINDING_BYTE_TO_16_BUFFER 6 #define BINDING_OUTPUT_IMAGE 0 #endif @@ -76,19 +68,6 @@ layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { EncodingData encoding_values[]; }; -// ASTC Precompiled tables -layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 { - uint REPLICATE_6_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 { - uint REPLICATE_7_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 { - uint REPLICATE_8_BIT_TO_8_TABLE[]; -}; -layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 { - uint REPLICATE_BYTE_TO_16_TABLE[]; -}; layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; @@ -139,6 +118,19 @@ const uint REPLICATE_4_BIT_TO_6_TABLE[16] = const uint REPLICATE_5_BIT_TO_6_TABLE[32] = uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63); +const uint REPLICATE_6_BIT_TO_8_TABLE[64] = + uint[](0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 65, 69, 73, 77, 81, 85, 89, + 93, 97, 101, 105, 109, 113, 117, 121, 125, 130, 134, 138, 142, 146, 150, 154, 158, 162, + 166, 170, 174, 178, 182, 186, 190, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, + 239, 243, 247, 251, 
255); +const uint REPLICATE_7_BIT_TO_8_TABLE[128] = + uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, + 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, + 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, + 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163, + 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, + 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, + 237, 239, 241, 243, 245, 247, 249, 251, 253, 255); // Input ASTC texture globals uint current_index = 0; @@ -207,8 +199,7 @@ uint Replicate(uint val, uint num_bits, uint to_bit) { } uvec4 ReplicateByteTo16(uvec4 value) { - return uvec4(REPLICATE_BYTE_TO_16_TABLE[value.x], REPLICATE_BYTE_TO_16_TABLE[value.y], - REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]); + return value * 0x101; } uint ReplicateBitTo7(uint value) { @@ -236,7 +227,7 @@ uint FastReplicateTo8(uint value, uint num_bits) { case 7: return REPLICATE_7_BIT_TO_8_TABLE[value]; case 8: - return REPLICATE_8_BIT_TO_8_TABLE[value]; + return value; } return Replicate(value, num_bits, 8); } @@ -763,7 +754,7 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) { case 1: { READ_UINT_VALUES(2) uint L0 = (v[0] >> 2) | (v[1] & 0xC0); - uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU); + uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); ep1 = uvec4(0xFF, L0, L0, L0); ep2 = uvec4(0xFF, L1, L1, L1); break; @@ -1327,6 +1318,9 @@ void main() { offset += swizzle; const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1)); + if (any(greaterThanEqual(coord, imageSize(dest_image)))) { + return; + } uint block_index = pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index f968b5b16..07939432f 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -4,10 +4,10 @@ #pragma once -#include <atomic> #include <functional> #include <optional> #include <span> +#include <stop_token> #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" @@ -123,7 +123,7 @@ public: virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {} /// Initialize disk cached resources for the game being emulated - virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + virtual void LoadDiskResources(u64 title_id, std::stop_token stop_loading, const DiskResourceLoadCallback& callback) {} /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. 
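The astc_decoder.comp changes above drop the uploaded REPLICATE_*_TO_8 lookup buffers: the 6- and 7-bit tables become shader constants, the 8-bit case degenerates to the identity, and ReplicateByteTo16 becomes a multiply by 0x101. For reference, a minimal standalone C++ sketch of the bit-replication identity those tables encode; the function name and layout are illustrative, not from the codebase:

#include <cassert>
#include <cstdint>

// Replicate the low `bits` bits of `v` across 8 bits by repeating the bit
// pattern and truncating -- the identity behind the REPLICATE_*_TO_8 tables
// removed from astc_decoder.comp (and its Replicate() fallback).
constexpr std::uint32_t ReplicateTo8(std::uint32_t v, unsigned bits) {
    std::uint32_t result = 0;
    unsigned filled = 0;
    while (filled < 8) {            // repeat the pattern until >= 8 bits exist
        result = (result << bits) | v;
        filled += bits;
    }
    return result >> (filled - 8);  // drop the excess low bits
}

int main() {
    assert(ReplicateTo8(0x3F, 6) == 255); // REPLICATE_6_BIT_TO_8_TABLE[63]
    assert(ReplicateTo8(0x20, 6) == 130); // REPLICATE_6_BIT_TO_8_TABLE[32]
    assert(ReplicateTo8(0xFF, 8) == 255); // 8-bit case is the identity
    // Replicating a byte to 16 bits is a multiply, as the shader now does:
    assert(0xAB * 0x101 == 0xABAB);
}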
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 320ee8d30..63d8ad42a 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -42,6 +42,8 @@ public: [[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0; + [[nodiscard]] virtual std::string GetDeviceVendor() const = 0; + // Getter/setter functions: // ------------------------ diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index 3e4d88c30..e8d8d2aa5 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -454,7 +454,7 @@ private: template <typename... Args> void AddExpression(std::string_view text, Args&&... args) { - shader_source += fmt::format(text, std::forward<Args>(args)...); + shader_source += fmt::format(fmt::runtime(text), std::forward<Args>(args)...); } template <typename... Args> diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 3f4532ca7..3b00614e7 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -202,13 +202,13 @@ Device::Device() { LOG_ERROR(Render_OpenGL, "OpenGL 4.6 is not available"); throw std::runtime_error{"Insufficient version"}; } - const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); + vendor_name = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); - const bool is_nvidia = vendor == "NVIDIA Corporation"; - const bool is_amd = vendor == "ATI Technologies Inc."; - const bool is_intel = vendor == "Intel"; + const bool is_nvidia = vendor_name == "NVIDIA Corporation"; + const bool is_amd = vendor_name == "ATI Technologies Inc."; + const bool is_intel = vendor_name == "Intel"; #ifdef __unix__ const bool is_linux = true; @@ -275,6 +275,56 @@ Device::Device() { } } +std::string Device::GetVendorName() const { + if (vendor_name == "NVIDIA Corporation") { + return "NVIDIA"; + } + if (vendor_name == "ATI Technologies Inc.") { + return "AMD"; + } + if (vendor_name == "Intel") { + // For Mesa, `Intel` is an overloaded vendor string that could mean crocus or iris. + // Simply return `INTEL` for those as well as the Windows driver. + return "INTEL"; + } + if (vendor_name == "Intel Open Source Technology Center") { + return "I965"; + } + if (vendor_name == "Mesa Project") { + return "I915"; + } + if (vendor_name == "Mesa/X.org") { + // This vendor string is overloaded between llvmpipe, softpipe, and virgl, so just return + // MESA instead of one of those driver names. + return "MESA"; + } + if (vendor_name == "AMD") { + return "RADEONSI"; + } + if (vendor_name == "nouveau") { + return "NOUVEAU"; + } + if (vendor_name == "X.Org") { + return "R600"; + } + if (vendor_name == "Collabora Ltd") { + return "ZINK"; + } + if (vendor_name == "Intel Corporation") { + return "OPENSWR"; + } + if (vendor_name == "Microsoft Corporation") { + return "D3D12"; + } + if (vendor_name == "NVIDIA") { + // Mesa's tegra driver reports `NVIDIA`. Only present in this list because the default + // strategy would have returned `NVIDIA` here for this driver, the same result as the + // proprietary driver. 
+ return "TEGRA"; + } + return vendor_name; +} + Device::Device(std::nullptr_t) { max_uniform_buffers.fill(std::numeric_limits<u32>::max()); uniform_buffer_alignment = 4; diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index f24bd0c7b..2c2b13767 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -22,6 +22,8 @@ public: explicit Device(); explicit Device(std::nullptr_t); + [[nodiscard]] std::string GetVendorName() const; + u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { return max_uniform_buffers[static_cast<std::size_t>(shader_type)]; } @@ -130,6 +132,7 @@ private: static bool TestVariableAoffi(); static bool TestPreciseBug(); + std::string vendor_name; std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; size_t uniform_buffer_alignment{}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f87bb269b..eb8bdaa85 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -351,7 +351,7 @@ void RasterizerOpenGL::SetupShaders(bool is_indexed) { } } -void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, +void RasterizerOpenGL::LoadDiskResources(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { shader_cache.LoadDiskCache(title_id, stop_loading, callback); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 76298517f..9995a563b 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -94,7 +94,7 @@ public: const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; - void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, + void LoadDiskResources(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; /// Returns true when there are commands queued to the OpenGL server. 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 5cf7cd151..5a01c59ec 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -331,7 +331,7 @@ ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer_, ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; -void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, +void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { disk_cache.BindTitleID(title_id); const std::optional transferable = disk_cache.LoadTransferable(); @@ -372,7 +372,7 @@ void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop const auto scope = context->Acquire(); for (std::size_t i = begin; i < end; ++i) { - if (stop_loading) { + if (stop_loading.stop_requested()) { return; } const auto& entry = (*transferable)[i]; @@ -435,7 +435,7 @@ void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop precompiled_cache_altered = true; return; } - if (stop_loading) { + if (stop_loading.stop_requested()) { return; } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 2aed0697e..b30308b6f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -127,7 +127,7 @@ public: ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game - void LoadDiskCache(u64 title_id, const std::atomic_bool& stop_loading, + void LoadDiskCache(u64 title_id, std::stop_token stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index ac78d344c..9c28498e8 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -96,7 +96,7 @@ public: // etc). template <typename... Args> void AddLine(std::string_view text, Args&&... 
args) { - AddExpression(fmt::format(text, std::forward<Args>(args)...)); + AddExpression(fmt::format(fmt::runtime(text), std::forward<Args>(args)...)); AddNewLine(); } diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 6dbb6bfba..2e67922a6 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -12,12 +12,15 @@ #include <glad/glad.h> #include "common/common_types.h" +#include "common/literals.h" #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { +using namespace Common::Literals; + class StreamBuffer { - static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024; + static constexpr size_t STREAM_BUFFER_SIZE = 64_MiB; static constexpr size_t NUM_SYNCS = 16; static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; static constexpr size_t MAX_ALIGNMENT = 256; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index ffe9edc1b..23948feed 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -9,6 +9,8 @@ #include <glad/glad.h> +#include "common/settings.h" + #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" @@ -307,7 +309,9 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info) { - return !runtime.HasNativeASTC() && IsPixelFormatASTC(info.format); + if (IsPixelFormatASTC(info.format)) { + return !runtime.HasNativeASTC() && Settings::values.accelerate_astc.GetValue(); + } // Disable other accelerated uploads for now as they don't implement swizzled uploads return false; switch (info.type) { @@ -733,6 +737,8 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, } } +Image::~Image() = default; + void Image::UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies) { glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index df8be12ff..25fe61566 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -143,6 +143,14 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + ~Image(); + + Image(const Image&) = delete; + Image& operator=(const Image&) = delete; + + Image(Image&&) = default; + Image& operator=(Image&&) = default; + void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); @@ -235,6 +243,7 @@ struct TextureCacheParams { static constexpr bool ENABLE_VALIDATION = true; static constexpr bool FRAMEBUFFER_BLITS = true; static constexpr bool HAS_EMULATED_COPIES = true; + static constexpr bool HAS_DEVICE_MEMORY_INFO = false; using Runtime = OpenGL::TextureCacheRuntime; using Image = OpenGL::Image; diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index cc19a110f..0b66f8332 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -70,6 +70,10 @@ public: return 
&rasterizer; } + [[nodiscard]] std::string GetDeviceVendor() const override { + return device.GetVendorName(); + } + private: /// Initializes the OpenGL state and creates persistent objects. void InitOpenGLObjects(); diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 47fddcb6e..abaf1ee6a 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -69,7 +69,8 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) swizzle_table_buffer.Create(); astc_buffer.Create(); glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); - glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_BUFFER_DATA), &ASTC_BUFFER_DATA, 0); + glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_ENCODINGS_VALUES), &ASTC_ENCODINGS_VALUES, + 0); } UtilShaders::~UtilShaders() = default; @@ -79,12 +80,6 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; static constexpr GLuint BINDING_INPUT_BUFFER = 1; static constexpr GLuint BINDING_ENC_BUFFER = 2; - - static constexpr GLuint BINDING_6_TO_8_BUFFER = 3; - static constexpr GLuint BINDING_7_TO_8_BUFFER = 4; - static constexpr GLuint BINDING_8_TO_8_BUFFER = 5; - static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; - static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; const Extent2D tile_size{ @@ -93,21 +88,7 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, }; program_manager.BindHostCompute(astc_decoder_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, encoding_values), - sizeof(AstcBufferData::encoding_values)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_6_to_8), - sizeof(AstcBufferData::replicate_6_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_7_to_8), - sizeof(AstcBufferData::replicate_7_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_8_to_8), - sizeof(AstcBufferData::replicate_8_to_8)); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, astc_buffer.handle, - offsetof(AstcBufferData, replicate_byte_to_16), - sizeof(AstcBufferData::replicate_byte_to_16)); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(1, tile_size.width, tile_size.height); @@ -137,6 +118,12 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); } + // Precautionary barrier to ensure the compute shader is done decoding prior to texture access. 
+ // GL_TEXTURE_FETCH_BARRIER_BIT and GL_SHADER_IMAGE_ACCESS_BARRIER_BIT are used in a separate + // glMemoryBarrier call by the texture cache runtime + glMemoryBarrier(GL_UNIFORM_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | GL_PIXEL_BUFFER_BARRIER_BIT | + GL_TEXTURE_UPDATE_BARRIER_BIT | GL_BUFFER_UPDATE_BARRIER_BIT | + GL_SHADER_STORAGE_BARRIER_BIT | GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT); program_manager.RestoreGuestCompute(); } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 72071316c..d7d17e110 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -47,6 +47,10 @@ public: return &rasterizer; } + [[nodiscard]] std::string GetDeviceVendor() const override { + return device.GetDriverName(); + } + private: void Report() const; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 8cb65e588..0df4e1a1c 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -55,8 +55,9 @@ size_t BytesPerIndex(VkIndexType index_type) { template <typename T> std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) { std::array<T, 6> indices{0, 1, 2, 0, 2, 3}; - std::ranges::transform(indices, indices.begin(), - [quad, first](u32 index) { return first + index + quad * 4; }); + for (T& index : indices) { + index = static_cast<T>(first + index + quad * 4); + } return indices; } } // Anonymous namespace diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index e11406e58..205cd3b05 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -30,19 +30,16 @@ namespace Vulkan { using Tegra::Texture::SWIZZLE_TABLE; -using Tegra::Texture::ASTC::EncodingsValues; +using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES; using namespace Tegra::Texture::ASTC; namespace { constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; constexpr u32 ASTC_BINDING_ENC_BUFFER = 1; -constexpr u32 ASTC_BINDING_6_TO_8_BUFFER = 2; -constexpr u32 ASTC_BINDING_7_TO_8_BUFFER = 3; -constexpr u32 ASTC_BINDING_8_TO_8_BUFFER = 4; -constexpr u32 ASTC_BINDING_BYTE_TO_16_BUFFER = 5; -constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 6; -constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 7; +constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2; +constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3; +constexpr size_t ASTC_NUM_BINDINGS = 4; VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { return { @@ -71,7 +68,7 @@ std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBinding }}; } -std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() { +std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> BuildASTCDescriptorSetBindings() { return {{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -88,34 +85,6 @@ std::array<VkDescriptorSetLayoutBinding, 8> BuildASTCDescriptorSetBindings() { .pImmutableSamplers = nullptr, }, { - .binding = ASTC_BINDING_6_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = ASTC_BINDING_7_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = 
ASTC_BINDING_8_TO_8_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = ASTC_BINDING_BYTE_TO_16_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { .binding = ASTC_BINDING_SWIZZLE_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, @@ -143,7 +112,8 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() { }; } -std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateTemplateEntry() { +std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS> +BuildASTCPassDescriptorUpdateTemplateEntry() { return {{ { .dstBinding = ASTC_BINDING_INPUT_BUFFER, @@ -162,38 +132,6 @@ std::array<VkDescriptorUpdateTemplateEntryKHR, 8> BuildASTCPassDescriptorUpdateT .stride = sizeof(DescriptorUpdateEntry), }, { - .dstBinding = ASTC_BINDING_6_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_6_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_7_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_7_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_8_TO_8_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_8_TO_8_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { - .dstBinding = ASTC_BINDING_BYTE_TO_16_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_BYTE_TO_16_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, - { .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, .dstArrayElement = 0, .descriptorCount = 1, @@ -222,15 +160,6 @@ struct AstcPushConstants { u32 block_height_mask; }; -struct AstcBufferData { - decltype(SWIZZLE_TABLE) swizzle_table_buffer = SWIZZLE_TABLE; - decltype(EncodingsValues) encoding_values = EncodingsValues; - decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE; - decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE; - decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE; - decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE; -} constexpr ASTC_BUFFER_DATA; - } // Anonymous namespace VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, @@ -423,7 +352,7 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, ASTCDecoderPass::~ASTCDecoderPass() = default; void ASTCDecoderPass::MakeDataBuffer() { - constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_BUFFER_DATA) + sizeof(SWIZZLE_TABLE); + constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE); data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, @@ -437,9 +366,10 @@ void ASTCDecoderPass::MakeDataBuffer() { data_buffer_commit = 
memory_allocator.Commit(data_buffer, MemoryUsage::Upload); const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); - std::memcpy(staging_ref.mapped_span.data(), &ASTC_BUFFER_DATA, sizeof(ASTC_BUFFER_DATA)); + std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES, + sizeof(ASTC_ENCODINGS_VALUES)); // Tack on the swizzle table at the end of the buffer - std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_BUFFER_DATA), &SWIZZLE_TABLE, + std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE)); scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, @@ -509,18 +439,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, encoding_values), - sizeof(AstcBufferData::encoding_values)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_6_to_8), - sizeof(AstcBufferData::replicate_6_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_7_to_8), - sizeof(AstcBufferData::replicate_7_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, offsetof(AstcBufferData, replicate_8_to_8), - sizeof(AstcBufferData::replicate_8_to_8)); - update_descriptor_queue.AddBuffer(*data_buffer, - offsetof(AstcBufferData, replicate_byte_to_16), - sizeof(AstcBufferData::replicate_byte_to_16)); - update_descriptor_queue.AddBuffer(*data_buffer, sizeof(AstcBufferData), + update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES)); + update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES), sizeof(SWIZZLE_TABLE)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); @@ -569,6 +489,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier); }); + scheduler.Finish(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index db78ce3d9..6852c11b0 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -2,8 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <atomic> -#include <chrono> +#include <thread> #include "common/settings.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" @@ -12,8 +11,6 @@ namespace Vulkan { -using namespace std::chrono_literals; - MasterSemaphore::MasterSemaphore(const Device& device) { static constexpr VkSemaphoreTypeCreateInfoKHR semaphore_type_ci{ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR, @@ -34,9 +31,9 @@ MasterSemaphore::MasterSemaphore(const Device& device) { // Validation layers have a bug where they fail to track resource usage when using timeline // semaphores and synchronizing with GetSemaphoreCounterValueKHR. To workaround this issue, have // a separate thread waiting for each timeline semaphore value. 
- debug_thread = std::thread([this] { + debug_thread = std::jthread([this](std::stop_token stop_token) { u64 counter = 0; - while (!shutdown) { + while (!stop_token.stop_requested()) { if (semaphore.Wait(counter, 10'000'000)) { ++counter; } @@ -44,13 +41,6 @@ MasterSemaphore::MasterSemaphore(const Device& device) { }); } -MasterSemaphore::~MasterSemaphore() { - shutdown = true; - - // This thread might not be started - if (debug_thread.joinable()) { - debug_thread.join(); - } -} +MasterSemaphore::~MasterSemaphore() = default; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 4b6d64daa..ee3cd35d0 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -65,11 +65,10 @@ public: } private: - vk::Semaphore semaphore; ///< Timeline semaphore. - std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. - std::atomic<u64> current_tick{1}; ///< Current logical tick. - std::atomic<bool> shutdown{false}; ///< True when the object is being destroyed. - std::thread debug_thread; ///< Debug thread to workaround validation layer bugs. + vk::Semaphore semaphore; ///< Timeline semaphore. + std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick. + std::atomic<u64> current_tick{1}; ///< Current logical tick. + std::jthread debug_thread; ///< Debug thread to workaround validation layer bugs. }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 7a1232497..0412b5234 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -12,6 +12,7 @@ #include "common/assert.h" #include "common/bit_util.h" #include "common/common_types.h" +#include "common/literals.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/vulkan_common/vulkan_device.h" @@ -19,12 +20,15 @@ namespace Vulkan { namespace { + +using namespace Common::Literals; + // Maximum potential alignment of a Vulkan buffer constexpr VkDeviceSize MAX_ALIGNMENT = 256; // Maximum size to put elements in the stream buffer -constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024; +constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB; // Stream buffer size in bytes -constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024; +constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB; constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS; constexpr VkMemoryPropertyFlags HOST_FLAGS = diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index a09fe084e..7b4875d0e 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -10,6 +10,7 @@ #include "common/alignment.h" #include "common/assert.h" +#include "common/literals.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/vulkan_common/vulkan_device.h" @@ -19,6 +20,8 @@ namespace Vulkan { namespace { +using namespace Common::Literals; + constexpr VkBufferUsageFlags BUFFER_USAGE = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | 
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; @@ -26,7 +29,7 @@ constexpr VkBufferUsageFlags BUFFER_USAGE = constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; -constexpr u64 PREFERRED_STREAM_BUFFER_SIZE = 256 * 1024 * 1024; +constexpr u64 PREFERRED_STREAM_BUFFER_SIZE = 256_MiB; /// Find a memory type with the passed requirements std::optional<u32> FindMemoryType(const VkPhysicalDeviceMemoryProperties& properties, diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index bdd0ce8bc..a2ab4d1ee 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -8,6 +8,7 @@ #include <vector> #include "common/bit_cast.h" +#include "common/settings.h" #include "video_core/engines/fermi_2d.h" #include "video_core/renderer_vulkan/blit_image.h" @@ -817,6 +818,10 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src, }); } +u64 TextureCacheRuntime::GetDeviceLocalMemory() const { + return device.GetDeviceLocalMemory(); +} + Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, @@ -828,7 +833,11 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); } if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { - flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; + if (Settings::values.accelerate_astc.GetValue()) { + flags |= VideoCommon::ImageFlagBits::AcceleratedUpload; + } else { + flags |= VideoCommon::ImageFlagBits::Converted; + } } if (runtime.device.HasDebuggingToolAttached()) { if (image) { @@ -871,6 +880,8 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ } } +Image::~Image() = default; + void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { // TODO: Move this to another API scheduler->RequestOutsideRenderPassOperationContext(); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 4a57d378b..172bcdf98 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -97,6 +97,8 @@ struct TextureCacheRuntime { // All known Vulkan drivers can natively handle BGR textures return true; } + + u64 GetDeviceLocalMemory() const; }; class Image : public VideoCommon::ImageBase { @@ -104,6 +106,14 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + ~Image(); + + Image(const Image&) = delete; + Image& operator=(const Image&) = delete; + + Image(Image&&) = default; + Image& operator=(Image&&) = default; + void UploadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferImageCopy> copies); @@ -257,6 +267,7 @@ struct TextureCacheParams { static constexpr bool ENABLE_VALIDATION = true; static constexpr bool FRAMEBUFFER_BLITS = false; static constexpr bool HAS_EMULATED_COPIES = false; + static constexpr bool HAS_DEVICE_MEMORY_INFO = true; using Runtime = Vulkan::TextureCacheRuntime; using Image = Vulkan::Image; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 6308aef94..eb1746265 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -283,4 +283,11 @@ 
std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; } +u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format) { + constexpr u64 RGBA8_PIXEL_SIZE = 4; + const u64 base_block_size = static_cast<u64>(DefaultBlockWidth(format)) * + static_cast<u64>(DefaultBlockHeight(format)) * RGBA8_PIXEL_SIZE; + return (base_size * base_block_size) / BytesPerBlock(format); +} + } // namespace VideoCore::Surface diff --git a/src/video_core/surface.h b/src/video_core/surface.h index c40ab89d0..1503db81f 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -462,4 +462,6 @@ bool IsPixelFormatSRGB(PixelFormat format); std::pair<u32, u32> GetASTCBlockSize(PixelFormat format); +u64 EstimatedDecompressedSize(u64 base_size, PixelFormat format); + } // namespace VideoCore::Surface diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index 9914926b3..ad69d32d1 100644 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -113,6 +113,43 @@ void ImageBase::InsertView(const ImageViewInfo& view_info, ImageViewId image_vie image_view_ids.push_back(image_view_id); } +bool ImageBase::IsSafeDownload() const noexcept { + // Skip images that were not modified from the GPU + if (False(flags & ImageFlagBits::GpuModified)) { + return false; + } + // Skip images that .are. modified from the CPU + // We don't want to write sensitive data from the guest + if (True(flags & ImageFlagBits::CpuModified)) { + return false; + } + if (info.num_samples > 1) { + LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented"); + return false; + } + return true; +} + +void ImageBase::CheckBadOverlapState() { + if (False(flags & ImageFlagBits::BadOverlap)) { + return; + } + if (!overlapping_images.empty()) { + return; + } + flags &= ~ImageFlagBits::BadOverlap; +} + +void ImageBase::CheckAliasState() { + if (False(flags & ImageFlagBits::Alias)) { + return; + } + if (!aliased_images.empty()) { + return; + } + flags &= ~ImageFlagBits::Alias; +} + void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id) { static constexpr auto OPTIONS = RelaxedOptions::Size | RelaxedOptions::Format; ASSERT(lhs.info.type == rhs.info.type); diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index b7f3b7e43..e326cab71 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -25,6 +25,12 @@ enum class ImageFlagBits : u32 { Strong = 1 << 5, ///< Exists in the image table, the dimensions are can be trusted Registered = 1 << 6, ///< True when the image is registered Picked = 1 << 7, ///< Temporary flag to mark the image as picked + + // Garbage Collection Flags + BadOverlap = 1 << 8, ///< This image overlaps other but doesn't fit, has higher + ///< garbage collection priority + Alias = 1 << 9, ///< This image has aliases and has priority on garbage + ///< collection }; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) @@ -44,11 +50,16 @@ struct ImageBase { void InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id); + [[nodiscard]] bool IsSafeDownload() const noexcept; + [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept { const VAddr overlap_end = overlap_cpu_addr + overlap_size; return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; } + void CheckBadOverlapState(); + void CheckAliasState(); + ImageInfo 
    u32 guest_size_bytes = 0;
@@ -72,6 +83,7 @@ struct ImageBase {
    std::vector<SubresourceBase> slice_subresources;
    std::vector<AliasedImage> aliased_images;
+    std::vector<ImageId> overlapping_images;
};

struct ImageAllocBase {
diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h
index eae3be6ea..6180b8c0e 100644
--- a/src/video_core/texture_cache/slot_vector.h
+++ b/src/video_core/texture_cache/slot_vector.h
@@ -5,6 +5,7 @@
#pragma once

#include <array>
+#include <bit>
#include <concepts>
#include <numeric>
#include <type_traits>
@@ -32,6 +33,60 @@ template <class T>
requires std::is_nothrow_move_assignable_v<T>&& std::is_nothrow_move_constructible_v<T>
class SlotVector {
public:
+    class Iterator {
+        friend SlotVector<T>;
+
+    public:
+        constexpr Iterator() = default;
+
+        Iterator& operator++() noexcept {
+            const u64* const bitset = slot_vector->stored_bitset.data();
+            const u32 size = static_cast<u32>(slot_vector->stored_bitset.size()) * 64;
+            if (id.index < size) {
+                do {
+                    ++id.index;
+                } while (id.index < size && !IsValid(bitset));
+                if (id.index == size) {
+                    id.index = SlotId::INVALID_INDEX;
+                }
+            }
+            return *this;
+        }
+
+        Iterator operator++(int) noexcept {
+            const Iterator copy{*this};
+            ++*this;
+            return copy;
+        }
+
+        bool operator==(const Iterator& other) const noexcept {
+            return id.index == other.id.index;
+        }
+
+        bool operator!=(const Iterator& other) const noexcept {
+            return id.index != other.id.index;
+        }
+
+        std::pair<SlotId, T*> operator*() const noexcept {
+            return {id, std::addressof((*slot_vector)[id])};
+        }
+
+        T* operator->() const noexcept {
+            return std::addressof((*slot_vector)[id]);
+        }
+
+    private:
+        Iterator(SlotVector<T>* slot_vector_, SlotId id_) noexcept
+            : slot_vector{slot_vector_}, id{id_} {}
+
+        bool IsValid(const u64* bitset) const noexcept {
+            return ((bitset[id.index / 64] >> (id.index % 64)) & 1) != 0;
+        }
+
+        SlotVector<T>* slot_vector;
+        SlotId id;
+    };
+
    ~SlotVector() noexcept {
        size_t index = 0;
        for (u64 bits : stored_bitset) {
@@ -70,6 +125,20 @@ public:
        ResetStorageBit(id.index);
    }

+    [[nodiscard]] Iterator begin() noexcept {
+        const auto it = std::ranges::find_if(stored_bitset, [](u64 value) { return value != 0; });
+        if (it == stored_bitset.end()) {
+            return end();
+        }
+        const u32 word_index = static_cast<u32>(std::distance(stored_bitset.begin(), it));
+        const SlotId first_id{word_index * 64 + static_cast<u32>(std::countr_zero(*it))};
+        return Iterator(this, first_id);
+    }
+
+    [[nodiscard]] Iterator end() noexcept {
+        return Iterator(this, SlotId{SlotId::INVALID_INDEX});
+    }
+
private:
    struct NonTrivialDummy {
        NonTrivialDummy() noexcept {}
@@ -140,7 +209,6 @@ private:
    Entry* values = nullptr;
    size_t values_capacity = 0;
-    size_t values_size = 0;

    std::vector<u64> stored_bitset;
    std::vector<u32> free_list;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 59b7c678b..c7cfd02b6 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -19,9 +19,10 @@
#include <boost/container/small_vector.hpp>

#include "common/alignment.h"
-#include "common/common_funcs.h"
#include "common/common_types.h"
+#include "common/literals.h"
#include "common/logging/log.h"
+#include "common/settings.h"
#include "video_core/compatible_formats.h"
#include "video_core/delayed_destruction_ring.h"
#include "video_core/dirty_flags.h"
@@ -57,6 +58,7 @@ using VideoCore::Surface::PixelFormat;
using VideoCore::Surface::PixelFormatFromDepthFormat;
using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
using VideoCore::Surface::SurfaceType;
+using namespace Common::Literals;

template <class P>
class TextureCache {
@@ -69,12 +71,17 @@ class TextureCache {
    static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS;
    /// True when some copies have to be emulated
    static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES;
+    /// True when the API can provide info about the memory of the device.
+    static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO;

    /// Image view ID for null descriptors
    static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0};
    /// Sampler ID for bugged sampler ids
    static constexpr SamplerId NULL_SAMPLER_ID{0};

+    static constexpr u64 DEFAULT_EXPECTED_MEMORY = 1_GiB;
+    static constexpr u64 DEFAULT_CRITICAL_MEMORY = 2_GiB;
+
    using Runtime = typename P::Runtime;
    using Image = typename P::Image;
    using ImageAlloc = typename P::ImageAlloc;
@@ -197,6 +204,9 @@ private:
        }
    }

+    /// Runs the Garbage Collector.
+    void RunGarbageCollector();
+
    /// Fills image_view_ids in the image views in indices
    void FillImageViews(DescriptorTable<TICEntry>& table,
                        std::span<ImageViewId> cached_image_view_ids, std::span<const u32> indices,
@@ -333,6 +343,10 @@ private:
    std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> page_table;

    bool has_deleted_images = false;
+    u64 total_used_memory = 0;
+    u64 minimum_memory;
+    u64 expected_memory;
+    u64 critical_memory;

    SlotVector<Image> slot_images;
    SlotVector<ImageView> slot_image_views;
@@ -353,6 +367,7 @@ private:

    u64 modification_tick = 0;
    u64 frame_tick = 0;
+    typename SlotVector<Image>::Iterator deletion_iterator;
};

template <class P>
@@ -373,11 +388,94 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
    // This way the null resource becomes a compile time constant
    void(slot_image_views.insert(runtime, NullImageParams{}));
    void(slot_samplers.insert(runtime, sampler_descriptor));
+
+    deletion_iterator = slot_images.begin();
+
+    if constexpr (HAS_DEVICE_MEMORY_INFO) {
+        const auto device_memory = runtime.GetDeviceLocalMemory();
+        const u64 possible_expected_memory = (device_memory * 3) / 10;
+        const u64 possible_critical_memory = (device_memory * 6) / 10;
+        expected_memory = std::max(possible_expected_memory, DEFAULT_EXPECTED_MEMORY);
+        critical_memory = std::max(possible_critical_memory, DEFAULT_CRITICAL_MEMORY);
+        minimum_memory = 0;
+    } else {
+        // On OpenGL we can be more conservative, as the driver takes care of memory management.
+        expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
+        critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
+        minimum_memory = expected_memory;
+    }
+}
+
+template <class P>
+void TextureCache<P>::RunGarbageCollector() {
+    const bool high_priority_mode = total_used_memory >= expected_memory;
+    const bool aggressive_mode = total_used_memory >= critical_memory;
+    const u64 ticks_to_destroy = high_priority_mode ? 60 : 100;
+    int num_iterations = aggressive_mode ? 256 : (high_priority_mode ? 128 : 64);
+    for (; num_iterations > 0; --num_iterations) {
+        if (deletion_iterator == slot_images.end()) {
+            deletion_iterator = slot_images.begin();
+            if (deletion_iterator == slot_images.end()) {
+                break;
+            }
+        }
+        auto [image_id, image_tmp] = *deletion_iterator;
+        Image* image = image_tmp; // Workaround: structured bindings cannot be captured by the
+                                  // lambdas below (Clang rejects the capture)
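Taken together with the constructor above, the collector is tiered: as total_used_memory crosses the expected and critical watermarks, each pass scans more slots and accepts younger images for eviction. A minimal standalone sketch of that tuning (names here are illustrative only, not part of the yuzu codebase):

    #include <cstdint>
    #include <utility>

    // Returns {slots scanned per pass, minimum image age in frame ticks}.
    std::pair<int, std::uint64_t> TuneGcPass(std::uint64_t used, std::uint64_t expected,
                                             std::uint64_t critical) {
        const bool high_priority = used >= expected; // high priority tier
        const bool aggressive = used >= critical;    // aggressive tier
        const std::uint64_t min_age = high_priority ? 60 : 100;
        const int num_scans = aggressive ? 256 : (high_priority ? 128 : 64);
        return {num_scans, min_age};
    }

The loop body that follows refines the minimum age further, dividing it by 16 for badly overlapping images and by 2 for other priority images when in aggressive mode.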
+ const bool is_alias = True(image->flags & ImageFlagBits::Alias); + const bool is_bad_overlap = True(image->flags & ImageFlagBits::BadOverlap); + const bool must_download = image->IsSafeDownload(); + bool should_care = is_bad_overlap || is_alias || (high_priority_mode && !must_download); + const u64 ticks_needed = + is_bad_overlap + ? ticks_to_destroy >> 4 + : ((should_care && aggressive_mode) ? ticks_to_destroy >> 1 : ticks_to_destroy); + should_care |= aggressive_mode; + if (should_care && image->frame_tick + ticks_needed < frame_tick) { + if (is_bad_overlap) { + const bool overlap_check = std::ranges::all_of( + image->overlapping_images, [&, image](const ImageId& overlap_id) { + auto& overlap = slot_images[overlap_id]; + return overlap.frame_tick >= image->frame_tick; + }); + if (!overlap_check) { + ++deletion_iterator; + continue; + } + } + if (!is_bad_overlap && must_download) { + const bool alias_check = std::ranges::none_of( + image->aliased_images, [&, image](const AliasedImage& alias) { + auto& alias_image = slot_images[alias.id]; + return (alias_image.frame_tick < image->frame_tick) || + (alias_image.modification_tick < image->modification_tick); + }); + + if (alias_check) { + auto map = runtime.DownloadStagingBuffer(image->unswizzled_size_bytes); + const auto copies = FullDownloadCopies(image->info); + image->DownloadMemory(map, copies); + runtime.Finish(); + SwizzleImage(gpu_memory, image->gpu_addr, image->info, copies, map.mapped_span); + } + } + if (True(image->flags & ImageFlagBits::Tracked)) { + UntrackImage(*image); + } + UnregisterImage(image_id); + DeleteImage(image_id); + if (is_bad_overlap) { + ++num_iterations; + } + } + ++deletion_iterator; + } } template <class P> void TextureCache<P>::TickFrame() { - // Tick sentenced resources in this order to ensure they are destroyed in the right order + if (Settings::values.use_caches_gc.GetValue() && total_used_memory > minimum_memory) { + RunGarbageCollector(); + } sentenced_images.Tick(); sentenced_framebuffers.Tick(); sentenced_image_view.Tick(); @@ -568,17 +666,7 @@ template <class P> void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { std::vector<ImageId> images; ForEachImageInRegion(cpu_addr, size, [this, &images](ImageId image_id, ImageBase& image) { - // Skip images that were not modified from the GPU - if (False(image.flags & ImageFlagBits::GpuModified)) { - return; - } - // Skip images that .are. 
modified from the CPU
-        // We don't want to write sensitive data from the guest
-        if (True(image.flags & ImageFlagBits::CpuModified)) {
-            return;
-        }
-        if (image.info.num_samples > 1) {
-            LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented");
+        if (!image.IsSafeDownload()) {
            return;
        }
        image.flags &= ~ImageFlagBits::GpuModified;
@@ -967,6 +1055,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
    std::vector<ImageId> overlap_ids;
    std::vector<ImageId> left_aliased_ids;
    std::vector<ImageId> right_aliased_ids;
+    std::vector<ImageId> bad_overlap_ids;
    ForEachImageInRegion(cpu_addr, size_bytes, [&](ImageId overlap_id, ImageBase& overlap) {
        if (info.type != overlap.info.type) {
            return;
@@ -992,9 +1081,14 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
        const ImageBase new_image_base(new_info, gpu_addr, cpu_addr);
        if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views, native_bgr)) {
            left_aliased_ids.push_back(overlap_id);
+            overlap.flags |= ImageFlagBits::Alias;
        } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options,
                                 broken_views, native_bgr)) {
            right_aliased_ids.push_back(overlap_id);
+            overlap.flags |= ImageFlagBits::Alias;
+        } else {
+            bad_overlap_ids.push_back(overlap_id);
+            overlap.flags |= ImageFlagBits::BadOverlap;
        }
    });
    const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
@@ -1022,10 +1116,18 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
    for (const ImageId aliased_id : right_aliased_ids) {
        ImageBase& aliased = slot_images[aliased_id];
        AddImageAlias(new_image_base, aliased, new_image_id, aliased_id);
+        new_image.flags |= ImageFlagBits::Alias;
    }
    for (const ImageId aliased_id : left_aliased_ids) {
        ImageBase& aliased = slot_images[aliased_id];
        AddImageAlias(aliased, new_image_base, aliased_id, new_image_id);
+        new_image.flags |= ImageFlagBits::Alias;
+    }
+    for (const ImageId aliased_id : bad_overlap_ids) {
+        ImageBase& aliased = slot_images[aliased_id];
+        aliased.overlapping_images.push_back(new_image_id);
+        new_image.overlapping_images.push_back(aliased_id);
+        new_image.flags |= ImageFlagBits::BadOverlap;
    }
    RegisterImage(new_image_id);
    return new_image_id;
@@ -1195,6 +1297,13 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
    image.flags |= ImageFlagBits::Registered;
    ForEachPage(image.cpu_addr, image.guest_size_bytes,
                [this, image_id](u64 page) { page_table[page].push_back(image_id); });
+    u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes);
+    if ((IsPixelFormatASTC(image.info.format) &&
+         True(image.flags & ImageFlagBits::AcceleratedUpload)) ||
+        True(image.flags & ImageFlagBits::Converted)) {
+        tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
+    }
+    total_used_memory += Common::AlignUp(tentative_size, 1024);
}

template <class P>
@@ -1203,6 +1312,14 @@ void TextureCache<P>::UnregisterImage(ImageId image_id) {
    ASSERT_MSG(True(image.flags & ImageFlagBits::Registered),
               "Trying to unregister an image that is not registered");
    image.flags &= ~ImageFlagBits::Registered;
+    image.flags &= ~ImageFlagBits::BadOverlap;
+    u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes);
+    if ((IsPixelFormatASTC(image.info.format) &&
+         True(image.flags & ImageFlagBits::AcceleratedUpload)) ||
+        True(image.flags & ImageFlagBits::Converted)) {
+        tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
+    }
+    total_used_memory -= Common::AlignUp(tentative_size, 1024);
    ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) {
        const auto page_it = page_table.find(page);
        if (page_it == page_table.end()) {
@@ -1276,9 +1393,19 @@ void TextureCache<P>::DeleteImage(ImageId image_id) {
        std::erase_if(other_image.aliased_images, [image_id](const AliasedImage& other_alias) {
            return other_alias.id == image_id;
        });
+        other_image.CheckAliasState();
        ASSERT_MSG(num_removed_aliases == 1, "Invalid number of removed aliases: {}",
                   num_removed_aliases);
    }
+    for (const ImageId overlap_id : image.overlapping_images) {
+        ImageBase& other_image = slot_images[overlap_id];
+        [[maybe_unused]] const size_t num_removed_overlaps = std::erase_if(
+            other_image.overlapping_images,
+            [image_id](const ImageId other_overlap_id) { return other_overlap_id == image_id; });
+        other_image.CheckBadOverlapState();
+        ASSERT_MSG(num_removed_overlaps == 1, "Invalid number of removed overlaps: {}",
+                   num_removed_overlaps);
+    }
    for (const ImageViewId image_view_id : image_view_ids) {
        sentenced_image_view.Push(std::move(slot_image_views[image_view_id]));
        slot_image_views.erase(image_view_id);
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 906604a39..4efe042b6 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -47,6 +47,7 @@
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/util.h"
+#include "video_core/textures/astc.h"
#include "video_core/textures/decoders.h"

namespace VideoCommon {
@@ -580,6 +581,8 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
    for (s32 layer = 0; layer < info.resources.layers; ++layer) {
        const std::span<const u8> src = input.subspan(host_offset);
+        gpu_memory.ReadBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes());
+
        SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height,
                       num_tiles.depth, block.height, block.depth);
@@ -884,8 +887,16 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
        ASSERT(copy.image_extent == mip_size);
        ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width));
        ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height));
-        DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent,
-                      output.subspan(output_offset));
+        if (IsPixelFormatASTC(info.format)) {
+            ASSERT(copy.image_extent.depth == 1);
+            Tegra::Texture::ASTC::Decompress(input.subspan(copy.buffer_offset),
+                                             copy.image_extent.width, copy.image_extent.height,
+                                             copy.image_subresource.num_layers, tile_size.width,
+                                             tile_size.height, output.subspan(output_offset));
+        } else {
+            DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent,
+                          output.subspan(output_offset));
+        }
        copy.buffer_offset = output_offset;
        copy.buffer_row_length = mip_size.width;
        copy.buffer_image_height = mip_size.height;
@@ -1087,7 +1098,15 @@ std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const
        return std::nullopt;
    }
    const ImageInfo& existing = image.info;
-    if (False(options & RelaxedOptions::Format)) {
+    if (True(options & RelaxedOptions::Format)) {
+        // Format checking is relaxed, but we still have to check for matching bytes per block.
+        // This avoids creating a view for blits on UE4 titles where formats with different bytes
+        // per block are aliased.
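For instance, viewing an 8-bytes-per-texel image such as R16G16B16A16_FLOAT through a 4-bytes-per-texel RGBA8 format would make every texel fetch read the wrong address, so the relaxed path still rejects the pair. The guard that follows reduces to a simple footprint comparison; a standalone restatement (not code from this diff, and assuming BytesPerBlock from video_core/surface.h, which for uncompressed formats is just the byte size of one texel):

    bool SameTexelFootprint(VideoCore::Surface::PixelFormat existing,
                            VideoCore::Surface::PixelFormat candidate) {
        using VideoCore::Surface::BytesPerBlock;
        return BytesPerBlock(existing) == BytesPerBlock(candidate);
    }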
+        if (BytesPerBlock(existing.format) != BytesPerBlock(candidate.format)) {
+            return std::nullopt;
+        }
+    } else {
+        // Format compatibility is not relaxed; ensure we are creating a view on a compatible format
        if (!IsViewCompatible(existing.format, candidate.format, broken_views, native_bgr)) {
            return std::nullopt;
        }
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
new file mode 100644
index 000000000..7b756ba41
--- /dev/null
+++ b/src/video_core/textures/astc.cpp
@@ -0,0 +1,1579 @@
+// Copyright 2016 The University of North Carolina at Chapel Hill
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Please send all BUG REPORTS to <pavel@cs.unc.edu>.
+// <http://gamma.cs.unc.edu/FasTC/>
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <span>
+#include <vector>
+
+#include <boost/container/static_vector.hpp>
+
+#include "common/common_types.h"
+#include "video_core/textures/astc.h"
+
+class InputBitStream {
+public:
+    constexpr explicit InputBitStream(std::span<const u8> data, size_t start_offset = 0)
+        : cur_byte{data.data()}, total_bits{data.size()}, next_bit{start_offset % 8} {}
+
+    constexpr size_t GetBitsRead() const {
+        return bits_read;
+    }
+
+    constexpr bool ReadBit() {
+        if (bits_read >= total_bits * 8) {
+            return false;
+        }
+        const bool bit = ((*cur_byte >> next_bit) & 1) != 0;
+        ++next_bit;
+        while (next_bit >= 8) {
+            next_bit -= 8;
+            ++cur_byte;
+        }
+        ++bits_read;
+        return bit;
+    }
+
+    constexpr u32 ReadBits(std::size_t nBits) {
+        u32 ret = 0;
+        for (std::size_t i = 0; i < nBits; ++i) {
+            ret |= (ReadBit() & 1) << i;
+        }
+        return ret;
+    }
+
+    template <std::size_t nBits>
+    constexpr u32 ReadBits() {
+        u32 ret = 0;
+        for (std::size_t i = 0; i < nBits; ++i) {
+            ret |= (ReadBit() & 1) << i;
+        }
+        return ret;
+    }
+
+private:
+    const u8* cur_byte;
+    size_t total_bits = 0;
+    size_t next_bit = 0;
+    size_t bits_read = 0;
+};
+
+class OutputBitStream {
+public:
+    constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
+        : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
+
+    constexpr std::size_t GetBitsWritten() const {
+        return bits_written;
+    }
+
+    constexpr void WriteBitsR(u32 val, u32 nBits) {
+        for (u32 i = 0; i < nBits; i++) {
+            WriteBit((val >> (nBits - i - 1)) & 1);
+        }
+    }
+
+    constexpr void WriteBits(u32 val, u32 nBits) {
+        for (u32 i = 0; i < nBits; i++) {
+            WriteBit((val >> i) & 1);
+        }
+    }
+
+private:
+    constexpr void WriteBit(bool b) {
+        if (bits_written >= num_bits) {
+            return;
+        }
+
+        const u32 mask = 1 << next_bit++;
+
+        // clear the bit
+        *cur_byte &= static_cast<u8>(~mask);
+
+        // Write the bit, if necessary
+        if (b)
+            *cur_byte |= static_cast<u8>(mask);
+
+        // Next byte?
+        if (next_bit >= 8) {
+            cur_byte += 1;
+            next_bit = 0;
+        }
+    }
+
+    u8* cur_byte;
+    std::size_t num_bits;
+    std::size_t bits_written = 0;
+    std::size_t next_bit = 0;
+};
+
+template <typename IntType>
+class Bits {
+public:
+    explicit Bits(const IntType& v) : m_Bits(v) {}
+
+    Bits(const Bits&) = delete;
+    Bits& operator=(const Bits&) = delete;
+
+    u8 operator[](u32 bitPos) const {
+        return static_cast<u8>((m_Bits >> bitPos) & 1);
+    }
+
+    IntType operator()(u32 start, u32 end) const {
+        if (start == end) {
+            return (*this)[start];
+        } else if (start > end) {
+            u32 t = start;
+            start = end;
+            end = t;
+        }
+
+        u64 mask = (1 << (end - start + 1)) - 1;
+        return (m_Bits >> start) & static_cast<IntType>(mask);
+    }
+
+private:
+    const IntType& m_Bits;
+};
+
+namespace Tegra::Texture::ASTC {
+using IntegerEncodedVector = boost::container::static_vector<
+    IntegerEncodedValue, 256,
+    boost::container::static_vector_options<
+        boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
+        boost::container::throw_on_overflow<false>>::type>;
+
+static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
+    // Implement the algorithm in section C.2.12
+    std::array<u32, 5> m;
+    std::array<u32, 5> t;
+    u32 T;
+
+    // Read the trit encoded block according to
+    // table C.2.14
+    m[0] = bits.ReadBits(nBitsPerValue);
+    T = bits.ReadBits<2>();
+    m[1] = bits.ReadBits(nBitsPerValue);
+    T |= bits.ReadBits<2>() << 2;
+    m[2] = bits.ReadBits(nBitsPerValue);
+    T |= bits.ReadBit() << 4;
+    m[3] = bits.ReadBits(nBitsPerValue);
+    T |= bits.ReadBits<2>() << 5;
+    m[4] = bits.ReadBits(nBitsPerValue);
+    T |= bits.ReadBit() << 7;
+
+    u32 C = 0;
+
+    Bits<u32> Tb(T);
+    if (Tb(2, 4) == 7) {
+        C = (Tb(5, 7) << 2) | Tb(0, 1);
+        t[4] = t[3] = 2;
+    } else {
+        C = Tb(0, 4);
+        if (Tb(5, 6) == 3) {
+            t[4] = 2;
+            t[3] = Tb[7];
+        } else {
+            t[4] = Tb[7];
+            t[3] = Tb(5, 6);
+        }
+    }
+
+    Bits<u32> Cb(C);
+    if (Cb(0, 1) == 3) {
+        t[2] = 2;
+        t[1] = Cb[4];
+        t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
+    } else if (Cb(2, 3) == 3) {
+        t[2] = 2;
+        t[1] = 2;
+        t[0] = Cb(0, 1);
+    } else {
+        t[2] = Cb[4];
+        t[1] = Cb(2, 3);
+        t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
+    }
+
+    for (std::size_t i = 0; i < 5; ++i) {
+        IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Trit, nBitsPerValue);
+        val.bit_value = m[i];
+        val.trit_value = t[i];
+    }
+}
+
+static void DecodeQuintBlock(InputBitStream& bits, IntegerEncodedVector& result,
+                             u32 nBitsPerValue) {
+    // Implement the algorithm in section C.2.12
+    u32 m[3];
+    u32 q[3];
+    u32 Q;
+
+    // Read the quint encoded block according to
+    // table C.2.15
+    m[0] = bits.ReadBits(nBitsPerValue);
+    Q = bits.ReadBits<3>();
+    m[1] = bits.ReadBits(nBitsPerValue);
+    Q |= bits.ReadBits<2>() << 3;
+    m[2] = bits.ReadBits(nBitsPerValue);
+    Q |= bits.ReadBits<2>() << 5;
+
+    Bits<u32> Qb(Q);
+    if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
+        q[0] = q[1] = 4;
+        q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
+    } else {
+        u32 C = 0;
+        if (Qb(1, 2) == 3) {
+            q[2] = 4;
+            C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
+        } else {
+            q[2] = Qb(5, 6);
+            C = Qb(0, 4);
+        }
+
+        Bits<u32> Cb(C);
+        if (Cb(0, 2) == 5) {
+            q[1] = 4;
+            q[0] = Cb(3, 4);
+        } else {
+            q[1] = Cb(3, 4);
+            q[0] = Cb(0, 2);
+        }
+    }
+
+    for (std::size_t i = 0; i < 3; ++i) {
+        IntegerEncodedValue& val = result.emplace_back(IntegerEncoding::Quint, nBitsPerValue);
+        val.bit_value = m[i];
+        val.quint_value = q[i];
+    }
+}
+
+// Fills result with the values that are encoded in the given
+// bitstream.
We must know beforehand what the maximum possible +// value is, and how many values we're decoding. +static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange, + u32 nValues) { + // Determine encoding parameters + IntegerEncodedValue val = ASTC_ENCODINGS_VALUES[maxRange]; + + // Start decoding + u32 nValsDecoded = 0; + while (nValsDecoded < nValues) { + switch (val.encoding) { + case IntegerEncoding::Quint: + DecodeQuintBlock(bits, result, val.num_bits); + nValsDecoded += 3; + break; + + case IntegerEncoding::Trit: + DecodeTritBlock(bits, result, val.num_bits); + nValsDecoded += 5; + break; + + case IntegerEncoding::JustBits: + val.bit_value = bits.ReadBits(val.num_bits); + result.push_back(val); + nValsDecoded++; + break; + } + } +} + +struct TexelWeightParams { + u32 m_Width = 0; + u32 m_Height = 0; + bool m_bDualPlane = false; + u32 m_MaxWeight = 0; + bool m_bError = false; + bool m_bVoidExtentLDR = false; + bool m_bVoidExtentHDR = false; + + u32 GetPackedBitSize() const { + // How many indices do we have? + u32 nIdxs = m_Height * m_Width; + if (m_bDualPlane) { + nIdxs *= 2; + } + + return ASTC_ENCODINGS_VALUES[m_MaxWeight].GetBitLength(nIdxs); + } + + u32 GetNumWeightValues() const { + u32 ret = m_Width * m_Height; + if (m_bDualPlane) { + ret *= 2; + } + return ret; + } +}; + +static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { + TexelWeightParams params; + + // Read the entire block mode all at once + u16 modeBits = static_cast<u16>(strm.ReadBits<11>()); + + // Does this match the void extent block mode? + if ((modeBits & 0x01FF) == 0x1FC) { + if (modeBits & 0x200) { + params.m_bVoidExtentHDR = true; + } else { + params.m_bVoidExtentLDR = true; + } + + // Next two bits must be one. + if (!(modeBits & 0x400) || !strm.ReadBit()) { + params.m_bError = true; + } + + return params; + } + + // First check if the last four bits are zero + if ((modeBits & 0xF) == 0) { + params.m_bError = true; + return params; + } + + // If the last two bits are zero, then if bits + // [6-8] are all ones, this is also reserved. + if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) { + params.m_bError = true; + return params; + } + + // Otherwise, there is no error... Figure out the layout + // of the block mode. Layout is determined by a number + // between 0 and 9 corresponding to table C.2.8 of the + // ASTC spec. 
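As an aside, GetPackedBitSize above leans on IntegerEncodedValue::GetBitLength, which lives in astc.h and is not shown in this diff. What it computes is the integer sequence encoding cost from the ASTC spec (C.2.22): trits amortize to 8 extra bits per group of 5 values and quints to 7 extra bits per group of 3, on top of the plain bits. A self-contained restatement of that formula, assuming only the spec:

    #include <cstdint>

    enum class Ise { JustBits, Trit, Quint };

    // Bit cost of n ISE-encoded values with the given extra bits per value.
    constexpr std::uint32_t IseBitLength(Ise encoding, std::uint32_t bits, std::uint32_t n) {
        std::uint32_t total = n * bits;
        if (encoding == Ise::Trit) {
            total += (n * 8 + 4) / 5; // 5 trits pack into one 8-bit group
        } else if (encoding == Ise::Quint) {
            total += (n * 7 + 2) / 3; // 3 quints pack into one 7-bit group
        }
        return total;
    }

With that cost model in mind, the layout decode below picks the weight grid dimensions from table C.2.8.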
+ u32 layout = 0; + + if ((modeBits & 0x1) || (modeBits & 0x2)) { + // layout is in [0-4] + if (modeBits & 0x8) { + // layout is in [2-4] + if (modeBits & 0x4) { + // layout is in [3-4] + if (modeBits & 0x100) { + layout = 4; + } else { + layout = 3; + } + } else { + layout = 2; + } + } else { + // layout is in [0-1] + if (modeBits & 0x4) { + layout = 1; + } else { + layout = 0; + } + } + } else { + // layout is in [5-9] + if (modeBits & 0x100) { + // layout is in [7-9] + if (modeBits & 0x80) { + // layout is in [7-8] + assert((modeBits & 0x40) == 0U); + if (modeBits & 0x20) { + layout = 8; + } else { + layout = 7; + } + } else { + layout = 9; + } + } else { + // layout is in [5-6] + if (modeBits & 0x80) { + layout = 6; + } else { + layout = 5; + } + } + } + + assert(layout < 10); + + // Determine R + u32 R = !!(modeBits & 0x10); + if (layout < 5) { + R |= (modeBits & 0x3) << 1; + } else { + R |= (modeBits & 0xC) >> 1; + } + assert(2 <= R && R <= 7); + + // Determine width & height + switch (layout) { + case 0: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; + params.m_Width = B + 4; + params.m_Height = A + 2; + break; + } + + case 1: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; + params.m_Width = B + 8; + params.m_Height = A + 2; + break; + } + + case 2: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x3; + params.m_Width = A + 2; + params.m_Height = B + 8; + break; + } + + case 3: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x1; + params.m_Width = A + 2; + params.m_Height = B + 6; + break; + } + + case 4: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 7) & 0x1; + params.m_Width = B + 2; + params.m_Height = A + 2; + break; + } + + case 5: { + u32 A = (modeBits >> 5) & 0x3; + params.m_Width = 12; + params.m_Height = A + 2; + break; + } + + case 6: { + u32 A = (modeBits >> 5) & 0x3; + params.m_Width = A + 2; + params.m_Height = 12; + break; + } + + case 7: { + params.m_Width = 6; + params.m_Height = 10; + break; + } + + case 8: { + params.m_Width = 10; + params.m_Height = 6; + break; + } + + case 9: { + u32 A = (modeBits >> 5) & 0x3; + u32 B = (modeBits >> 9) & 0x3; + params.m_Width = A + 6; + params.m_Height = B + 6; + break; + } + + default: + assert(false && "Don't know this layout..."); + params.m_bError = true; + break; + } + + // Determine whether or not we're using dual planes + // and/or high precision layouts. + bool D = (layout != 9) && (modeBits & 0x400); + bool H = (layout != 9) && (modeBits & 0x200); + + if (H) { + const u32 maxWeights[6] = {9, 11, 15, 19, 23, 31}; + params.m_MaxWeight = maxWeights[R - 2]; + } else { + const u32 maxWeights[6] = {1, 2, 3, 4, 5, 7}; + params.m_MaxWeight = maxWeights[R - 2]; + } + + params.m_bDualPlane = D; + + return params; +} + +static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, + u32 blockHeight) { + // Don't actually care about the void extent, just read the bits... 
+ for (s32 i = 0; i < 4; ++i) { + strm.ReadBits<13>(); + } + + // Decode the RGBA components and renormalize them to the range [0, 255] + u16 r = static_cast<u16>(strm.ReadBits<16>()); + u16 g = static_cast<u16>(strm.ReadBits<16>()); + u16 b = static_cast<u16>(strm.ReadBits<16>()); + u16 a = static_cast<u16>(strm.ReadBits<16>()); + + u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | + (static_cast<u32>(a) & 0xFF00) << 16; + + for (u32 j = 0; j < blockHeight; j++) { + for (u32 i = 0; i < blockWidth; i++) { + outBuf[j * blockWidth + i] = rgba; + } + } +} + +static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { + for (u32 j = 0; j < blockHeight; j++) { + for (u32 i = 0; i < blockWidth; i++) { + outBuf[j * blockWidth + i] = 0xFFFF00FF; + } + } +} + +static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); +static constexpr u32 ReplicateByteTo16(std::size_t value) { + return REPLICATE_BYTE_TO_16_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>(); +static constexpr u32 ReplicateBitTo7(std::size_t value) { + return REPLICATE_BIT_TO_7_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>(); +static constexpr u32 ReplicateBitTo9(std::size_t value) { + return REPLICATE_BIT_TO_9_TABLE[value]; +} + +static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>(); +static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>(); +static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); +static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); +static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); +/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback +/// to the runtime implementation +static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_8_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_8_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_8_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_8_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_8_TABLE[value]; + case 6: + return REPLICATE_6_BIT_TO_8_TABLE[value]; + case 7: + return REPLICATE_7_BIT_TO_8_TABLE[value]; + case 8: + return REPLICATE_8_BIT_TO_8_TABLE[value]; + default: + return Replicate(value, num_bits, 8); + } +} + +static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>(); +static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>(); +static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>(); +static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>(); +static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>(); +static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_6_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_6_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_6_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_6_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_6_TABLE[value]; + default: + return Replicate(value, num_bits, 6); + } +} + +class Pixel { +protected: + using ChannelType = s16; + u8 m_BitDepth[4] = {8, 8, 8, 8}; + s16 color[4] = {}; + +public: + Pixel() = default; + Pixel(u32 a, u32 r, u32 g, u32 b, u32 
bitDepth = 8) + : m_BitDepth{u8(bitDepth), u8(bitDepth), u8(bitDepth), u8(bitDepth)}, + color{static_cast<ChannelType>(a), static_cast<ChannelType>(r), + static_cast<ChannelType>(g), static_cast<ChannelType>(b)} {} + + // Changes the depth of each pixel. This scales the values to + // the appropriate bit depth by either truncating the least + // significant bits when going from larger to smaller bit depth + // or by repeating the most significant bits when going from + // smaller to larger bit depths. + void ChangeBitDepth() { + for (u32 i = 0; i < 4; i++) { + Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]); + m_BitDepth[i] = 8; + } + } + + template <typename IntType> + static float ConvertChannelToFloat(IntType channel, u8 bitDepth) { + float denominator = static_cast<float>((1 << bitDepth) - 1); + return static_cast<float>(channel) / denominator; + } + + // Changes the bit depth of a single component. See the comment + // above for how we do this. + static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) { + assert(oldDepth <= 8); + + if (oldDepth == 8) { + // Do nothing + return val; + } else if (oldDepth == 0) { + return static_cast<ChannelType>((1 << 8) - 1); + } else if (8 > oldDepth) { + return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth)); + } else { + // oldDepth > newDepth + const u8 bitsWasted = static_cast<u8>(oldDepth - 8); + u16 v = static_cast<u16>(val); + v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted); + v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1)); + return static_cast<u8>(v); + } + + assert(false && "We shouldn't get here."); + return 0; + } + + const ChannelType& A() const { + return color[0]; + } + ChannelType& A() { + return color[0]; + } + const ChannelType& R() const { + return color[1]; + } + ChannelType& R() { + return color[1]; + } + const ChannelType& G() const { + return color[2]; + } + ChannelType& G() { + return color[2]; + } + const ChannelType& B() const { + return color[3]; + } + ChannelType& B() { + return color[3]; + } + const ChannelType& Component(u32 idx) const { + return color[idx]; + } + ChannelType& Component(u32 idx) { + return color[idx]; + } + + void GetBitDepth(u8 (&outDepth)[4]) const { + for (s32 i = 0; i < 4; i++) { + outDepth[i] = m_BitDepth[i]; + } + } + + // Take all of the components, transform them to their 8-bit variants, + // and then pack each channel into an R8G8B8A8 32-bit integer. We assume + // that the architecture is little-endian, so the alpha channel will end + // up in the most-significant byte. + u32 Pack() const { + Pixel eightBit(*this); + eightBit.ChangeBitDepth(); + + u32 r = 0; + r |= eightBit.A(); + r <<= 8; + r |= eightBit.B(); + r <<= 8; + r |= eightBit.G(); + r <<= 8; + r |= eightBit.R(); + return r; + } + + // Clamps the pixel to the range [0,255] + void ClampByte() { + for (u32 i = 0; i < 4; i++) { + color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]); + } + } + + void MakeOpaque() { + A() = 255; + } +}; + +static void DecodeColorValues(u32* out, std::span<u8> data, const u32* modes, const u32 nPartitions, + const u32 nBitsForColorData) { + // First figure out how many color values we have + u32 nValues = 0; + for (u32 i = 0; i < nPartitions; i++) { + nValues += ((modes[i] >> 2) + 1) << 1; + } + + // Then based on the number of values and the remaining number of bits, + // figure out the max value for each of them... 
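The search implemented by the following lines can be restated on its own: walk down from the largest quantization range until the encoded size of nValues fits in the available color bits, then snap to the smallest range that shares the same encoding parameters, since several consecutive ranges map to one trit/quint/bit layout. An equivalent standalone sketch, using the same ASTC_ENCODINGS_VALUES table and the MatchesEncoding helper from astc.h (illustrative, not a drop-in replacement):

    u32 FindColorValueRange(u32 num_values, u32 available_bits) {
        u32 range = 255;
        while (range > 0 &&
               ASTC_ENCODINGS_VALUES[range].GetBitLength(num_values) > available_bits) {
            --range; // shrink until the encoding fits
        }
        const IntegerEncodedValue val = ASTC_ENCODINGS_VALUES[range];
        while (range > 0 && ASTC_ENCODINGS_VALUES[range - 1].MatchesEncoding(val)) {
            --range; // snap to the smallest range with identical encoding
        }
        return range;
    }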
+ u32 range = 256; + while (--range > 0) { + IntegerEncodedValue val = ASTC_ENCODINGS_VALUES[range]; + u32 bitLength = val.GetBitLength(nValues); + if (bitLength <= nBitsForColorData) { + // Find the smallest possible range that matches the given encoding + while (--range > 0) { + IntegerEncodedValue newval = ASTC_ENCODINGS_VALUES[range]; + if (!newval.MatchesEncoding(val)) { + break; + } + } + + // Return to last matching range. + range++; + break; + } + } + + // We now have enough to decode our integer sequence. + IntegerEncodedVector decodedColorValues; + + InputBitStream colorStream(data, 0); + DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); + + // Once we have the decoded values, we need to dequantize them to the 0-255 range + // This procedure is outlined in ASTC spec C.2.13 + u32 outIdx = 0; + for (auto itr = decodedColorValues.begin(); itr != decodedColorValues.end(); ++itr) { + // Have we already decoded all that we need? + if (outIdx >= nValues) { + break; + } + + const IntegerEncodedValue& val = *itr; + u32 bitlen = val.num_bits; + u32 bitval = val.bit_value; + + assert(bitlen >= 1); + + u32 A = 0, B = 0, C = 0, D = 0; + // A is just the lsb replicated 9 times. + A = ReplicateBitTo9(bitval & 1); + + switch (val.encoding) { + // Replicate bits + case IntegerEncoding::JustBits: + out[outIdx++] = FastReplicateTo8(bitval, bitlen); + break; + + // Use algorithm in C.2.13 + case IntegerEncoding::Trit: { + + D = val.trit_value; + + switch (bitlen) { + case 1: { + C = 204; + } break; + + case 2: { + C = 93; + // B = b000b0bb0 + u32 b = (bitval >> 1) & 1; + B = (b << 8) | (b << 4) | (b << 2) | (b << 1); + } break; + + case 3: { + C = 44; + // B = cb000cbcb + u32 cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 2) | cb; + } break; + + case 4: { + C = 22; + // B = dcb000dcb + u32 dcb = (bitval >> 1) & 7; + B = (dcb << 6) | dcb; + } break; + + case 5: { + C = 11; + // B = edcb000ed + u32 edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 2); + } break; + + case 6: { + C = 5; + // B = fedcb000f + u32 fedcb = (bitval >> 1) & 0x1F; + B = (fedcb << 4) | (fedcb >> 4); + } break; + + default: + assert(false && "Unsupported trit encoding for color values!"); + break; + } // switch(bitlen) + } // case IntegerEncoding::Trit + break; + + case IntegerEncoding::Quint: { + + D = val.quint_value; + + switch (bitlen) { + case 1: { + C = 113; + } break; + + case 2: { + C = 54; + // B = b0000bb00 + u32 b = (bitval >> 1) & 1; + B = (b << 8) | (b << 3) | (b << 2); + } break; + + case 3: { + C = 26; + // B = cb0000cbc + u32 cb = (bitval >> 1) & 3; + B = (cb << 7) | (cb << 1) | (cb >> 1); + } break; + + case 4: { + C = 13; + // B = dcb0000dc + u32 dcb = (bitval >> 1) & 7; + B = (dcb << 6) | (dcb >> 1); + } break; + + case 5: { + C = 6; + // B = edcb0000e + u32 edcb = (bitval >> 1) & 0xF; + B = (edcb << 5) | (edcb >> 3); + } break; + + default: + assert(false && "Unsupported quint encoding for color values!"); + break; + } // switch(bitlen) + } // case IntegerEncoding::Quint + break; + } // switch(val.encoding) + + if (val.encoding != IntegerEncoding::JustBits) { + u32 T = D * C + B; + T ^= A; + T = (A & 0x80) | (T >> 2); + out[outIdx++] = T; + } + } + + // Make sure that each of our values is in the proper range... 
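To make the A/B/C/D machinery concrete, here is one fully worked dequantization (the trit encoding with one extra bit, i.e. the bitlen == 1 case above), using bit_value = 1 and trit_value = 2:

    u32 A = 0x1FF;             // the lsb (1) replicated 9 times
    u32 B = 0, C = 204, D = 2; // constants for the bitlen == 1 trit case
    u32 T = D * C + B;         // 408
    T ^= A;                    // 103
    T = (A & 0x80) | (T >> 2); // 128 | 25 = 153

153 is one of the six evenly spaced levels {0, 51, 102, 153, 204, 255} that a one-bit-plus-trit encoding can produce, which is exactly the range the assertions that follow check for.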
+ for (u32 i = 0; i < nValues; i++) { + assert(out[i] <= 255); + } +} + +static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { + u32 bitval = val.bit_value; + u32 bitlen = val.num_bits; + + u32 A = ReplicateBitTo7(bitval & 1); + u32 B = 0, C = 0, D = 0; + + u32 result = 0; + switch (val.encoding) { + case IntegerEncoding::JustBits: + result = FastReplicateTo6(bitval, bitlen); + break; + + case IntegerEncoding::Trit: { + D = val.trit_value; + assert(D < 3); + + switch (bitlen) { + case 0: { + u32 results[3] = {0, 32, 63}; + result = results[D]; + } break; + + case 1: { + C = 50; + } break; + + case 2: { + C = 23; + u32 b = (bitval >> 1) & 1; + B = (b << 6) | (b << 2) | b; + } break; + + case 3: { + C = 11; + u32 cb = (bitval >> 1) & 3; + B = (cb << 5) | cb; + } break; + + default: + assert(false && "Invalid trit encoding for texel weight"); + break; + } + } break; + + case IntegerEncoding::Quint: { + D = val.quint_value; + assert(D < 5); + + switch (bitlen) { + case 0: { + u32 results[5] = {0, 16, 32, 47, 63}; + result = results[D]; + } break; + + case 1: { + C = 28; + } break; + + case 2: { + C = 13; + u32 b = (bitval >> 1) & 1; + B = (b << 6) | (b << 1); + } break; + + default: + assert(false && "Invalid quint encoding for texel weight"); + break; + } + } break; + } + + if (val.encoding != IntegerEncoding::JustBits && bitlen > 0) { + // Decode the value... + result = D * C + B; + result ^= A; + result = (A & 0x20) | (result >> 2); + } + + assert(result < 64); + + // Change from [0,63] to [0,64] + if (result > 32) { + result += 1; + } + + return result; +} + +static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights, + const TexelWeightParams& params, const u32 blockWidth, + const u32 blockHeight) { + u32 weightIdx = 0; + u32 unquantized[2][144]; + + for (auto itr = weights.begin(); itr != weights.end(); ++itr) { + unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr); + + if (params.m_bDualPlane) { + ++itr; + unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr); + if (itr == weights.end()) { + break; + } + } + + if (++weightIdx >= (params.m_Width * params.m_Height)) + break; + } + + // Do infill if necessary (Section C.2.18) ... + u32 Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1); + u32 Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1); + + const u32 kPlaneScale = params.m_bDualPlane ? 
2U : 1U; + for (u32 plane = 0; plane < kPlaneScale; plane++) + for (u32 t = 0; t < blockHeight; t++) + for (u32 s = 0; s < blockWidth; s++) { + u32 cs = Ds * s; + u32 ct = Dt * t; + + u32 gs = (cs * (params.m_Width - 1) + 32) >> 6; + u32 gt = (ct * (params.m_Height - 1) + 32) >> 6; + + u32 js = gs >> 4; + u32 fs = gs & 0xF; + + u32 jt = gt >> 4; + u32 ft = gt & 0x0F; + + u32 w11 = (fs * ft + 8) >> 4; + u32 w10 = ft - w11; + u32 w01 = fs - w11; + u32 w00 = 16 - fs - ft + w11; + + u32 v0 = js + jt * params.m_Width; + +#define FIND_TEXEL(tidx, bidx) \ + u32 p##bidx = 0; \ + do { \ + if ((tidx) < (params.m_Width * params.m_Height)) { \ + p##bidx = unquantized[plane][(tidx)]; \ + } \ + } while (0) + + FIND_TEXEL(v0, 00); + FIND_TEXEL(v0 + 1, 01); + FIND_TEXEL(v0 + params.m_Width, 10); + FIND_TEXEL(v0 + params.m_Width + 1, 11); + +#undef FIND_TEXEL + + out[plane][t * blockWidth + s] = + (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4; + } +} + +// Transfers a bit as described in C.2.14 +static inline void BitTransferSigned(int& a, int& b) { + b >>= 1; + b |= a & 0x80; + a >>= 1; + a &= 0x3F; + if (a & 0x20) + a -= 0x40; +} + +// Adds more precision to the blue channel as described +// in C.2.14 +static inline Pixel BlueContract(s32 a, s32 r, s32 g, s32 b) { + return Pixel(static_cast<s16>(a), static_cast<s16>((r + b) >> 1), + static_cast<s16>((g + b) >> 1), static_cast<s16>(b)); +} + +// Partition selection functions as specified in +// C.2.21 +static inline u32 hash52(u32 p) { + p ^= p >> 15; + p -= p << 17; + p += p << 7; + p += p << 4; + p ^= p >> 5; + p += p << 16; + p ^= p >> 7; + p ^= p >> 3; + p ^= p << 6; + p ^= p >> 17; + return p; +} + +static u32 SelectPartition(s32 seed, s32 x, s32 y, s32 z, s32 partitionCount, s32 smallBlock) { + if (1 == partitionCount) + return 0; + + if (smallBlock) { + x <<= 1; + y <<= 1; + z <<= 1; + } + + seed += (partitionCount - 1) * 1024; + + u32 rnum = hash52(static_cast<u32>(seed)); + u8 seed1 = static_cast<u8>(rnum & 0xF); + u8 seed2 = static_cast<u8>((rnum >> 4) & 0xF); + u8 seed3 = static_cast<u8>((rnum >> 8) & 0xF); + u8 seed4 = static_cast<u8>((rnum >> 12) & 0xF); + u8 seed5 = static_cast<u8>((rnum >> 16) & 0xF); + u8 seed6 = static_cast<u8>((rnum >> 20) & 0xF); + u8 seed7 = static_cast<u8>((rnum >> 24) & 0xF); + u8 seed8 = static_cast<u8>((rnum >> 28) & 0xF); + u8 seed9 = static_cast<u8>((rnum >> 18) & 0xF); + u8 seed10 = static_cast<u8>((rnum >> 22) & 0xF); + u8 seed11 = static_cast<u8>((rnum >> 26) & 0xF); + u8 seed12 = static_cast<u8>(((rnum >> 30) | (rnum << 2)) & 0xF); + + seed1 = static_cast<u8>(seed1 * seed1); + seed2 = static_cast<u8>(seed2 * seed2); + seed3 = static_cast<u8>(seed3 * seed3); + seed4 = static_cast<u8>(seed4 * seed4); + seed5 = static_cast<u8>(seed5 * seed5); + seed6 = static_cast<u8>(seed6 * seed6); + seed7 = static_cast<u8>(seed7 * seed7); + seed8 = static_cast<u8>(seed8 * seed8); + seed9 = static_cast<u8>(seed9 * seed9); + seed10 = static_cast<u8>(seed10 * seed10); + seed11 = static_cast<u8>(seed11 * seed11); + seed12 = static_cast<u8>(seed12 * seed12); + + s32 sh1, sh2, sh3; + if (seed & 1) { + sh1 = (seed & 2) ? 4 : 5; + sh2 = (partitionCount == 3) ? 6 : 5; + } else { + sh1 = (partitionCount == 3) ? 6 : 5; + sh2 = (seed & 2) ? 4 : 5; + } + sh3 = (seed & 0x10) ? 
sh1 : sh2; + + seed1 = static_cast<u8>(seed1 >> sh1); + seed2 = static_cast<u8>(seed2 >> sh2); + seed3 = static_cast<u8>(seed3 >> sh1); + seed4 = static_cast<u8>(seed4 >> sh2); + seed5 = static_cast<u8>(seed5 >> sh1); + seed6 = static_cast<u8>(seed6 >> sh2); + seed7 = static_cast<u8>(seed7 >> sh1); + seed8 = static_cast<u8>(seed8 >> sh2); + seed9 = static_cast<u8>(seed9 >> sh3); + seed10 = static_cast<u8>(seed10 >> sh3); + seed11 = static_cast<u8>(seed11 >> sh3); + seed12 = static_cast<u8>(seed12 >> sh3); + + s32 a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); + s32 b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); + s32 c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); + s32 d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + + a &= 0x3F; + b &= 0x3F; + c &= 0x3F; + d &= 0x3F; + + if (partitionCount < 4) + d = 0; + if (partitionCount < 3) + c = 0; + + if (a >= b && a >= c && a >= d) + return 0; + else if (b >= c && b >= d) + return 1; + else if (c >= d) + return 2; + return 3; +} + +static inline u32 Select2DPartition(s32 seed, s32 x, s32 y, s32 partitionCount, s32 smallBlock) { + return SelectPartition(seed, x, y, 0, partitionCount, smallBlock); +} + +// Section C.2.14 +static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues, + u32 colorEndpointMode) { +#define READ_UINT_VALUES(N) \ + u32 v[N]; \ + for (u32 i = 0; i < N; i++) { \ + v[i] = *(colorValues++); \ + } + +#define READ_INT_VALUES(N) \ + s32 v[N]; \ + for (u32 i = 0; i < N; i++) { \ + v[i] = static_cast<int>(*(colorValues++)); \ + } + + switch (colorEndpointMode) { + case 0: { + READ_UINT_VALUES(2) + ep1 = Pixel(0xFF, v[0], v[0], v[0]); + ep2 = Pixel(0xFF, v[1], v[1], v[1]); + } break; + + case 1: { + READ_UINT_VALUES(2) + u32 L0 = (v[0] >> 2) | (v[1] & 0xC0); + u32 L1 = std::min(L0 + (v[1] & 0x3F), 0xFFU); + ep1 = Pixel(0xFF, L0, L0, L0); + ep2 = Pixel(0xFF, L1, L1, L1); + } break; + + case 4: { + READ_UINT_VALUES(4) + ep1 = Pixel(v[2], v[0], v[0], v[0]); + ep2 = Pixel(v[3], v[1], v[1], v[1]); + } break; + + case 5: { + READ_INT_VALUES(4) + BitTransferSigned(v[1], v[0]); + BitTransferSigned(v[3], v[2]); + ep1 = Pixel(v[2], v[0], v[0], v[0]); + ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]); + ep1.ClampByte(); + ep2.ClampByte(); + } break; + + case 6: { + READ_UINT_VALUES(4) + ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); + ep2 = Pixel(0xFF, v[0], v[1], v[2]); + } break; + + case 8: { + READ_UINT_VALUES(6) + if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { + ep1 = Pixel(0xFF, v[0], v[2], v[4]); + ep2 = Pixel(0xFF, v[1], v[3], v[5]); + } else { + ep1 = BlueContract(0xFF, v[1], v[3], v[5]); + ep2 = BlueContract(0xFF, v[0], v[2], v[4]); + } + } break; + + case 9: { + READ_INT_VALUES(6) + BitTransferSigned(v[1], v[0]); + BitTransferSigned(v[3], v[2]); + BitTransferSigned(v[5], v[4]); + if (v[1] + v[3] + v[5] >= 0) { + ep1 = Pixel(0xFF, v[0], v[2], v[4]); + ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]); + } else { + ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]); + ep2 = BlueContract(0xFF, v[0], v[2], v[4]); + } + ep1.ClampByte(); + ep2.ClampByte(); + } break; + + case 10: { + READ_UINT_VALUES(6) + ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8); + ep2 = Pixel(v[5], v[0], v[1], v[2]); + } break; + + case 12: { + READ_UINT_VALUES(8) + if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) { + ep1 = Pixel(v[6], v[0], v[2], v[4]); + ep2 = Pixel(v[7], v[1], v[3], v[5]); + } else { + ep1 = BlueContract(v[7], v[1], 
v[3], v[5]); + ep2 = BlueContract(v[6], v[0], v[2], v[4]); + } + } break; + + case 13: { + READ_INT_VALUES(8) + BitTransferSigned(v[1], v[0]); + BitTransferSigned(v[3], v[2]); + BitTransferSigned(v[5], v[4]); + BitTransferSigned(v[7], v[6]); + if (v[1] + v[3] + v[5] >= 0) { + ep1 = Pixel(v[6], v[0], v[2], v[4]); + ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]); + } else { + ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]); + ep2 = BlueContract(v[6], v[0], v[2], v[4]); + } + ep1.ClampByte(); + ep2.ClampByte(); + } break; + + default: + assert(false && "Unsupported color endpoint mode (is it HDR?)"); + break; + } + +#undef READ_UINT_VALUES +#undef READ_INT_VALUES +} + +static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, + const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { + InputBitStream strm(inBuf); + TexelWeightParams weightParams = DecodeBlockInfo(strm); + + // Was there an error? + if (weightParams.m_bError) { + assert(false && "Invalid block mode"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + if (weightParams.m_bVoidExtentLDR) { + FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight); + return; + } + + if (weightParams.m_bVoidExtentHDR) { + assert(false && "HDR void extent blocks are unsupported!"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + if (weightParams.m_Width > blockWidth) { + assert(false && "Texel weight grid width should be smaller than block width"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + if (weightParams.m_Height > blockHeight) { + assert(false && "Texel weight grid height should be smaller than block height"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + // Read num partitions + u32 nPartitions = strm.ReadBits<2>() + 1; + assert(nPartitions <= 4); + + if (nPartitions == 4 && weightParams.m_bDualPlane) { + assert(false && "Dual plane mode is incompatible with four partition blocks"); + FillError(outBuf, blockWidth, blockHeight); + return; + } + + // Based on the number of partitions, read the color endpoint mode for + // each partition. + + // Determine partitions, partition index, and color endpoint modes + s32 planeIdx = -1; + u32 partitionIndex; + u32 colorEndpointMode[4] = {0, 0, 0, 0}; + + // Define color data. + u8 colorEndpointData[16]; + memset(colorEndpointData, 0, sizeof(colorEndpointData)); + OutputBitStream colorEndpointStream(colorEndpointData, 16 * 8, 0); + + // Read extra config data... + u32 baseCEM = 0; + if (nPartitions == 1) { + colorEndpointMode[0] = strm.ReadBits<4>(); + partitionIndex = 0; + } else { + partitionIndex = strm.ReadBits<10>(); + baseCEM = strm.ReadBits<6>(); + } + u32 baseMode = (baseCEM & 3); + + // Remaining bits are color endpoint data... + u32 nWeightBits = weightParams.GetPackedBitSize(); + s32 remainingBits = 128 - nWeightBits - static_cast<int>(strm.GetBitsRead()); + + // Consider extra bits prior to texel data... + u32 extraCEMbits = 0; + if (baseMode) { + switch (nPartitions) { + case 2: + extraCEMbits += 2; + break; + case 3: + extraCEMbits += 5; + break; + case 4: + extraCEMbits += 8; + break; + default: + assert(false); + break; + } + } + remainingBits -= extraCEMbits; + + // Do we have a dual plane situation? + u32 planeSelectorBits = 0; + if (weightParams.m_bDualPlane) { + planeSelectorBits = 2; + } + remainingBits -= planeSelectorBits; + + // Read color data... 
+    u32 colorDataBits = remainingBits;
+    while (remainingBits > 0) {
+        u32 nb = std::min(remainingBits, 8);
+        u32 b = strm.ReadBits(nb);
+        colorEndpointStream.WriteBits(b, nb);
+        remainingBits -= 8;
+    }
+
+    // Read the plane selection bits
+    planeIdx = strm.ReadBits(planeSelectorBits);
+
+    // Read the rest of the CEM
+    if (baseMode) {
+        u32 extraCEM = strm.ReadBits(extraCEMbits);
+        u32 CEM = (extraCEM << 6) | baseCEM;
+        CEM >>= 2;
+
+        bool C[4] = {0};
+        for (u32 i = 0; i < nPartitions; i++) {
+            C[i] = CEM & 1;
+            CEM >>= 1;
+        }
+
+        u8 M[4] = {0};
+        for (u32 i = 0; i < nPartitions; i++) {
+            M[i] = CEM & 3;
+            CEM >>= 2;
+            assert(M[i] <= 3);
+        }
+
+        for (u32 i = 0; i < nPartitions; i++) {
+            colorEndpointMode[i] = baseMode;
+            if (!(C[i]))
+                colorEndpointMode[i] -= 1;
+            colorEndpointMode[i] <<= 2;
+            colorEndpointMode[i] |= M[i];
+        }
+    } else if (nPartitions > 1) {
+        u32 CEM = baseCEM >> 2;
+        for (u32 i = 0; i < nPartitions; i++) {
+            colorEndpointMode[i] = CEM;
+        }
+    }
+
+    // Make sure everything up till here is sane.
+    for (u32 i = 0; i < nPartitions; i++) {
+        assert(colorEndpointMode[i] < 16);
+    }
+    assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
+
+    // Decode both color data and texel weight data
+    u32 colorValues[32]; // Four values, two endpoints, four maximum partitions
+    DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
+                      colorDataBits);
+
+    Pixel endpoints[4][2];
+    const u32* colorValuesPtr = colorValues;
+    for (u32 i = 0; i < nPartitions; i++) {
+        ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]);
+    }
+
+    // Read the texel weight data...
+    std::array<u8, 16> texelWeightData;
+    std::ranges::copy(inBuf, texelWeightData.begin());
+
+    // Reverse everything
+    for (u32 i = 0; i < 8; i++) {
+// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
+        u8 a = static_cast<u8>(REVERSE_BYTE(texelWeightData[i]));
+        u8 b = static_cast<u8>(REVERSE_BYTE(texelWeightData[15 - i]));
+#undef REVERSE_BYTE
+
+        texelWeightData[i] = b;
+        texelWeightData[15 - i] = a;
+    }
+
+    // Make sure that higher non-texel bits are set to zero
+    const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
+    if (clearByteStart > 0 && clearByteStart <= texelWeightData.size()) {
+        texelWeightData[clearByteStart - 1] &=
+            static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
+        std::memset(texelWeightData.data() + clearByteStart, 0,
+                    std::min(16U - clearByteStart, 16U));
+    }
+
+    IntegerEncodedVector texelWeightValues;
+
+    InputBitStream weightStream(texelWeightData);
+
+    DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight,
+                          weightParams.GetNumWeightValues());
+
+    // Blocks can be at most 12x12, so we can have as many as 144 weights
+    u32 weights[2][144];
+    UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
+
+    // Now that we have endpoints and weights, we can interpolate and generate
+    // the proper decoding...
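One worked example of the interpolation performed by the loop below: take endpoints 0 and 255 for a channel and the midpoint weight 32 of the [0, 64] weight range:

    u32 C0 = 0;     // ReplicateByteTo16(0)
    u32 C1 = 65535; // ReplicateByteTo16(255)
    u32 w = 32;
    u32 C = (C0 * (64 - w) + C1 * w + 32) / 64; // 32768
    // C != 65535, so the stored value is 255 * (32768 / 65536.0) + 0.5 = 128

Expanding to 16 bits before interpolating and then narrowing back keeps full-weight texels exact (weight 64 reproduces the endpoint itself) while rounding intermediate weights to the nearest 8-bit value.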
+ for (u32 j = 0; j < blockHeight; j++) + for (u32 i = 0; i < blockWidth; i++) { + u32 partition = Select2DPartition(partitionIndex, i, j, nPartitions, + (blockHeight * blockWidth) < 32); + assert(partition < nPartitions); + + Pixel p; + for (u32 c = 0; c < 4; c++) { + u32 C0 = endpoints[partition][0].Component(c); + C0 = ReplicateByteTo16(C0); + u32 C1 = endpoints[partition][1].Component(c); + C1 = ReplicateByteTo16(C1); + + u32 plane = 0; + if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { + plane = 1; + } + + u32 weight = weights[plane][j * blockWidth + i]; + u32 C = (C0 * (64 - weight) + C1 * weight + 32) / 64; + if (C == 65535) { + p.Component(c) = 255; + } else { + double Cf = static_cast<double>(C); + p.Component(c) = static_cast<u16>(255.0 * (Cf / 65536.0) + 0.5); + } + } + + outBuf[j * blockWidth + i] = p.Pack(); + } +} + +void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, + uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) { + u32 block_index = 0; + std::size_t depth_offset = 0; + for (u32 z = 0; z < depth; z++) { + for (u32 y = 0; y < height; y += block_height) { + for (u32 x = 0; x < width; x += block_width) { + const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)}; + + // Blocks can be at most 12x12 + std::array<u32, 12 * 12> uncompData; + DecompressBlock(blockPtr, block_width, block_height, uncompData); + + u32 decompWidth = std::min(block_width, width - x); + u32 decompHeight = std::min(block_height, height - y); + + const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4); + for (u32 jj = 0; jj < decompHeight; jj++) { + std::memcpy(outRow.data() + jj * width * 4, + uncompData.data() + jj * block_width, decompWidth * 4); + } + ++block_index; + } + } + depth_offset += height * width * 4; + } +} + +} // namespace Tegra::Texture::ASTC diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index c1c73fda5..0229ae122 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -77,7 +77,7 @@ constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { return encodings; } -constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues(); +constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues(); // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] // is the same as [(num_bits - 1):0] and repeats all the way down. 
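The replication described in the comment above is easiest to see with a concrete case: replicating the 3-bit value 0b101 to 8 bits tiles the pattern from the most significant end, giving 0b10110110 = 182. The same result falls out of plain shifts:

    u32 v = 0b101;                          // 3-bit input
    u32 r = (v << 5) | (v << 2) | (v >> 1); // 0b10110110 == 182

Because every (value, source width) pair maps to a fixed replicated constant, the REPLICATE_*_TABLE arrays kept in this header reduce hot paths such as FastReplicateTo8 in astc.cpp to a single table lookup.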
@@ -116,17 +116,11 @@ constexpr auto MakeReplicateTable() {
     return table;
 }
 
-constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
 constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
 constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
 constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
 
-struct AstcBufferData {
-    decltype(EncodingsValues) encoding_values = EncodingsValues;
-    decltype(REPLICATE_6_BIT_TO_8_TABLE) replicate_6_to_8 = REPLICATE_6_BIT_TO_8_TABLE;
-    decltype(REPLICATE_7_BIT_TO_8_TABLE) replicate_7_to_8 = REPLICATE_7_BIT_TO_8_TABLE;
-    decltype(REPLICATE_8_BIT_TO_8_TABLE) replicate_8_to_8 = REPLICATE_8_BIT_TO_8_TABLE;
-    decltype(REPLICATE_BYTE_TO_16_TABLE) replicate_byte_to_16 = REPLICATE_BYTE_TO_16_TABLE;
-} constexpr ASTC_BUFFER_DATA;
+void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
+                uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
 
 } // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp
index f0ee76519..758c038ba 100644
--- a/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp
+++ b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp
@@ -50,7 +50,7 @@ NsightAftermathTracker::NsightAftermathTracker() {
     }
 
     dump_dir = Common::FS::GetYuzuPath(Common::FS::YuzuPath::LogDir) / "gpucrash";
-    void(Common::FS::RemoveDirRecursively(dump_dir));
+    Common::FS::RemoveDirRecursively(dump_dir);
     if (!Common::FS::CreateDir(dump_dir)) {
         LOG_ERROR(Render_Vulkan, "Failed to create Nsight Aftermath dump directory");
         return;
diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp
index 5c64c9bf7..0f60765bb 100644
--- a/src/video_core/vulkan_common/vulkan_debug_callback.cpp
+++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp
@@ -12,6 +12,14 @@ VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity,
                   VkDebugUtilsMessageTypeFlagsEXT type,
                   const VkDebugUtilsMessengerCallbackDataEXT* data,
                   [[maybe_unused]] void* user_data) {
+    // Skip logging known false-positive validation errors
+    switch (static_cast<u32>(data->messageIdNumber)) {
+    case 0x682a878au: // VUID-vkCmdBindVertexBuffers2EXT-pBuffers-parameter
+    case 0x99fb7dfdu: // UNASSIGNED-RequiredParameter (vkCmdBindVertexBuffers2EXT pBuffers[0])
+        return VK_FALSE;
+    default:
+        break;
+    }
     const std::string_view message{data->pMessage};
     if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
         LOG_CRITICAL(Render_Vulkan, "{}", message);
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 64206b3d2..f214510da 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -408,6 +408,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
     }
     logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld);
 
+    CollectPhysicalMemoryInfo();
     CollectTelemetryParameters();
     CollectToolingInfo();
 
@@ -531,6 +532,27 @@ bool Device::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags want
     return (supported_usage & wanted_usage) == wanted_usage;
 }
 
+std::string Device::GetDriverName() const {
+    switch (driver_id) {
+    case VK_DRIVER_ID_AMD_PROPRIETARY:
+        return "AMD";
+    case VK_DRIVER_ID_AMD_OPEN_SOURCE:
+        return "AMDVLK";
+    case VK_DRIVER_ID_MESA_RADV:
+        return "RADV";
+    case VK_DRIVER_ID_NVIDIA_PROPRIETARY:
+        return "NVIDIA";
+    case VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS:
+        return "INTEL";
+    case VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA:
+        return "ANV";
+    case VK_DRIVER_ID_MESA_LLVMPIPE:
+        return "LAVAPIPE";
+    default:
+        return vendor_name;
+    }
+}
+
 void Device::CheckSuitability(bool requires_swapchain) const {
     std::bitset<REQUIRED_EXTENSIONS.size()> available_extensions;
     bool has_swapchain = false;
@@ -818,6 +840,17 @@ void Device::CollectTelemetryParameters() {
     }
 }
 
+void Device::CollectPhysicalMemoryInfo() {
+    const auto mem_properties = physical.GetMemoryProperties();
+    const size_t num_properties = mem_properties.memoryHeapCount;
+    device_access_memory = 0;
+    for (size_t element = 0; element < num_properties; ++element) {
+        if ((mem_properties.memoryHeaps[element].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) {
+            device_access_memory += mem_properties.memoryHeaps[element].size;
+        }
+    }
+}
+
 void Device::CollectToolingInfo() {
     if (!ext_tooling_info) {
         return;
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 67d70cd22..96c0f8c60 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -45,6 +45,9 @@ public:
     /// Reports a shader to Nsight Aftermath.
     void SaveShader(const std::vector<u32>& spirv) const;
 
+    /// Returns the name of the VkDriverId reported from Vulkan.
+    std::string GetDriverName() const;
+
     /// Returns the dispatch loader with direct function pointers of the device.
     const vk::DeviceDispatch& GetDispatchLoader() const {
         return dld;
@@ -225,6 +228,10 @@ public:
         return use_asynchronous_shaders;
     }
 
+    u64 GetDeviceLocalMemory() const {
+        return device_access_memory;
+    }
+
 private:
     /// Checks if the physical device is suitable.
     void CheckSuitability(bool requires_swapchain) const;
@@ -244,6 +251,9 @@ private:
     /// Collects information about attached tools.
     void CollectToolingInfo();
 
+    /// Collects information about the device's local memory.
+    void CollectPhysicalMemoryInfo();
+
     /// Returns a list of queue initialization descriptors.
     std::vector<VkDeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const;
 
@@ -257,21 +267,22 @@ private:
     bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage,
                            FormatType format_type) const;
 
-    VkInstance instance;                        ///< Vulkan instance.
-    vk::DeviceDispatch dld;                     ///< Device function pointers.
-    vk::PhysicalDevice physical;                ///< Physical device.
-    VkPhysicalDeviceProperties properties;      ///< Device properties.
-    vk::Device logical;                         ///< Logical device.
-    vk::Queue graphics_queue;                   ///< Main graphics queue.
-    vk::Queue present_queue;                    ///< Main present queue.
-    u32 instance_version{};                     ///< Vulkan onstance version.
-    u32 graphics_family{};                      ///< Main graphics queue family index.
-    u32 present_family{};                       ///< Main present queue family index.
-    VkDriverIdKHR driver_id{};                  ///< Driver ID.
-    VkShaderStageFlags guest_warp_stages{};     ///< Stages where the guest warp size can be forced.ed
-    bool is_optimal_astc_supported{};           ///< Support for native ASTC.
-    bool is_float16_supported{};                ///< Support for float16 arithmetics.
-    bool is_warp_potentially_bigger{};          ///< Host warp size can be bigger than guest.
+    VkInstance instance;                         ///< Vulkan instance.
+    vk::DeviceDispatch dld;                      ///< Device function pointers.
+    vk::PhysicalDevice physical;                 ///< Physical device.
+    VkPhysicalDeviceProperties properties;       ///< Device properties.
+    vk::Device logical;                          ///< Logical device.
+    vk::Queue graphics_queue;                    ///< Main graphics queue.
+    vk::Queue present_queue;                     ///< Main present queue.
+    u32 instance_version{};                      ///< Vulkan instance version.
+    u32 graphics_family{};                       ///< Main graphics queue family index.
+    u32 present_family{};                        ///< Main present queue family index.
+    VkDriverIdKHR driver_id{};                   ///< Driver ID.
+    VkShaderStageFlags guest_warp_stages{};      ///< Stages where the guest warp size can be forced.
+    u64 device_access_memory{};                  ///< Total size of device local memory in bytes.
+    bool is_optimal_astc_supported{};            ///< Support for native ASTC.
+    bool is_float16_supported{};                 ///< Support for float16 arithmetics.
+    bool is_warp_potentially_bigger{};           ///< Host warp size can be bigger than guest.
     bool is_formatless_image_load_supported{};   ///< Support for shader image read without format.
     bool is_shader_storage_image_multisample{};  ///< Support for image operations on MSAA images.
     bool is_blit_depth_stencil_supported{};      ///< Support for blitting from and to depth stencil.
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
index 5edd06ebc..aa173d19e 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp
@@ -69,10 +69,10 @@ constexpr VkExportMemoryAllocateInfo EXPORT_ALLOCATE_INFO{
 
 class MemoryAllocation {
 public:
-    explicit MemoryAllocation(vk::DeviceMemory memory_, VkMemoryPropertyFlags properties,
-                              u64 allocation_size_, u32 type)
-        : memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties},
-          shifted_memory_type{1U << type} {}
+    explicit MemoryAllocation(MemoryAllocator* const allocator_, vk::DeviceMemory memory_,
+                              VkMemoryPropertyFlags properties, u64 allocation_size_, u32 type)
+        : allocator{allocator_}, memory{std::move(memory_)}, allocation_size{allocation_size_},
+          property_flags{properties}, shifted_memory_type{1U << type} {}
 
 #if defined(_WIN32) || defined(__unix__)
     ~MemoryAllocation() {
@@ -106,6 +106,10 @@ public:
         const auto it = std::ranges::find(commits, begin, &Range::begin);
         ASSERT_MSG(it != commits.end(), "Invalid commit");
         commits.erase(it);
+        if (commits.empty()) {
+            // Do not call any code involving 'this' after this call, the object will be destroyed
+            allocator->ReleaseMemory(this);
+        }
     }
 
     [[nodiscard]] std::span<u8> Map() {
@@ -171,6 +175,7 @@ private:
         return candidate;
     }
 
+    MemoryAllocator* const allocator;          ///< Parent memory allocator.
     const vk::DeviceMemory memory;             ///< Vulkan memory allocation handler.
     const u64 allocation_size;                 ///< Size of this allocation.
     const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags.
@@ -275,10 +280,17 @@ bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask,
             return false;
         }
     }
-    allocations.push_back(std::make_unique<MemoryAllocation>(std::move(memory), flags, size, type));
+    allocations.push_back(
+        std::make_unique<MemoryAllocation>(this, std::move(memory), flags, size, type));
     return true;
 }
 
+void MemoryAllocator::ReleaseMemory(MemoryAllocation* alloc) {
+    const auto it = std::ranges::find(allocations, alloc, &std::unique_ptr<MemoryAllocation>::get);
+    ASSERT(it != allocations.end());
+    allocations.erase(it);
+}
+
 std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements,
                                                        VkMemoryPropertyFlags flags) {
     for (auto& allocation : allocations) {
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h
index db12d02f4..b61e931e0 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.h
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h
@@ -69,6 +69,8 @@ private:
 /// Memory allocator container.
 /// Allocates and releases memory allocations on demand.
 class MemoryAllocator {
+    friend MemoryAllocation;
+
 public:
     /**
      * Construct memory allocator
@@ -104,6 +106,9 @@ private:
     /// Tries to allocate a chunk of memory.
     bool TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size);
 
+    /// Releases a chunk of memory.
+    void ReleaseMemory(MemoryAllocation* alloc);
+
     /// Tries to allocate a memory commit.
     std::optional<MemoryCommit> TryCommit(const VkMemoryRequirements& requirements,
                                           VkMemoryPropertyFlags flags);
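
The Free/ReleaseMemory interplay above means a MemoryAllocation now destroys itself through its parent allocator once its last commit is returned. A reduced sketch of that ownership pattern, with hypothetical Pool/Chunk names standing in for MemoryAllocator/MemoryAllocation:

    #include <algorithm>
    #include <cassert>
    #include <memory>
    #include <vector>

    class Pool;

    class Chunk {
    public:
        explicit Chunk(Pool* pool_) : pool{pool_} {}
        void Release(); // Defined after Pool; may destroy *this.

    private:
        Pool* const pool;     // Parent pool; set once at construction.
        int live_commits = 1; // Stand-in for the commits list.
    };

    class Pool {
    public:
        Chunk* Allocate() {
            chunks.push_back(std::make_unique<Chunk>(this));
            return chunks.back().get();
        }

        // Erasing the unique_ptr runs ~Chunk, so callers must not touch
        // the pointer afterwards (same contract as ReleaseMemory above).
        void ReleaseChunk(Chunk* chunk) {
            const auto it = std::ranges::find(chunks, chunk, &std::unique_ptr<Chunk>::get);
            assert(it != chunks.end());
            chunks.erase(it);
        }

    private:
        std::vector<std::unique_ptr<Chunk>> chunks;
    };

    void Chunk::Release() {
        if (--live_commits == 0) {
            // As in MemoryAllocation::Free: this must be the last statement
            // executed on the object, since the parent destroys it here.
            pool->ReleaseChunk(this);
        }
    }

    int main() {
        Pool pool;
        Chunk* chunk = pool.Allocate();
        chunk->Release(); // Chunk removes itself from the pool and is destroyed.
    }

The friend declaration in the header exists precisely so the child can reach the otherwise-private release hook; the design trades a back-pointer per allocation for eager reclamation of empty VkDeviceMemory blocks.
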
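Relatedly, the CollectPhysicalMemoryInfo() hunk in vulkan_device.cpp sums only heaps flagged VK_MEMORY_HEAP_DEVICE_LOCAL_BIT. Expressed against the raw Vulkan API (yuzu goes through its vk::PhysicalDevice wrapper instead), the equivalent query looks roughly like this:

    #include <cstdint>
    #include <vulkan/vulkan.h>

    // Sum the sizes of all device-local heaps, mirroring CollectPhysicalMemoryInfo().
    // Note a heap shared with host-visible types still counts; this is an upper
    // bound on dedicated VRAM, not a measure of free memory.
    uint64_t QueryDeviceLocalMemory(VkPhysicalDevice physical) {
        VkPhysicalDeviceMemoryProperties props;
        vkGetPhysicalDeviceMemoryProperties(physical, &props);
        uint64_t total = 0;
        for (uint32_t i = 0; i < props.memoryHeapCount; ++i) {
            if ((props.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) {
                total += props.memoryHeaps[i].size;
            }
        }
        return total;
    }
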
