From 2985e5e94c82febcf215feb0023f4184b38bb24a Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Sat, 13 Feb 2021 15:50:12 -0500
Subject: renderer_opengl: Accelerate ASTC texture decoding with a compute
 shader

ASTC texture decoding is currently handled by a CPU decoder for GPUs without native ASTC decoding support (most desktop GPUs). This is the cause of noticeable performance degradation in titles which use the format extensively.

This commit adds support to accelerate ASTC decoding using a compute shader on OpenGL for GPUs without native support.
---
 src/video_core/host_shaders/astc_decoder.comp | 1288 +++++++++++++++++++++++++
 1 file changed, 1288 insertions(+)
 create mode 100644 src/video_core/host_shaders/astc_decoder.comp

(limited to 'src/video_core/host_shaders/astc_decoder.comp')

diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
new file mode 100644
index 000000000..070190a5c
--- /dev/null
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -0,0 +1,1288 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+#ifdef VULKAN
+
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_ENC_BUFFER 2
+#define BINDING_6_TO_8_BUFFER 3
+#define BINDING_7_TO_8_BUFFER 4
+#define BINDING_8_TO_8_BUFFER 5
+#define BINDING_BYTE_TO_16_BUFFER 6
+#define BINDING_OUTPUT_IMAGE 3 // NOTE(review): same index as BINDING_6_TO_8_BUFFER; confirm no clash
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_ENC_BUFFER 2
+#define BINDING_6_TO_8_BUFFER 3
+#define BINDING_7_TO_8_BUFFER 4
+#define BINDING_8_TO_8_BUFFER 5
+#define BINDING_BYTE_TO_16_BUFFER 6
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; // 32x32x1 invocations per group
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uvec2 num_image_blocks;
+UNIFORM(1) uvec2 block_dims; // per-block texel extent iterated by the weight infill and FillError
+UNIFORM(2) uint layer;
+
+UNIFORM(3) uvec3 origin;
+UNIFORM(4) ivec3 destination;
+UNIFORM(5) uint bytes_per_block_log2;
+UNIFORM(6) uint layer_stride;
+UNIFORM(7) uint block_size;
+UNIFORM(8) uint x_shift;
+UNIFORM(9) uint block_height;
+UNIFORM(10) uint block_height_mask;
+
+END_PUSH_CONSTANTS
+
+uint current_index = 0; // element of local_buff that ReadBit() is currently consuming
+int bitsread = 0; // bit position inside that element; ReadBit() wraps it at 8
+uint total_bitsread = 0; // running count of bits returned by ReadBit()
+uint local_buff[16]; // ReadBit() takes 8 bits from each element
+
+const int JustBits = 0; // EncodingData.encoding: value stored as plain bits only
+const int Quint = 1; // EncodingData.encoding: plain bits plus a quint digit
+const int Trit = 2; // EncodingData.encoding: plain bits plus a trit digit
+
+struct EncodingData {
+    uint encoding; // one of JustBits, Quint or Trit
+    uint num_bits; // number of plain bits stored per value
+    uint bit_value; // the plain bits read for this value
+    uint quint_trit_value; // trit or quint digit filled in by the block decoders
+};
+
+struct TexelWeightParams {
+    uvec2 size; // weight grid dimensions decoded from the block mode
+    bool dual_plane; // doubles the number of weights (see GetNumWeightValues)
+    uint max_weight; // passed as the encoding_values index by GetPackedBitSize()
+    bool Error; // set by DecodeBlockInfo() for reserved or malformed block modes
+    bool VoidExtentLDR; // void-extent block with mode bit 9 clear
+    bool VoidExtentHDR; // void-extent block with mode bit 9 set
+};
+
+// Swizzle data
+layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
+    uint swizzle_table[]; // indexed as y * 64 + x by SwizzleOffset()
+};
+
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 {
+    uint astc_data[]; // ReadTexel() extracts single bytes from these 32-bit words
+};
+layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly image2D dest_image; // imageStore target
+
+const uint GOB_SIZE_X = 64;
+const uint GOB_SIZE_Y = 8;
+const uint GOB_SIZE_Z = 1;
+const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
+
+const uint GOB_SIZE_X_SHIFT = 6; // log2(GOB_SIZE_X)
+const uint GOB_SIZE_Y_SHIFT = 3; // log2(GOB_SIZE_Y)
+const uint GOB_SIZE_Z_SHIFT = 0; // log2(GOB_SIZE_Z)
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
+
+const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); // wraps a position into one GOB
+
+uint SwizzleOffset(uvec2 pos) {
+    const uvec2 in_gob = pos & SWIZZLE_MASK; // keep only the position inside one 64x8 GOB
+    return swizzle_table[in_gob.y * 64 + in_gob.x];
+}
+
+uint ReadTexel(uint offset) {
+    const int lane_shift = int((offset & 3) * 8); // bit position of the byte inside its word
+    return bitfieldExtract(astc_data[offset / 4], lane_shift, 8);
+}
+
+// ASTC Encodings data
+layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
+    EncodingData encoding_values[256]; // looked up by range / max weight index
+};
+// ASTC Precompiled tables
+layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
+    uint REPLICATE_6_BIT_TO_8_TABLE[]; // used by FastReplicateTo8() for 6-bit inputs
+};
+layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
+    uint REPLICATE_7_BIT_TO_8_TABLE[]; // used by FastReplicateTo8() for 7-bit inputs
+};
+layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
+    uint REPLICATE_8_BIT_TO_8_TABLE[]; // used by FastReplicateTo8() for 8-bit inputs
+};
+layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
+    uint REPLICATE_BYTE_TO_16_TABLE[]; // used by ReplicateByteTo16()
+};
+
+const int BLOCK_SIZE_IN_BYTES = 16;
+
+const int BLOCK_INFO_ERROR = 0;
+const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
+const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
+const int BLOCK_INFO_NORMAL = 3;
+
+// Fills a to_bit wide field by repeating the low num_bits bits of val, starting from the
+// most significant end. When the final copy does not fit, only its top bits are kept.
+// Returns 0 when either num_bits or to_bit is 0.
+uint Replicate(uint val, uint num_bits, uint to_bit) {
+    if (num_bits == 0 || to_bit == 0) {
+        return 0;
+    }
+    const uint pattern = val & uint((1 << num_bits) - 1);
+    uint result = pattern;
+    uint filled = num_bits;
+    // Append one copy of the pattern per iteration until the field is full.
+    while (filled < to_bit) {
+        const uint remaining = to_bit - filled;
+        uint drop = 0;
+        if (num_bits > remaining) {
+            // Last, partial copy: discard the low bits that would overflow the field.
+            drop = num_bits - remaining;
+            num_bits = remaining;
+        }
+        result <<= num_bits;
+        result |= pattern >> drop;
+        filled += num_bits;
+    }
+    return result;
+}
+
+uvec4 ReplicateByteTo16(uvec4 value) { // widens every channel through the lookup table
+    for (int ch = 0; ch < 4; ch++) { value[ch] = REPLICATE_BYTE_TO_16_TABLE[value[ch]]; }
+    return value;
+}
+
+const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
+uint ReplicateBitTo7(uint value) {
+    // The bit itself is the table index: 0 -> 0, 1 -> all seven bits set.
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
+uint ReplicateBitTo9(uint value) {
+    return REPLICATE_1_BIT_TO_9_TABLE[value]; // 0 -> 0, 1 -> all nine bits set
+}
+
+const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
+const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
+const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
+const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
+    uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
+const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
+    uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
+           173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
+
+uint FastReplicateTo8(uint value, uint num_bits) { // table-driven form of Replicate(.., .., 8)
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value]; // widths 6-8 use the buffer-backed tables
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    }
+    return Replicate(value, num_bits, 8); // any other width falls back to the generic routine
+}
+
+const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
+const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
+const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
+const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
+    uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
+const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
+    uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
+           47, 49, 51, 53, 55, 57, 59, 61, 63);
+
+uint FastReplicateTo6(uint value, uint num_bits) { // table-driven form of Replicate(.., .., 6)
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    }
+    return Replicate(value, num_bits, 6); // any other width falls back to the generic routine
+}
+
+uint hash52(uint p) { // integer bit mixer; SelectPartition() slices its seeds from the result
+    p ^= p >> 15;
+    p -= p << 17;
+    p += p << 7;
+    p += p << 4;
+    p ^= p >> 5;
+    p += p << 16;
+    p ^= p >> 7;
+    p ^= p >> 3;
+    p ^= p << 6;
+    p ^= p >> 17;
+    return p;
+}
+
+uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
+    if (1 == partition_count)
+        return 0; // with a single partition the index is always 0
+
+    if (small_block) {
+        x <<= 1; // coordinates are doubled for small blocks
+        y <<= 1;
+        z <<= 1;
+    }
+
+    seed += (partition_count - 1) * 1024; // fold the partition count into the seed
+
+    uint rnum = hash52(uint(seed));
+    uint seed1 = uint(rnum & 0xF); // twelve 4-bit fields are sliced out of the hash
+    uint seed2 = uint((rnum >> 4) & 0xF);
+    uint seed3 = uint((rnum >> 8) & 0xF);
+    uint seed4 = uint((rnum >> 12) & 0xF);
+    uint seed5 = uint((rnum >> 16) & 0xF);
+    uint seed6 = uint((rnum >> 20) & 0xF);
+    uint seed7 = uint((rnum >> 24) & 0xF);
+    uint seed8 = uint((rnum >> 28) & 0xF);
+    uint seed9 = uint((rnum >> 18) & 0xF);
+    uint seed10 = uint((rnum >> 22) & 0xF);
+    uint seed11 = uint((rnum >> 26) & 0xF);
+    uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF);
+
+    seed1 = (seed1 * seed1); // each field is squared before being scaled down below
+    seed2 = (seed2 * seed2);
+    seed3 = (seed3 * seed3);
+    seed4 = (seed4 * seed4);
+    seed5 = (seed5 * seed5);
+    seed6 = (seed6 * seed6);
+    seed7 = (seed7 * seed7);
+    seed8 = (seed8 * seed8);
+    seed9 = (seed9 * seed9);
+    seed10 = (seed10 * seed10);
+    seed11 = (seed11 * seed11);
+    seed12 = (seed12 * seed12);
+
+    int sh1, sh2, sh3; // right-shift amounts chosen from seed bits and the partition count
+    if ((seed & 1) > 0) {
+        sh1 = (seed & 2) > 0 ? 4 : 5;
+        sh2 = (partition_count == 3) ? 6 : 5;
+    } else {
+        sh1 = (partition_count == 3) ? 6 : 5;
+        sh2 = (seed & 2) > 0 ? 4 : 5;
+    }
+    sh3 = (seed & 0x10) > 0 ? sh1 : sh2;
+
+    seed1 = (seed1 >> sh1); // odd-numbered seeds 1-7 use sh1, even 2-8 use sh2, 9-12 use sh3
+    seed2 = (seed2 >> sh2);
+    seed3 = (seed3 >> sh1);
+    seed4 = (seed4 >> sh2);
+    seed5 = (seed5 >> sh1);
+    seed6 = (seed6 >> sh2);
+    seed7 = (seed7 >> sh1);
+    seed8 = (seed8 >> sh2);
+    seed9 = (seed9 >> sh3);
+    seed10 = (seed10 >> sh3);
+    seed11 = (seed11 >> sh3);
+    seed12 = (seed12 >> sh3);
+
+    uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); // one score per partition
+    uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
+    uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
+    uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+
+    a &= 0x3F; // only the low six bits of each score are compared
+    b &= 0x3F;
+    c &= 0x3F;
+    d &= 0x3F;
+
+    if (partition_count < 4)
+        d = 0; // scores of partitions that do not exist are zeroed
+    if (partition_count < 3)
+        c = 0;
+
+    if (a >= b && a >= c && a >= d) // highest score wins; ties go to the lower index
+        return 0;
+    else if (b >= c && b >= d)
+        return 1;
+    else if (c >= d)
+        return 2;
+    return 3;
+}
+
+uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
+    return SelectPartition(seed, x, y, 0, partition_count, small_block); // 2D case: z fixed at 0
+}
+
+uint ReadBit() {
+    // Only the low 8 bits of each local_buff entry are consumed before moving to the next one.
+    const uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
+    total_bitsread++;
+    if (++bitsread == 8) {
+        bitsread = 0;
+        current_index++;
+    }
+    return bit;
+}
+
+uint StreamBits(uint num_bits) {
+    uint value = 0; // bits arrive least significant first
+    for (uint shift = 0; shift < num_bits; ++shift) {
+        value |= (ReadBit() & 1) << shift;
+    }
+    return value;
+}
+
+// Define color data.
+uint color_endpoint_data[16]; // ReadColorBit() takes 8 bits from each element
+int color_bitsread = 0; // bit position inside the current element
+uint total_color_bitsread = 0; // running count of color bits consumed
+int color_index = 0; // current element of color_endpoint_data
+
+// Define texel weight data.
+uint texel_weight_data[16]; // ReadColorBit() takes 8 bits from each element
+int texel_bitsread = 0; // bit position inside the current element
+uint total_texel_bitsread = 0; // running count of weight bits consumed
+int texel_index = 0; // current element of texel_weight_data
+
+bool texel_flag = false; // true: the shared decode helpers operate on the texel weight stream
+
+uint ReadColorBit() {
+    // Two independent bit cursors exist; texel_flag picks the one this call advances.
+    if (!texel_flag) {
+        // Color endpoint stream.
+        const uint cbit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
+        total_color_bitsread++;
+        if (++color_bitsread == 8) {
+            color_bitsread = 0;
+            color_index++;
+        }
+        return cbit;
+    }
+    // Texel weight stream.
+    const uint wbit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
+    total_texel_bitsread++;
+    if (++texel_bitsread == 8) {
+        texel_bitsread = 0;
+        texel_index++;
+    }
+    return wbit;
+}
+
+uint StreamColorBits(uint num_bits) {
+    uint value = 0; // assembled least significant bit first
+    for (uint shift = 0; shift < num_bits; ++shift) {
+        value |= (ReadColorBit() & 1) << shift;
+    }
+    return value;
+}
+
+EncodingData result_vector[100]; // values decoded while texel_flag is clear
+int result_index = 0;
+
+EncodingData texel_vector[100]; // values decoded while texel_flag is set
+int texel_vector_index = 0;
+
+// Appends val to the list matching the stream being decoded: texel_vector while
+// texel_flag is set, result_vector otherwise.
+void ResultEmplaceBack(EncodingData val) {
+    if (!texel_flag) {
+        result_vector[result_index++] = val;
+        return;
+    }
+    texel_vector[texel_vector_index++] = val;
+}
+
+uint GetBitLength(uint n_vals, uint encoding_index) { // bits needed to encode n_vals values
+    const EncodingData enc = encoding_values[encoding_index];
+    uint total_bits = enc.num_bits * n_vals;
+    if (enc.encoding == Trit) {
+        total_bits += (n_vals * 8 + 4) / 5; // 8 extra bits per 5 trits, rounded up
+    } else if (enc.encoding == Quint) {
+        total_bits += (n_vals * 7 + 2) / 3; // 7 extra bits per 3 quints, rounded up
+    }
+    return total_bits;
+}
+
+uint GetNumWeightValues(uvec2 size, bool dual_plane) {
+    const uint per_plane = size.x * size.y; // one weight per grid cell
+    if (!dual_plane) {
+        return per_plane;
+    }
+    return per_plane * 2; // the second plane carries its own full set
+}
+
+uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
+    // max_weight doubles as the encoding_values index consumed by GetBitLength().
+    return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight);
+}
+
+uint BitsBracket(uint bits, uint pos) {
+    return ((bits >> pos) & 1); // value (0 or 1) of bit number pos
+}
+
+// Returns the inclusive bit range of bits delimited by start and end, shifted down to bit 0.
+// The two bounds may be passed in either order.
+uint BitsOp(uint bits, uint start, uint end) {
+    if (start == end) {
+        return BitsBracket(bits, start);
+    }
+    const uint low = min(start, end);
+    const uint high = max(start, end);
+    const uint width = high - low + 1;
+    const uint mask = (1 << width) - 1;
+    return (bits >> low) & mask;
+}
+
+void DecodeQuintBlock(uint num_bits) { // Value number of bits
+    uint m[3]; // plain bits of the three values
+    uint q[3]; // quint digit of the three values
+    uint Q; // 7-bit packed quint word, interleaved with the plain bits in the stream
+    m[0] = StreamColorBits(num_bits);
+    Q = StreamColorBits(3);
+    m[1] = StreamColorBits(num_bits);
+    Q |= StreamColorBits(2) << 3;
+    m[2] = StreamColorBits(num_bits);
+    Q |= StreamColorBits(2) << 5;
+    if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) {
+        q[0] = 4;
+        q[1] = 4;
+        q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) |
+               (BitsBracket(Q, 3) & ~BitsBracket(Q, 0));
+    } else {
+        uint C = 0; // intermediate 5-bit word that q[0] and q[1] are unpacked from
+        if (BitsOp(Q, 1, 2) == 3) {
+            q[2] = 4;
+            C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0);
+        } else {
+            q[2] = BitsOp(Q, 5, 6);
+            C = BitsOp(Q, 0, 4);
+        }
+
+        if (BitsOp(C, 0, 2) == 5) {
+            q[1] = 4;
+            q[0] = BitsOp(C, 3, 4);
+        } else {
+            q[1] = BitsOp(C, 3, 4);
+            q[0] = BitsOp(C, 0, 2);
+        }
+    }
+
+    for (uint i = 0; i < 3; i++) { // emit the three decoded values in stream order
+        EncodingData val;
+        val.encoding = Quint;
+        val.num_bits = num_bits;
+        val.bit_value = m[i];
+        val.quint_trit_value = q[i];
+        ResultEmplaceBack(val);
+    }
+}
+
+void DecodeTritBlock(uint num_bits) { // num_bits: plain bits stored per value
+    uint m[5]; // plain bits of the five values
+    uint t[5]; // trit digit of the five values
+    uint T; // 8-bit packed trit word, interleaved with the plain bits in the stream
+    m[0] = StreamColorBits(num_bits);
+    T = StreamColorBits(2);
+    m[1] = StreamColorBits(num_bits);
+    T |= StreamColorBits(2) << 2;
+    m[2] = StreamColorBits(num_bits);
+    T |= StreamColorBits(1) << 4;
+    m[3] = StreamColorBits(num_bits);
+    T |= StreamColorBits(2) << 5;
+    m[4] = StreamColorBits(num_bits);
+    T |= StreamColorBits(1) << 7;
+    uint C = 0; // intermediate 5-bit word that t[0..2] are unpacked from
+    if (BitsOp(T, 2, 4) == 7) {
+        C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1);
+        t[4] = 2;
+        t[3] = 2;
+    } else {
+        C = BitsOp(T, 0, 4);
+        if (BitsOp(T, 5, 6) == 3) {
+            t[4] = 2;
+            t[3] = BitsBracket(T, 7);
+        } else {
+            t[4] = BitsBracket(T, 7);
+            t[3] = BitsOp(T, 5, 6);
+        }
+    }
+    if (BitsOp(C, 0, 1) == 3) {
+        t[2] = 2;
+        t[1] = BitsBracket(C, 4);
+        t[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3));
+    } else if (BitsOp(C, 2, 3) == 3) {
+        t[2] = 2;
+        t[1] = 2;
+        t[0] = BitsOp(C, 0, 1);
+    } else {
+        t[2] = BitsBracket(C, 4);
+        t[1] = BitsOp(C, 2, 3);
+        t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
+    }
+    for (uint i = 0; i < 5; i++) { // emit the five decoded values in stream order
+        EncodingData val;
+        val.encoding = Trit;
+        val.num_bits = num_bits;
+        val.bit_value = m[i];
+        val.quint_trit_value = t[i];
+        ResultEmplaceBack(val);
+    }
+}
+void DecodeIntegerSequence(uint max_range, uint num_values) {
+    // max_range indexes encoding_values; every value of the sequence uses that encoding.
+    EncodingData enc = encoding_values[max_range];
+    uint decoded = 0;
+    // Trit and quint blocks are always decoded whole, so the final block may append more
+    // entries than num_values asks for.
+    while (decoded < num_values) {
+        if (enc.encoding == Quint) {
+            // Three values per quint block.
+            DecodeQuintBlock(enc.num_bits);
+            decoded += 3;
+        } else if (enc.encoding == Trit) {
+            // Five values per trit block.
+            DecodeTritBlock(enc.num_bits);
+            decoded += 5;
+        } else if (enc.encoding == JustBits) {
+            // Plain values are read one at a time.
+            enc.bit_value = StreamColorBits(enc.num_bits);
+            ResultEmplaceBack(enc);
+            decoded++;
+        }
+    }
+}
+
+// Decodes the color endpoint integer sequence and unquantizes each entry to 8 bits.
+// modes holds one endpoint mode per partition; color_data_bits is the bit budget used to
+// pick the widest encoding that still fits all values.
+void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions,
+                       uint color_data_bits) {
+    uint num_values = 0;
+    for (uint i = 0; i < num_partitions; i++) {
+        num_values += ((modes[i] >> 2) + 1) << 1; // each mode class adds one more value pair
+    }
+    int range = 256;
+    while (--range > 0) {
+        EncodingData val = encoding_values[range];
+        uint bitLength = GetBitLength(num_values, range);
+        if (bitLength <= color_data_bits) {
+            // Walk down to the smallest range that still uses exactly this encoding.
+            while (--range > 0) {
+                EncodingData newval = encoding_values[range];
+                if (newval.encoding != val.encoding || newval.num_bits != val.num_bits) {
+                    break;
+                }
+            }
+            range++; // step back onto the last range that matched
+            break;
+        }
+    }
+    DecodeIntegerSequence(range, num_values);
+    uint out_index = 0;
+    for (int itr = 0; itr < result_index; itr++) {
+        if (out_index >= num_values) {
+            break; // surplus entries from the last trit/quint block are ignored
+        }
+        EncodingData val = result_vector[itr];
+        uint bitlen = val.num_bits;
+        uint bitval = val.bit_value;
+        uint A = 0, B = 0, C = 0, D = 0; // terms of the unquantization T = D * C + B, mixed with A
+        A = ReplicateBitTo9((bitval & 1));
+        switch (val.encoding) {
+        case JustBits:
+            color_values[out_index++] = FastReplicateTo8(bitval, bitlen);
+            break;
+        case Trit: {
+            D = val.quint_trit_value;
+            switch (bitlen) { // C and B depend on how many plain bits accompany the trit
+            case 1: {
+                C = 204;
+            } break;
+            case 2: {
+                C = 93;
+                uint b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
+            } break;
+            case 3: {
+                C = 44;
+                uint cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 2) | cb;
+            } break;
+            case 4: {
+                C = 22;
+                uint dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | dcb;
+            } break;
+            case 5: {
+                C = 11;
+                uint edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 2);
+            } break;
+            case 6: {
+                C = 5;
+                uint fedcb = (bitval >> 1) & 0x1F;
+                B = (fedcb << 4) | (fedcb >> 4);
+            } break;
+            }
+        } break;
+        case Quint: {
+            D = val.quint_trit_value;
+            switch (bitlen) { // C and B depend on how many plain bits accompany the quint
+            case 1: {
+                C = 113;
+            } break;
+            case 2: {
+                C = 54;
+                uint b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 3) | (b << 2);
+            } break;
+            case 3: {
+                C = 26;
+                uint cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 1) | (cb >> 1);
+            } break;
+            case 4: {
+                C = 13;
+                uint dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | (dcb >> 1);
+            } break;
+            case 5: {
+                C = 6;
+                uint edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 3);
+            } break;
+            }
+        } break;
+        }
+
+        if (val.encoding != JustBits) {
+            uint T = (D * C) + B;
+            T ^= A;
+            T = (A & 0x80) | (T >> 2); // 9-bit intermediate reduced to the final 8-bit value
+            color_values[out_index++] = T;
+        }
+    }
+}
+ivec2 BitTransferSigned(int a, int b) {
+    // b is halved and receives bit 7 of a; a is halved and reduced to a signed 6-bit value.
+    // The result is packed as (new a, new b).
+    const int b_out = (b >> 1) | (a & 0x80);
+    int a_out = (a >> 1) & 0x3F;
+    if ((a_out & 0x20) != 0) {
+        // Bit 5 is the sign bit: extend it.
+        a_out -= 0x40;
+    }
+    return ivec2(a_out, b_out);
+}
+
+uvec4 ClampByte(ivec4 color) {
+    // Saturate every channel to the 0..255 range before converting to unsigned.
+    const ivec4 low = ivec4(0);
+    const ivec4 high = ivec4(255);
+    return uvec4(clamp(color, low, high));
+}
+ivec4 BlueContract(int a, int r, int g, int b) {
+    return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); // r and g are each averaged with b
+}
+int colvals_index = 0; // read cursor into color_values; keeps advancing across calls
+void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32],
+                      uint color_endpoint_mode) { // endpoints use (a, r, g, b) channel order
+#define READ_UINT_VALUES(N)                                                                        \
+    uint v[N];                                                                                     \
+    for (uint i = 0; i < N; i++) {                                                                 \
+        v[i] = color_values[colvals_index++];                                                      \
+    }
+#define READ_INT_VALUES(N)                                                                         \
+    int v[N];                                                                                      \
+    for (uint i = 0; i < N; i++) {                                                                 \
+        v[i] = int(color_values[colvals_index++]);                                                 \
+    }
+    switch (color_endpoint_mode) {
+    case 0: { // LDR luminance, direct
+        READ_UINT_VALUES(2)
+        ep1 = uvec4(0xFF, v[0], v[0], v[0]);
+        ep2 = uvec4(0xFF, v[1], v[1], v[1]);
+    } break;
+
+    case 1: { // LDR luminance, base + offset
+        READ_UINT_VALUES(2)
+        uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
+        uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); // the second endpoint saturates at 255
+        ep1 = uvec4(0xFF, L0, L0, L0);
+        ep2 = uvec4(0xFF, L1, L1, L1);
+    } break;
+
+    case 4: { // LDR luminance + alpha, direct
+        READ_UINT_VALUES(4)
+        ep1 = uvec4(v[2], v[0], v[0], v[0]);
+        ep2 = uvec4(v[3], v[1], v[1], v[1]);
+    } break;
+
+    case 5: { // LDR luminance + alpha, base + offset
+        READ_INT_VALUES(4)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+        ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0]));
+        ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1]));
+    } break;
+
+    case 6: { // LDR RGB, base + scale
+        READ_UINT_VALUES(4)
+        ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = uvec4(0xFF, v[0], v[1], v[2]);
+    } break;
+
+    case 8: { // LDR RGB, direct
+        READ_UINT_VALUES(6)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = uvec4(0xFF, v[0], v[2], v[4]);
+            ep2 = uvec4(0xFF, v[1], v[3], v[5]);
+        } else { // endpoints are swapped and blue-contracted
+            ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
+            ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
+        }
+    } break;
+
+    case 9: { // LDR RGB, base + offset
+        READ_INT_VALUES(6)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+        transferred = BitTransferSigned(v[5], v[4]);
+        v[5] = transferred[0];
+        v[4] = transferred[1];
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
+            ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+        } else { // endpoints are swapped and blue-contracted
+            ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+            ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
+        }
+    } break;
+
+    case 10: { // LDR RGB, base + scale, plus two alpha values
+        READ_UINT_VALUES(6)
+        ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = uvec4(v[5], v[0], v[1], v[2]);
+    } break;
+
+    case 12: { // LDR RGBA, direct
+        READ_UINT_VALUES(8)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = uvec4(v[6], v[0], v[2], v[4]);
+            ep2 = uvec4(v[7], v[1], v[3], v[5]);
+        } else { // endpoints are swapped and blue-contracted
+            ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
+            ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
+        }
+    } break;
+
+    case 13: { // LDR RGBA, base + offset
+        READ_INT_VALUES(8)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+        transferred = BitTransferSigned(v[5], v[4]);
+        v[5] = transferred[0];
+        v[4] = transferred[1];
+        transferred = BitTransferSigned(v[7], v[6]);
+        v[7] = transferred[0];
+        v[6] = transferred[1];
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
+            ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+        } else { // endpoints are swapped and blue-contracted
+            ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+            ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
+        }
+    } break;
+
+    default: { // HDR modes (2, 3, 7, 11, 14, 15) are not decoded: emit a defined opaque red
+        ep1 = uvec4(0xFF, 0xFF, 0, 0);
+        ep2 = uvec4(0xFF, 0xFF, 0, 0);
+    } break;
+    }
+#undef READ_UINT_VALUES
+#undef READ_INT_VALUES
+}
+
+uint UnquantizeTexelWeight(EncodingData val) { // maps one decoded weight onto the 0..64 range
+    uint bitval = val.bit_value;
+    uint bitlen = val.num_bits;
+    uint A = ReplicateBitTo7((bitval & 1));
+    uint B = 0, C = 0, D = 0; // terms of the unquantization D * C + B, mixed with A below
+    uint result = 0;
+    switch (val.encoding) {
+    case JustBits:
+        result = FastReplicateTo6(bitval, bitlen);
+        break;
+    case Trit: {
+        D = val.quint_trit_value;
+        switch (bitlen) {
+        case 0: { // bare trit: direct lookup, the formula below is skipped
+            uint results[3] = {0, 32, 63};
+            result = results[D];
+        } break;
+        case 1: {
+            C = 50;
+        } break;
+        case 2: {
+            C = 23;
+            uint b = (bitval >> 1) & 1;
+            B = (b << 6) | (b << 2) | b;
+        } break;
+        case 3: {
+            C = 11;
+            uint cb = (bitval >> 1) & 3;
+            B = (cb << 5) | cb;
+        } break;
+        default:
+            break;
+        }
+    } break;
+    case Quint: {
+        D = val.quint_trit_value;
+        switch (bitlen) {
+        case 0: { // bare quint: direct lookup, the formula below is skipped
+            uint results[5] = {0, 16, 32, 47, 63};
+            result = results[D];
+        } break;
+        case 1: {
+            C = 28;
+        } break;
+        case 2: {
+            C = 13;
+            uint b = (bitval >> 1) & 1;
+            B = (b << 6) | (b << 1);
+        } break;
+        }
+    } break;
+    }
+    if (val.encoding != JustBits && bitlen > 0) {
+        result = D * C + B;
+        result ^= A;
+        result = (A & 0x20) | (result >> 2); // 7-bit intermediate reduced to a 6-bit value
+    }
+    if (result > 32) {
+        result += 1; // values above 32 are bumped by one, so the 6-bit maximum 63 becomes 64
+    }
+    return result;
+}
+
+// Unquantizes the decoded weight grid and bilinearly expands it to one weight per block texel.
+void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) {
+    uint weight_idx = 0;
+    uint unquantized[2][144];
+    uint area = size.x * size.y;
+    for (uint itr = 0; itr < texel_vector_index; itr++) {
+        unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
+        if (dual_plane) { // planes are interleaved: even entries plane 0, odd entries plane 1
+            ++itr;
+            if (itr == texel_vector_index) { // stop before reading past the decoded entries
+                break;
+            }
+            unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
+        }
+        if (++weight_idx >= (area))
+            break;
+    }
+    uint Ds = (1024 + block_dims.x / 2) / (block_dims.x - 1); // exact integer scale factors
+    uint Dt = (1024 + block_dims.y / 2) / (block_dims.y - 1);
+    uint kPlaneScale = dual_plane ? 2 : 1;
+    for (uint plane = 0; plane < kPlaneScale; plane++)
+        for (uint t = 0; t < block_dims.y; t++)
+            for (uint s = 0; s < block_dims.x; s++) {
+                uint cs = Ds * s; // texel position scaled onto the weight grid
+                uint ct = Dt * t;
+                uint gs = (cs * (size.x - 1) + 32) >> 6;
+                uint gt = (ct * (size.y - 1) + 32) >> 6;
+                uint js = gs >> 4; // integer grid coordinate
+                uint fs = gs & 0xF; // 4-bit fractional part
+                uint jt = gt >> 4;
+                uint ft = gt & 0x0F;
+                uint w11 = (fs * ft + 8) >> 4; // bilinear weights; the four sum to 16
+                uint w10 = ft - w11;
+                uint w01 = fs - w11;
+                uint w00 = 16 - fs - ft + w11;
+                uvec4 w = uvec4(w00, w01, w10, w11);
+                uint v0 = jt * size.x + js;
+                uvec4 p = uvec4(0); // neighbours outside the grid contribute 0
+                if (v0 < area) {
+                    p.x = unquantized[plane][v0];
+                }
+                if ((v0 + 1) < (area)) {
+                    p.y = unquantized[plane][v0 + 1];
+                }
+                if ((v0 + size.x) < (area)) {
+                    p.z = unquantized[plane][(v0 + size.x)];
+                }
+                if ((v0 + size.x + 1) < (area)) {
+                    p.w = unquantized[plane][(v0 + size.x + 1)];
+                }
+                outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
+            }
+}
+
+int FindLayout(uint mode) {
+    // Maps the block mode bits onto the layout index (0-9) that DecodeBlockInfo() switches on.
+    const bool bit2 = (mode & 4) != 0;
+    const bool bit3 = (mode & 8) != 0;
+    const bool bit5 = (mode & 0x20) != 0;
+    const bool bit7 = (mode & 0x80) != 0;
+    const bool bit8 = (mode & 0x100) != 0;
+    if ((mode & 3) != 0) {
+        // Either of the low two bits set: layouts 0-4.
+        if (!bit3) {
+            // Bit 3 clear: bit 2 chooses between layouts 0 and 1.
+            return bit2 ? 1 : 0;
+        }
+        if (!bit2) {
+            return 2;
+        }
+        // Bits 3 and 2 set: bit 8 chooses between layouts 3 and 4.
+        return bit8 ? 4 : 3;
+    }
+    // Low two bits clear: layouts 5-9.
+    if (!bit8) {
+        // Bit 8 clear: bit 7 chooses between layouts 5 and 6.
+        return bit7 ? 6 : 5;
+    }
+    if (!bit7) {
+        return 9;
+    }
+    // Bits 8 and 7 set: bit 5 chooses between layouts 7 and 8.
+    return bit5 ? 8 : 7;
+}
+
+TexelWeightParams DecodeBlockInfo(uint block_index) { // NOTE: block_index is not used here
+    TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false);
+    uint mode = StreamBits(11); // 11-bit block mode
+    if ((mode & 0x1ff) == 0x1fc) { // void-extent pattern in the low nine bits
+        if ((mode & 0x200) != 0) {
+            params.VoidExtentHDR = true;
+        } else {
+            params.VoidExtentLDR = true;
+        }
+        if ((mode & 0x400) == 0 || StreamBits(1) == 0) { // bit 10 and the next bit must be set
+            params.Error = true;
+        }
+        return params;
+    }
+    if ((mode & 0xf) == 0) { // rejected: low four bits all zero
+        params.Error = true;
+        return params;
+    }
+    if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { // rejected: bits 0-1 clear, bits 6-8 set
+        params.Error = true;
+        return params;
+    }
+    uint A, B; // size fields extracted from the mode; their bit positions vary per layout
+    uint mode_layout = FindLayout(mode);
+    switch (mode_layout) {
+    case 0:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(B + 4, A + 2);
+        break;
+    case 1:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(B + 8, A + 2);
+        break;
+    case 2:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(A + 2, B + 8);
+        break;
+    case 3:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x1;
+        params.size = uvec2(A + 2, B + 6);
+        break;
+    case 4:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x1;
+        params.size = uvec2(B + 2, A + 2);
+        break;
+    case 5:
+        A = (mode >> 5) & 0x3;
+        params.size = uvec2(12, A + 2);
+        break;
+    case 6:
+        A = (mode >> 5) & 0x3;
+        params.size = uvec2(A + 2, 12);
+        break;
+    case 7:
+        params.size = uvec2(6, 10);
+        break;
+    case 8:
+        params.size = uvec2(10, 6);
+        break;
+    case 9:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 9) & 0x3;
+        params.size = uvec2(A + 6, B + 6);
+        break;
+    default: // FindLayout() only returns 0-9, so this is a safety net
+        params.Error = true;
+        break;
+    }
+    params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); // layout 9 never dual-plane
+    uint weight_index = (mode & 0x10) != 0 ? 1 : 0; // low bit of the weight range selector
+    if (mode_layout < 5) {
+        weight_index |= (mode & 0x3) << 1; // upper two selector bits come from mode bits 0-1
+    } else {
+        weight_index |= (mode & 0xc) >> 1; // upper two selector bits come from mode bits 2-3
+    }
+    weight_index -= 2; // the max_weights tables below start at selector value 2
+    if ((mode_layout != 9) && ((mode & 0x200) != 0)) { // bit 9 selects the larger weight ranges
+        const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31);
+        params.max_weight = max_weights[weight_index];
+    } else {
+        const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7);
+        params.max_weight = max_weights[weight_index];
+    }
+    return params;
+}
+
+void FillError(ivec3 coord) {
+    for (uint j = 0; j < block_dims.y; j++) {
+        for (uint i = 0; i < block_dims.x; i++) {
+            imageStore(dest_image, coord.xy + ivec2(i, j), vec4(1.0, 1.0, 0.0, 1.0));
+        }
+    }
+    return;
+}
+
+void FillVoidExtentLDR(ivec3 coord, uint block_index) {
+    for (int i = 0; i < 4; i++) {
+        StreamBits(13);
+    }
+
+    uint r_u = StreamBits(16);
+    uint g_u = StreamBits(16);
+    uint b_u = StreamBits(16);
+    uint a_u = StreamBits(16);
+    float a = float(a_u) / 65535.0f;
+    float r = float(r_u) / 65535.0f;
+    float g = float(g_u) / 65535.0f;
+    float b = float(b_u) / 65535.0f;
+    for (uint j = 0; j < block_dims.y; j++) {
+        for (uint i = 0; i < block_dims.x; i++) {
+            imageStore(dest_image, coord.xy + ivec2(i, j), vec4(r, g, b, a));
+        }
+    }
+}
+
+void DecompressBlock(ivec3 coord, uint block_index) {
+    TexelWeightParams params;
+    params = DecodeBlockInfo(block_index);
+    if (params.Error) {
+        FillError(coord);
+        return;
+    }
+    if (params.VoidExtentHDR) {
+        FillError(coord);
+        return;
+    }
+    if (params.VoidExtentLDR) {
+        FillVoidExtentLDR(coord, block_index);
+        return;
+    }
+    if (params.size.x > block_dims.x || params.size.y > block_dims.y) {
+        FillError(coord);
+        return;
+    }
+    uint num_partitions = StreamBits(2) + 1;
+    if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
+        FillError(coord);
+        return;
+    }
+    int plane_index = -1;
+    uint partition_index = 1;
+    uvec4 color_endpoint_mode = uvec4(0);
+    uint ced_pointer = 0;
+    uint base_cem = 0;
+    if (num_partitions == 1) {
+        color_endpoint_mode[0] = StreamBits(4);
+        partition_index = 0;
+    } else {
+        partition_index = StreamBits(10);
+        base_cem = StreamBits(6);
+    }
+    uint base_mode = base_cem & 3;
+    uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight);
+    uint remaining_bits = 128 - weight_bits - total_bitsread;
+    uint extra_cem_bits = 0;
+    if (base_mode > 0) {
+        switch (num_partitions) {
+        case 2:
+            extra_cem_bits += 2;
+            break;
+        case 3:
+            extra_cem_bits += 5;
+            break;
+        case 4:
+            extra_cem_bits += 8;
+            break;
+        default:
+            return;
+        }
+    }
+    remaining_bits -= extra_cem_bits;
+    uint plane_selector_bits = 0;
+    if (params.dual_plane) {
+        plane_selector_bits = 2;
+    }
+    remaining_bits -= plane_selector_bits;
+    // Read color data...
+    uint color_data_bits = remaining_bits;
+    while (remaining_bits > 0) {
+        uint nb = min(remaining_bits, 8);
+        uint b = StreamBits(nb);
+        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, 8));
+        ced_pointer++;
+        remaining_bits -= nb;
+    }
+    plane_index = int(StreamBits(plane_selector_bits));
+    if (base_mode > 0) {
+        uint extra_cem = StreamBits(extra_cem_bits);
+        uint cem = (extra_cem << 6) | base_cem;
+        cem >>= 2;
+        uint C[4] = {0, 0, 0, 0};
+        for (uint i = 0; i < num_partitions; i++) {
+            C[i] = cem & 1;
+            cem >>= 1;
+        }
+        uint M[4] = {0, 0, 0, 0};
+        for (uint i = 0; i < num_partitions; i++) {
+            M[i] = cem & 3;
+            cem >>= 2;
+        }
+        for (uint i = 0; i < num_partitions; i++) {
+            color_endpoint_mode[i] = base_mode;
+            if ((C[i]) == 0) {
+                color_endpoint_mode[i] -= 1;
+            }
+            color_endpoint_mode[i] <<= 2;
+            color_endpoint_mode[i] |= M[i];
+        }
+    } else if (num_partitions > 1) {
+        uint cem = base_cem >> 2;
+        for (uint i = 0; i < num_partitions; i++) {
+            color_endpoint_mode[i] = cem;
+        }
+    }
+
+    uint color_values[32]; // Four values, two endpoints, four maximum paritions
+    DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits);
+    uvec4 endpoints[4][2];
+    for (uint i = 0; i < num_partitions; i++) {
+        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]);
+    }
+    for (uint i = 0; i < 16; i++) {
+        texel_weight_data[i] = local_buff[i];
+    }
+    for (uint i = 0; i < 8; i++) {
+#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16
+        uint a = REVERSE_BYTE(texel_weight_data[i]);
+        uint b = REVERSE_BYTE(texel_weight_data[15 - i]);
+#undef REVERSE_BYTE
+        texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8));
+        texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8));
+    }
+    uint clear_byte_start =
+        (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1;
+    texel_weight_data[clear_byte_start - 1] =
+        texel_weight_data[clear_byte_start - 1] &
+        uint(
+            ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
+    for (uint i = 0; i < 16 - clear_byte_start; i++) {
+        texel_weight_data[clear_byte_start + i] = uint(0U);
+    }
+    texel_flag = true; // use texel "vector" and bit stream in integer decoding
+    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
+    uint weights[2][144];
+    UnquantizeTexelWeights(weights, params.dual_plane, params.size);
+    for (uint j = 0; j < block_dims.y; j++) {
+        for (uint i = 0; i < block_dims.x; i++) {
+            uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
+                                                     (block_dims.y * block_dims.x) < 32);
+            vec4 p;
+            uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
+            uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
+            uvec4 plane_vec = uvec4(0);
+            uvec4 weight_vec = uvec4(0);
+            for (uint c = 0; c < 4; c++) {
+                if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
+                    plane_vec[c] = 1;
+                }
+                weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i];
+            }
+            vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6);
+            p = (Cf / 65535.0);
+            imageStore(dest_image, coord.xy + ivec2(i, j), p.gbar);
+        }
+    }
+}
+
+void main() {
+    uvec3 pos = gl_GlobalInvocationID + origin;
+    pos.x <<= bytes_per_block_log2;
+
+    // Read as soon as possible due to its latency
+    const uint swizzle = SwizzleOffset(pos.xy);
+
+    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
+
+    uint offset = 0;
+    offset += layer * layer_stride;
+    offset += (block_y >> block_height) * block_size;
+    offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
+    offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
+    offset += swizzle;
+
+    const ivec3 invocation_destination = ivec3(gl_GlobalInvocationID + destination);
+    const ivec3 coord = ivec3(invocation_destination * uvec3(block_dims, 1.0));
+    uint block_index =
+        layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;
+    current_index = 0;
+    bitsread = 0;
+    for (int i = 0; i < 16; i++) {
+        local_buff[i] = ReadTexel(offset + i);
+    }
+    DecompressBlock(coord, block_index);
+}
-- 
cgit v1.2.3


From 20eb368e147e1c27f05d6923c51596f8dfe24e89 Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Sat, 13 Feb 2021 16:49:24 -0500
Subject: renderer_vulkan: Accelerate ASTC decoding

Co-Authored-By: Rodrigo Locatti <reinuseslisp@airmail.cc>
---
 src/video_core/host_shaders/astc_decoder.comp | 43 ++++++++++++++-------------
 1 file changed, 22 insertions(+), 21 deletions(-)

(limited to 'src/video_core/host_shaders/astc_decoder.comp')

diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index 070190a5c..2ddac2e1d 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -16,7 +16,7 @@
 #define BINDING_7_TO_8_BUFFER 4
 #define BINDING_8_TO_8_BUFFER 5
 #define BINDING_BYTE_TO_16_BUFFER 6
-#define BINDING_OUTPUT_IMAGE 3
+#define BINDING_OUTPUT_IMAGE 7
 
 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
 
@@ -85,7 +85,26 @@ layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
 layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 {
     uint astc_data[];
 };
-layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly image2D dest_image;
+
+// ASTC Encodings data
+layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
+    EncodingData encoding_values[];
+};
+// ASTC Precompiled tables
+layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
+    uint REPLICATE_6_BIT_TO_8_TABLE[];
+};
+layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
+    uint REPLICATE_7_BIT_TO_8_TABLE[];
+};
+layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
+    uint REPLICATE_8_BIT_TO_8_TABLE[];
+};
+layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
+    uint REPLICATE_BYTE_TO_16_TABLE[];
+};
+
+layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2D dest_image;
 
 const uint GOB_SIZE_X = 64;
 const uint GOB_SIZE_Y = 8;
@@ -109,23 +128,6 @@ uint ReadTexel(uint offset) {
     return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8);
 }
 
-// ASTC Encodings data
-layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
-    EncodingData encoding_values[256];
-};
-// ASTC Precompiled tables
-layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
-    uint REPLICATE_6_BIT_TO_8_TABLE[];
-};
-layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
-    uint REPLICATE_7_BIT_TO_8_TABLE[];
-};
-layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
-    uint REPLICATE_8_BIT_TO_8_TABLE[];
-};
-layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
-    uint REPLICATE_BYTE_TO_16_TABLE[];
-};
 
 const int BLOCK_SIZE_IN_BYTES = 16;
 
@@ -1275,8 +1277,7 @@ void main() {
     offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
     offset += swizzle;
 
-    const ivec3 invocation_destination = ivec3(gl_GlobalInvocationID + destination);
-    const ivec3 coord = ivec3(invocation_destination * uvec3(block_dims, 1.0));
+    const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0));
     uint block_index =
         layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;
     current_index = 0;
-- 
cgit v1.2.3


From c7553abe894ddac84fe8417a12ec51d5ab60dc58 Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Tue, 19 Jan 2021 19:54:28 -0500
Subject: astc_decoder: Fix out of bounds memory access

Resolves a crash with some anomalous textures found in Astral Chain.
---
 src/video_core/host_shaders/astc_decoder.comp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'src/video_core/host_shaders/astc_decoder.comp')

diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index 2ddac2e1d..5be716309 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -339,6 +339,9 @@ uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool sma
 }
 
 uint ReadBit() {
+    if (current_index >= local_buff.length()) {
+        return 0;
+    }
     uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
     bitsread++;
     total_bitsread++;
@@ -1170,12 +1173,17 @@ void DecompressBlock(ivec3 coord, uint block_index) {
         plane_selector_bits = 2;
     }
     remaining_bits -= plane_selector_bits;
+    if (remaining_bits > 128) {
+        // Bad data, more remaining bits than 4 bytes
+        // return early
+        return;
+    }
     // Read color data...
     uint color_data_bits = remaining_bits;
     while (remaining_bits > 0) {
-        uint nb = min(remaining_bits, 8);
+        int nb = int(min(remaining_bits, 8U));
         uint b = StreamBits(nb);
-        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, 8));
+        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
         ced_pointer++;
         remaining_bits -= nb;
     }
-- 
cgit v1.2.3


From 2f30c105849c214345e2201f4bd6f9b4b76ab4a1 Mon Sep 17 00:00:00 2001
From: Rodrigo Locatti <reinuseslisp@airmail.cc>
Date: Sat, 13 Feb 2021 16:08:50 -0500
Subject: astc_decoder: Reimplement Layers

Reimplements the approach to decoding layers in the compute shader. Fixes multilayer ASTC decoding when using Vulkan.
---
 src/video_core/host_shaders/astc_decoder.comp | 33 ++++++++++++---------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'src/video_core/host_shaders/astc_decoder.comp')

diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index 5be716309..b903a2d37 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -39,17 +39,15 @@ layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
 BEGIN_PUSH_CONSTANTS
 UNIFORM(0) uvec2 num_image_blocks;
 UNIFORM(1) uvec2 block_dims;
-UNIFORM(2) uint layer;
-
-UNIFORM(3) uvec3 origin;
-UNIFORM(4) ivec3 destination;
-UNIFORM(5) uint bytes_per_block_log2;
-UNIFORM(6) uint layer_stride;
-UNIFORM(7) uint block_size;
-UNIFORM(8) uint x_shift;
-UNIFORM(9) uint block_height;
-UNIFORM(10) uint block_height_mask;
 
+UNIFORM(2) uvec3 origin;
+UNIFORM(3) ivec3 destination;
+UNIFORM(4) uint bytes_per_block_log2;
+UNIFORM(5) uint layer_stride;
+UNIFORM(6) uint block_size;
+UNIFORM(7) uint x_shift;
+UNIFORM(8) uint block_height;
+UNIFORM(9) uint block_height_mask;
 END_PUSH_CONSTANTS
 
 uint current_index = 0;
@@ -82,7 +80,7 @@ layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
     uint swizzle_table[];
 };
 
-layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 {
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
     uint astc_data[];
 };
 
@@ -104,7 +102,7 @@ layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BY
     uint REPLICATE_BYTE_TO_16_TABLE[];
 };
 
-layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2D dest_image;
+layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image;
 
 const uint GOB_SIZE_X = 64;
 const uint GOB_SIZE_Y = 8;
@@ -1086,10 +1084,9 @@ TexelWeightParams DecodeBlockInfo(uint block_index) {
 void FillError(ivec3 coord) {
     for (uint j = 0; j < block_dims.y; j++) {
         for (uint i = 0; i < block_dims.x; i++) {
-            imageStore(dest_image, coord.xy + ivec2(i, j), vec4(1.0, 1.0, 0.0, 1.0));
+            imageStore(dest_image, coord + ivec3(i, j, 0), vec4(1.0, 1.0, 0.0, 1.0));
         }
     }
-    return;
 }
 
 void FillVoidExtentLDR(ivec3 coord, uint block_index) {
@@ -1107,7 +1104,7 @@ void FillVoidExtentLDR(ivec3 coord, uint block_index) {
     float b = float(b_u) / 65535.0f;
     for (uint j = 0; j < block_dims.y; j++) {
         for (uint i = 0; i < block_dims.x; i++) {
-            imageStore(dest_image, coord.xy + ivec2(i, j), vec4(r, g, b, a));
+            imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a));
         }
     }
 }
@@ -1264,7 +1261,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
             }
             vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6);
             p = (Cf / 65535.0);
-            imageStore(dest_image, coord.xy + ivec2(i, j), p.gbar);
+            imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
         }
     }
 }
@@ -1279,7 +1276,7 @@ void main() {
     const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
 
     uint offset = 0;
-    offset += layer * layer_stride;
+    offset += pos.z * layer_stride;
     offset += (block_y >> block_height) * block_size;
     offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
     offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
@@ -1287,7 +1284,7 @@ void main() {
 
     const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0));
     uint block_index =
-        layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;
+        pos.z * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;
     current_index = 0;
     bitsread = 0;
     for (int i = 0; i < 16; i++) {
-- 
cgit v1.2.3


From 2f83d9a61bca42d9ef24074beb2b11b19bd4cecd Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Thu, 25 Mar 2021 16:53:51 -0400
Subject: astc_decoder: Refactor for style and more efficient memory use

---
 src/video_core/host_shaders/astc_decoder.comp | 569 ++++++++++++++------------
 1 file changed, 307 insertions(+), 262 deletions(-)

(limited to 'src/video_core/host_shaders/astc_decoder.comp')

diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
index b903a2d37..703e34587 100644
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -9,13 +9,13 @@
 #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
 #define END_PUSH_CONSTANTS };
 #define UNIFORM(n)
-#define BINDING_SWIZZLE_BUFFER 0
-#define BINDING_INPUT_BUFFER 1
-#define BINDING_ENC_BUFFER 2
-#define BINDING_6_TO_8_BUFFER 3
-#define BINDING_7_TO_8_BUFFER 4
-#define BINDING_8_TO_8_BUFFER 5
-#define BINDING_BYTE_TO_16_BUFFER 6
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_ENC_BUFFER 1
+#define BINDING_6_TO_8_BUFFER 2
+#define BINDING_7_TO_8_BUFFER 3
+#define BINDING_8_TO_8_BUFFER 4
+#define BINDING_BYTE_TO_16_BUFFER 5
+#define BINDING_SWIZZLE_BUFFER 6
 #define BINDING_OUTPUT_IMAGE 7
 
 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
@@ -37,28 +37,16 @@
 layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
 
 BEGIN_PUSH_CONSTANTS
-UNIFORM(0) uvec2 num_image_blocks;
 UNIFORM(1) uvec2 block_dims;
 
-UNIFORM(2) uvec3 origin;
-UNIFORM(3) ivec3 destination;
-UNIFORM(4) uint bytes_per_block_log2;
-UNIFORM(5) uint layer_stride;
-UNIFORM(6) uint block_size;
-UNIFORM(7) uint x_shift;
-UNIFORM(8) uint block_height;
-UNIFORM(9) uint block_height_mask;
+UNIFORM(2) uint bytes_per_block_log2;
+UNIFORM(3) uint layer_stride;
+UNIFORM(4) uint block_size;
+UNIFORM(5) uint x_shift;
+UNIFORM(6) uint block_height;
+UNIFORM(7) uint block_height_mask;
 END_PUSH_CONSTANTS
 
-uint current_index = 0;
-int bitsread = 0;
-uint total_bitsread = 0;
-uint local_buff[16];
-
-const int JustBits = 0;
-const int Quint = 1;
-const int Trit = 2;
-
 struct EncodingData {
     uint encoding;
     uint num_bits;
@@ -68,11 +56,11 @@ struct EncodingData {
 
 struct TexelWeightParams {
     uvec2 size;
-    bool dual_plane;
     uint max_weight;
-    bool Error;
-    bool VoidExtentLDR;
-    bool VoidExtentHDR;
+    bool dual_plane;
+    bool error_state;
+    bool void_extent_ldr;
+    bool void_extent_hdr;
 };
 
 // Swizzle data
@@ -116,6 +104,75 @@ const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHI
 
 const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
 
+const int BLOCK_SIZE_IN_BYTES = 16;
+
+const int BLOCK_INFO_ERROR = 0;
+const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
+const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
+const int BLOCK_INFO_NORMAL = 3;
+
+const int JUST_BITS = 0;
+const int QUINT = 1;
+const int TRIT = 2;
+
+// The following constants are expanded variants of the Replicate()
+// function calls corresponding to the following arguments:
+// value: index into the generated table
+// num_bits: the after "REPLICATE" in the table name. i.e. 4 is num_bits in REPLICATE_4.
+// to_bit: the integer after "TO_"
+const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
+const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
+
+const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
+const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
+const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
+const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
+    uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
+const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
+    uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
+           173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
+const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
+const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
+const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
+const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
+    uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
+const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
+    uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
+           47, 49, 51, 53, 55, 57, 59, 61, 63);
+
+// Input ASTC texture globals
+uint current_index = 0;
+int bitsread = 0;
+uint total_bitsread = 0;
+uint local_buff[16];
+
+// Color data globals
+uint color_endpoint_data[16];
+int color_bitsread = 0;
+uint total_color_bitsread = 0;
+int color_index = 0;
+
+// Four values, two endpoints, four maximum paritions
+uint color_values[32];
+int colvals_index = 0;
+
+// Weight data globals
+uint texel_weight_data[16];
+int texel_bitsread = 0;
+uint total_texel_bitsread = 0;
+int texel_index = 0;
+
+bool texel_flag = false;
+
+// Global "vectors" to be pushed into when decoding
+EncodingData result_vector[100];
+int result_index = 0;
+
+EncodingData texel_vector[100];
+int texel_vector_index = 0;
+
+uint unquantized_texel_weights[2][144];
+
 uint SwizzleOffset(uvec2 pos) {
     pos = pos & SWIZZLE_MASK;
     return swizzle_table[pos.y * 64 + pos.x];
@@ -126,21 +183,10 @@ uint ReadTexel(uint offset) {
     return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8);
 }
 
-
-const int BLOCK_SIZE_IN_BYTES = 16;
-
-const int BLOCK_INFO_ERROR = 0;
-const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
-const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
-const int BLOCK_INFO_NORMAL = 3;
-
-// Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
-// is the same as [(numBits - 1):0] and repeats all the way down.
+// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
+// is the same as [(num_bits - 1):0] and repeats all the way down.
 uint Replicate(uint val, uint num_bits, uint to_bit) {
-    if (num_bits == 0) {
-        return 0;
-    }
-    if (to_bit == 0) {
+    if (num_bits == 0 || to_bit == 0) {
         return 0;
     }
     const uint v = val & uint((1 << num_bits) - 1);
@@ -165,26 +211,14 @@ uvec4 ReplicateByteTo16(uvec4 value) {
                  REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]);
 }
 
-const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
 uint ReplicateBitTo7(uint value) {
     return REPLICATE_BIT_TO_7_TABLE[value];
-    ;
 }
 
-const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
 uint ReplicateBitTo9(uint value) {
     return REPLICATE_1_BIT_TO_9_TABLE[value];
 }
 
-const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
-const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
-const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
-const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
-    uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
-const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
-    uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
-           173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
-
 uint FastReplicateTo8(uint value, uint num_bits) {
     switch (num_bits) {
     case 1:
@@ -207,15 +241,6 @@ uint FastReplicateTo8(uint value, uint num_bits) {
     return Replicate(value, num_bits, 8);
 }
 
-const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
-const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
-const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
-const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
-    uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
-const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
-    uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
-           47, 49, 51, 53, 55, 57, 59, 61, 63);
-
 uint FastReplicateTo6(uint value, uint num_bits) {
     switch (num_bits) {
     case 1:
@@ -232,7 +257,23 @@ uint FastReplicateTo6(uint value, uint num_bits) {
     return Replicate(value, num_bits, 6);
 }
 
-uint hash52(uint p) {
+uint Div3Floor(uint v) {
+    return (v * 0x5556) >> 16;
+}
+
+uint Div3Ceil(uint v) {
+    return Div3Floor(v + 2);
+}
+
+uint Div5Floor(uint v) {
+    return (v * 0x3334) >> 16;
+}
+
+uint Div5Ceil(uint v) {
+    return Div5Floor(v + 4);
+}
+
+uint Hash52(uint p) {
     p ^= p >> 15;
     p -= p << 17;
     p += p << 7;
@@ -247,9 +288,9 @@ uint hash52(uint p) {
 }
 
 uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
-    if (1 == partition_count)
+    if (partition_count == 1) {
         return 0;
-
+    }
     if (small_block) {
         x <<= 1;
         y <<= 1;
@@ -258,7 +299,7 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
 
     seed += (partition_count - 1) * 1024;
 
-    uint rnum = hash52(uint(seed));
+    uint rnum = Hash52(uint(seed));
     uint seed1 = uint(rnum & 0xF);
     uint seed2 = uint((rnum >> 4) & 0xF);
     uint seed3 = uint((rnum >> 8) & 0xF);
@@ -318,18 +359,22 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
     c &= 0x3F;
     d &= 0x3F;
 
-    if (partition_count < 4)
+    if (partition_count < 4) {
         d = 0;
-    if (partition_count < 3)
+    }
+    if (partition_count < 3) {
         c = 0;
+    }
 
-    if (a >= b && a >= c && a >= d)
+    if (a >= b && a >= c && a >= d) {
         return 0;
-    else if (b >= c && b >= d)
+    } else if (b >= c && b >= d) {
         return 1;
-    else if (c >= d)
+    } else if (c >= d) {
         return 2;
-    return 3;
+    } else {
+        return 3;
+    }
 }
 
 uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
@@ -341,10 +386,10 @@ uint ReadBit() {
         return 0;
     }
     uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
-    bitsread++;
-    total_bitsread++;
+    ++bitsread;
+    ++total_bitsread;
     if (bitsread == 8) {
-        current_index++;
+        ++current_index;
         bitsread = 0;
     }
     return bit;
@@ -358,36 +403,22 @@ uint StreamBits(uint num_bits) {
     return ret;
 }
 
-// Define color data.
-uint color_endpoint_data[16];
-int color_bitsread = 0;
-uint total_color_bitsread = 0;
-int color_index = 0;
-
-// Define color data.
-uint texel_weight_data[16];
-int texel_bitsread = 0;
-uint total_texel_bitsread = 0;
-int texel_index = 0;
-
-bool texel_flag = false;
-
 uint ReadColorBit() {
     uint bit = 0;
     if (texel_flag) {
         bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
-        texel_bitsread++;
-        total_texel_bitsread++;
+        ++texel_bitsread;
+        ++total_texel_bitsread;
         if (texel_bitsread == 8) {
-            texel_index++;
+            ++texel_index;
             texel_bitsread = 0;
         }
     } else {
         bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
-        color_bitsread++;
-        total_color_bitsread++;
+        ++color_bitsread;
+        ++total_color_bitsread;
         if (color_bitsread == 8) {
-            color_index++;
+            ++color_index;
             color_bitsread = 0;
         }
     }
@@ -402,31 +433,25 @@ uint StreamColorBits(uint num_bits) {
     return ret;
 }
 
-EncodingData result_vector[100];
-int result_index = 0;
-
-EncodingData texel_vector[100];
-int texel_vector_index = 0;
-
 void ResultEmplaceBack(EncodingData val) {
     if (texel_flag) {
         texel_vector[texel_vector_index] = val;
-        texel_vector_index++;
+        ++texel_vector_index;
     } else {
         result_vector[result_index] = val;
-        result_index++;
+        ++result_index;
     }
 }
 
 // Returns the number of bits required to encode n_vals values.
 uint GetBitLength(uint n_vals, uint encoding_index) {
-    uint totalBits = encoding_values[encoding_index].num_bits * n_vals;
-    if (encoding_values[encoding_index].encoding == Trit) {
-        totalBits += (n_vals * 8 + 4) / 5;
-    } else if (encoding_values[encoding_index].encoding == Quint) {
-        totalBits += (n_vals * 7 + 2) / 3;
+    uint total_bits = encoding_values[encoding_index].num_bits * n_vals;
+    if (encoding_values[encoding_index].encoding == TRIT) {
+        total_bits += Div5Ceil(n_vals * 8);
+    } else if (encoding_values[encoding_index].encoding == QUINT) {
+        total_bits += Div3Ceil(n_vals * 7);
     }
-    return totalBits;
+    return total_bits;
 }
 
 uint GetNumWeightValues(uvec2 size, bool dual_plane) {
@@ -459,7 +484,7 @@ uint BitsOp(uint bits, uint start, uint end) {
     return ((bits >> start) & mask);
 }
 
-void DecodeQuintBlock(uint num_bits) { // Value number of bits
+void DecodeQuintBlock(uint num_bits) {
     uint m[3];
     uint q[3];
     uint Q;
@@ -483,7 +508,6 @@ void DecodeQuintBlock(uint num_bits) { // Value number of bits
             q[2] = BitsOp(Q, 5, 6);
             C = BitsOp(Q, 0, 4);
         }
-
         if (BitsOp(C, 0, 2) == 5) {
             q[1] = 4;
             q[0] = BitsOp(C, 3, 4);
@@ -492,10 +516,9 @@ void DecodeQuintBlock(uint num_bits) { // Value number of bits
             q[0] = BitsOp(C, 0, 2);
         }
     }
-
     for (uint i = 0; i < 3; i++) {
         EncodingData val;
-        val.encoding = Quint;
+        val.encoding = QUINT;
         val.num_bits = num_bits;
         val.bit_value = m[i];
         val.quint_trit_value = q[i];
@@ -547,29 +570,28 @@ void DecodeTritBlock(uint num_bits) {
     }
     for (uint i = 0; i < 5; i++) {
         EncodingData val;
-        val.encoding = Trit;
+        val.encoding = TRIT;
         val.num_bits = num_bits;
         val.bit_value = m[i];
         val.quint_trit_value = t[i];
         ResultEmplaceBack(val);
     }
 }
+
 void DecodeIntegerSequence(uint max_range, uint num_values) {
     EncodingData val = encoding_values[max_range];
     uint vals_decoded = 0;
     while (vals_decoded < num_values) {
         switch (val.encoding) {
-        case Quint:
+        case QUINT:
             DecodeQuintBlock(val.num_bits);
             vals_decoded += 3;
             break;
-
-        case Trit:
+        case TRIT:
             DecodeTritBlock(val.num_bits);
             vals_decoded += 5;
             break;
-
-        case JustBits:
+        case JUST_BITS:
             val.bit_value = StreamColorBits(val.num_bits);
             ResultEmplaceBack(val);
             vals_decoded++;
@@ -578,8 +600,7 @@ void DecodeIntegerSequence(uint max_range, uint num_values) {
     }
 }
 
-void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions,
-                       uint color_data_bits) {
+void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
     uint num_values = 0;
     for (uint i = 0; i < num_partitions; i++) {
         num_values += ((modes[i] >> 2) + 1) << 1;
@@ -587,21 +608,21 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio
     int range = 256;
     while (--range > 0) {
         EncodingData val = encoding_values[range];
-        uint bitLength = GetBitLength(num_values, range);
-        if (bitLength <= color_data_bits) {
+        uint bit_length = GetBitLength(num_values, range);
+        if (bit_length <= color_data_bits) {
             while (--range > 0) {
                 EncodingData newval = encoding_values[range];
-                if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) {
+                if (newval.encoding != val.encoding || newval.num_bits != val.num_bits) {
-                    break;
+                    break; // first entry that differs from val in either field ends the scan
                 }
             }
-            range++;
+            ++range;
             break;
         }
     }
     DecodeIntegerSequence(range, num_values);
     uint out_index = 0;
-    for (int itr = 0; itr < result_index; itr++) {
+    for (int itr = 0; itr < result_index; ++itr) {
         if (out_index >= num_values) {
             break;
         }
@@ -611,77 +632,83 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio
         uint A = 0, B = 0, C = 0, D = 0;
         A = ReplicateBitTo9((bitval & 1));
         switch (val.encoding) {
-        case JustBits:
+        case JUST_BITS:
             color_values[out_index++] = FastReplicateTo8(bitval, bitlen);
             break;
-        case Trit: {
+        case TRIT: {
             D = val.quint_trit_value;
             switch (bitlen) {
-            case 1: {
+            case 1:
                 C = 204;
-            } break;
+                break;
             case 2: {
                 C = 93;
                 uint b = (bitval >> 1) & 1;
                 B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
-            } break;
-
+                break;
+            }
             case 3: {
                 C = 44;
                 uint cb = (bitval >> 1) & 3;
                 B = (cb << 7) | (cb << 2) | cb;
-            } break;
-
+                break;
+            }
             case 4: {
                 C = 22;
                 uint dcb = (bitval >> 1) & 7;
                 B = (dcb << 6) | dcb;
-            } break;
-
+                break;
+            }
             case 5: {
                 C = 11;
                 uint edcb = (bitval >> 1) & 0xF;
                 B = (edcb << 5) | (edcb >> 2);
-            } break;
-
+                break;
+            }
             case 6: {
                 C = 5;
                 uint fedcb = (bitval >> 1) & 0x1F;
                 B = (fedcb << 4) | (fedcb >> 4);
-            } break;
+                break;
             }
-        } break;
-        case Quint: {
+            }
+            break;
+        }
+        case QUINT: {
             D = val.quint_trit_value;
             switch (bitlen) {
-            case 1: {
+            case 1:
                 C = 113;
-            } break;
+                break;
             case 2: {
                 C = 54;
                 uint b = (bitval >> 1) & 1;
                 B = (b << 8) | (b << 3) | (b << 2);
-            } break;
+                break;
+            }
             case 3: {
                 C = 26;
                 uint cb = (bitval >> 1) & 3;
                 B = (cb << 7) | (cb << 1) | (cb >> 1);
-            } break;
+                break;
+            }
             case 4: {
                 C = 13;
                 uint dcb = (bitval >> 1) & 7;
                 B = (dcb << 6) | (dcb >> 1);
-            } break;
+                break;
+            }
             case 5: {
                 C = 6;
                 uint edcb = (bitval >> 1) & 0xF;
                 B = (edcb << 5) | (edcb >> 3);
-            } break;
+                break;
             }
-        } break;
+            }
+            break;
         }
-
-        if (val.encoding != JustBits) {
+        }
+        if (val.encoding != JUST_BITS) {
             uint T = (D * C) + B;
             T ^= A;
             T = (A & 0x80) | (T >> 2);
@@ -689,30 +716,31 @@ void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitio
         }
     }
 }
+
 ivec2 BitTransferSigned(int a, int b) {
     ivec2 transferred;
-    transferred[1] = b >> 1;
-    transferred[1] |= a & 0x80;
-    transferred[0] = a >> 1;
-    transferred[0] &= 0x3F;
-    if ((transferred[0] & 0x20) > 0) {
-        transferred[0] -= 0x40;
+    transferred.y = b >> 1; // y starts as b shifted down by one bit
+    transferred.y |= a & 0x80; // bit 7 of a is carried over into bit 7 of y
+    transferred.x = a >> 1;
+    transferred.x &= 0x3F; // x keeps bits 1..6 of a as a 6-bit field
+    if ((transferred.x & 0x20) > 0) { // bit 5 set: the 6-bit field is negative
+        transferred.x -= 0x40; // sign-extend from 6 bits, giving a value in [-32, 31]
     }
     return transferred;
 }
 
 uvec4 ClampByte(ivec4 color) {
-    for (uint i = 0; i < 4; i++) {
+    for (uint i = 0; i < 4; ++i) { // clamp each of the four components to [0, 255]
         color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]);
     }
     return uvec4(color);
 }
+
 ivec4 BlueContract(int a, int r, int g, int b) {
     return ivec4(a, (r + b) >> 1, (g + b) >> 1, b);
 }
-int colvals_index = 0;
-void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32],
-                      uint color_endpoint_mode) {
+
+void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) {
 #define READ_UINT_VALUES(N)                                                                        \
     uint v[N];                                                                                     \
     for (uint i = 0; i < N; i++) {                                                                 \
@@ -730,113 +758,120 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32],
         READ_UINT_VALUES(2)
         ep1 = uvec4(0xFF, v[0], v[0], v[0]);
         ep2 = uvec4(0xFF, v[1], v[1], v[1]);
-    } break;
-
+        break;
+    }
     case 1: {
         READ_UINT_VALUES(2)
         uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
-        uint L1 = max(L0 + (v[1] & 0x3F), 0xFFU);
+        uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); // saturate at 255; max() forced L1 to >= 255
         ep1 = uvec4(0xFF, L0, L0, L0);
         ep2 = uvec4(0xFF, L1, L1, L1);
-    } break;
-
+        break;
+    }
     case 4: {
         READ_UINT_VALUES(4)
         ep1 = uvec4(v[2], v[0], v[0], v[0]);
         ep2 = uvec4(v[3], v[1], v[1], v[1]);
-    } break;
-
+        break;
+    }
     case 5: {
         READ_INT_VALUES(4)
         ivec2 transferred = BitTransferSigned(v[1], v[0]);
-        v[1] = transferred[0];
-        v[0] = transferred[1];
+        v[1] = transferred.x;
+        v[0] = transferred.y;
         transferred = BitTransferSigned(v[3], v[2]);
-        v[3] = transferred[0];
-        v[2] = transferred[1];
+        v[3] = transferred.x;
+        v[2] = transferred.y;
         ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0]));
-        ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1]));
-    } break;
-
+        ep2 = ClampByte(ivec4(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]));
+        break;
+    }
     case 6: {
         READ_UINT_VALUES(4)
-        ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep1 = uvec4(0xFF, (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8);
         ep2 = uvec4(0xFF, v[0], v[1], v[2]);
-    } break;
-
+        break;
+    }
     case 8: {
         READ_UINT_VALUES(6)
-        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+        if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) {
             ep1 = uvec4(0xFF, v[0], v[2], v[4]);
             ep2 = uvec4(0xFF, v[1], v[3], v[5]);
         } else {
             ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
             ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
         }
-    } break;
-
+        break;
+    }
     case 9: {
         READ_INT_VALUES(6)
         ivec2 transferred = BitTransferSigned(v[1], v[0]);
-        v[1] = transferred[0];
-        v[0] = transferred[1];
+        v[1] = transferred.x;
+        v[0] = transferred.y;
         transferred = BitTransferSigned(v[3], v[2]);
-        v[3] = transferred[0];
-        v[2] = transferred[1];
+        v[3] = transferred.x;
+        v[2] = transferred.y;
         transferred = BitTransferSigned(v[5], v[4]);
-        v[5] = transferred[0];
-        v[4] = transferred[1];
-        if (v[1] + v[3] + v[5] >= 0) {
+        v[5] = transferred.x;
+        v[4] = transferred.y;
+        if ((v[1] + v[3] + v[5]) >= 0) {
             ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
             ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
         } else {
             ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
             ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
         }
-    } break;
-
+        break;
+    }
     case 10: {
         READ_UINT_VALUES(6)
-        ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep1 = uvec4(v[4], (v[0] * v[3]) >> 8, (v[1] * v[3]) >> 8, (v[2] * v[3]) >> 8);
         ep2 = uvec4(v[5], v[0], v[1], v[2]);
-    } break;
-
+        break;
+    }
     case 12: {
         READ_UINT_VALUES(8)
-        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+        if ((v[1] + v[3] + v[5]) >= (v[0] + v[2] + v[4])) {
             ep1 = uvec4(v[6], v[0], v[2], v[4]);
             ep2 = uvec4(v[7], v[1], v[3], v[5]);
         } else {
             ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
             ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
         }
-    } break;
-
+        break;
+    }
     case 13: {
         READ_INT_VALUES(8)
         ivec2 transferred = BitTransferSigned(v[1], v[0]);
-        v[1] = transferred[0];
-        v[0] = transferred[1];
+        v[1] = transferred.x;
+        v[0] = transferred.y;
         transferred = BitTransferSigned(v[3], v[2]);
-        v[3] = transferred[0];
-        v[2] = transferred[1];
+        v[3] = transferred.x;
+        v[2] = transferred.y;
 
         transferred = BitTransferSigned(v[5], v[4]);
-        v[5] = transferred[0];
-        v[4] = transferred[1];
+        v[5] = transferred.x;
+        v[4] = transferred.y;
 
         transferred = BitTransferSigned(v[7], v[6]);
-        v[7] = transferred[0];
-        v[6] = transferred[1];
+        v[7] = transferred.x;
+        v[6] = transferred.y;
 
-        if (v[1] + v[3] + v[5] >= 0) {
+        if ((v[1] + v[3] + v[5]) >= 0) {
             ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
             ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
         } else {
             ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
             ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
         }
-    } break;
+        break;
+    }
+    default: {
+        // HDR mode, or more likely a bug computing the color_endpoint_mode
+        ep1 = uvec4(0xFF, 0xFF, 0, 0);
+        ep2 = uvec4(0xFF, 0xFF, 0, 0);
+        break;
+    }
     }
 #undef READ_UINT_VALUES
 #undef READ_INT_VALUES
@@ -849,52 +884,61 @@ uint UnquantizeTexelWeight(EncodingData val) {
     uint B = 0, C = 0, D = 0;
     uint result = 0;
     switch (val.encoding) {
-    case JustBits:
+    case JUST_BITS:
         result = FastReplicateTo6(bitval, bitlen);
         break;
-    case Trit: {
+    case TRIT: {
         D = val.quint_trit_value;
         switch (bitlen) {
         case 0: {
             uint results[3] = {0, 32, 63};
             result = results[D];
-        } break;
+            break;
+        }
         case 1: {
             C = 50;
-        } break;
+            break;
+        }
         case 2: {
             C = 23;
             uint b = (bitval >> 1) & 1;
             B = (b << 6) | (b << 2) | b;
-        } break;
+            break;
+        }
         case 3: {
             C = 11;
             uint cb = (bitval >> 1) & 3;
             B = (cb << 5) | cb;
-        } break;
+            break;
+        }
         default:
             break;
         }
-    } break;
-    case Quint: {
+        break;
+    }
+    case QUINT: {
         D = val.quint_trit_value;
         switch (bitlen) {
         case 0: {
             uint results[5] = {0, 16, 32, 47, 63};
             result = results[D];
-        } break;
+            break;
+        }
         case 1: {
             C = 28;
-        } break;
+            break;
+        }
         case 2: {
             C = 13;
             uint b = (bitval >> 1) & 1;
             B = (b << 6) | (b << 1);
-        } break;
+            break;
         }
-    } break;
+        }
+        break;
     }
-    if (val.encoding != JustBits && bitlen > 0) {
+    }
+    if (val.encoding != JUST_BITS && bitlen > 0) {
         result = D * C + B;
         result ^= A;
         result = (A & 0x20) | (result >> 2);
@@ -905,7 +949,7 @@ uint UnquantizeTexelWeight(EncodingData val) {
     return result;
 }
 
-void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) {
+void UnquantizeTexelWeights(bool dual_plane, uvec2 size) {
     uint weight_idx = 0;
     uint unquantized[2][144];
     uint area = size.x * size.y;
@@ -921,11 +965,12 @@ void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 s
         if (++weight_idx >= (area))
             break;
     }
-    uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
-    uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
-    uint kPlaneScale = dual_plane ? 2 : 1;
-    for (uint plane = 0; plane < kPlaneScale; plane++)
-        for (uint t = 0; t < block_dims.y; t++)
+
+    const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
+    const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
+    const uint k_plane_scale = dual_plane ? 2 : 1;
+    for (uint plane = 0; plane < k_plane_scale; plane++) {
+        for (uint t = 0; t < block_dims.y; t++) {
             for (uint s = 0; s < block_dims.x; s++) {
                 uint cs = Ds * s;
                 uint ct = Dt * t;
@@ -955,8 +1000,10 @@ void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 s
                 if ((v0 + size.x + 1) < (area)) {
                     p.w = unquantized[plane][(v0 + size.x + 1)];
                 }
-                outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
+                unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
             }
+        }
+    }
 }
 
 int FindLayout(uint mode) {
@@ -991,25 +1038,25 @@ int FindLayout(uint mode) {
 }
 
 TexelWeightParams DecodeBlockInfo(uint block_index) {
-    TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false);
+    TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false);
     uint mode = StreamBits(11);
     if ((mode & 0x1ff) == 0x1fc) {
         if ((mode & 0x200) != 0) {
-            params.VoidExtentHDR = true;
+            params.void_extent_hdr = true;
         } else {
-            params.VoidExtentLDR = true;
+            params.void_extent_ldr = true;
         }
         if ((mode & 0x400) == 0 || StreamBits(1) == 0) {
-            params.Error = true;
+            params.error_state = true;
         }
         return params;
     }
     if ((mode & 0xf) == 0) {
-        params.Error = true;
+        params.error_state = true;
         return params;
     }
     if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) {
-        params.Error = true;
+        params.error_state = true;
         return params;
     }
     uint A, B;
@@ -1060,7 +1107,7 @@ TexelWeightParams DecodeBlockInfo(uint block_index) {
         params.size = uvec2(A + 6, B + 6);
         break;
     default:
-        params.Error = true;
+        params.error_state = true;
         break;
     }
     params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
@@ -1089,11 +1136,8 @@ void FillError(ivec3 coord) {
     }
 }
 
-void FillVoidExtentLDR(ivec3 coord, uint block_index) {
-    for (int i = 0; i < 4; i++) {
-        StreamBits(13);
-    }
-
+void FillVoidExtentLDR(ivec3 coord) {
+    StreamBits(52);
     uint r_u = StreamBits(16);
     uint g_u = StreamBits(16);
     uint b_u = StreamBits(16);
@@ -1110,21 +1154,20 @@ void FillVoidExtentLDR(ivec3 coord, uint block_index) {
 }
 
 void DecompressBlock(ivec3 coord, uint block_index) {
-    TexelWeightParams params;
-    params = DecodeBlockInfo(block_index);
-    if (params.Error) {
+    TexelWeightParams params = DecodeBlockInfo(block_index);
+    if (params.error_state) {
         FillError(coord);
         return;
     }
-    if (params.VoidExtentHDR) {
+    if (params.void_extent_hdr) {
         FillError(coord);
         return;
     }
-    if (params.VoidExtentLDR) {
-        FillVoidExtentLDR(coord, block_index);
+    if (params.void_extent_ldr) {
+        FillVoidExtentLDR(coord);
         return;
     }
-    if (params.size.x > block_dims.x || params.size.y > block_dims.y) {
+    if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) {
         FillError(coord);
         return;
     }
@@ -1139,7 +1182,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
     uint ced_pointer = 0;
     uint base_cem = 0;
     if (num_partitions == 1) {
-        color_endpoint_mode[0] = StreamBits(4);
+        color_endpoint_mode.x = StreamBits(4);
         partition_index = 0;
     } else {
         partition_index = StreamBits(10);
@@ -1181,7 +1224,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
         int nb = int(min(remaining_bits, 8U));
         uint b = StreamBits(nb);
         color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
-        ced_pointer++;
+        ++ced_pointer;
         remaining_bits -= nb;
     }
     plane_index = int(StreamBits(plane_selector_bits));
@@ -1189,20 +1232,20 @@ void DecompressBlock(ivec3 coord, uint block_index) {
         uint extra_cem = StreamBits(extra_cem_bits);
         uint cem = (extra_cem << 6) | base_cem;
         cem >>= 2;
-        uint C[4] = {0, 0, 0, 0};
+        uvec4 C = uvec4(0);
         for (uint i = 0; i < num_partitions; i++) {
-            C[i] = cem & 1;
+            C[i] = (cem & 1);
             cem >>= 1;
         }
-        uint M[4] = {0, 0, 0, 0};
+        uvec4 M = uvec4(0);
         for (uint i = 0; i < num_partitions; i++) {
             M[i] = cem & 3;
             cem >>= 2;
         }
         for (uint i = 0; i < num_partitions; i++) {
             color_endpoint_mode[i] = base_mode;
-            if ((C[i]) == 0) {
-                color_endpoint_mode[i] -= 1;
+            if (C[i] == 0) {
+                --color_endpoint_mode[i];
             }
             color_endpoint_mode[i] <<= 2;
             color_endpoint_mode[i] |= M[i];
@@ -1213,13 +1256,13 @@ void DecompressBlock(ivec3 coord, uint block_index) {
             color_endpoint_mode[i] = cem;
         }
     }
+    DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);
 
-    uint color_values[32]; // Four values, two endpoints, four maximum paritions
-    DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits);
     uvec4 endpoints[4][2];
     for (uint i = 0; i < num_partitions; i++) {
-        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]);
+        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]);
     }
+
     for (uint i = 0; i < 16; i++) {
         texel_weight_data[i] = local_buff[i];
     }
@@ -1238,12 +1281,13 @@ void DecompressBlock(ivec3 coord, uint block_index) {
         uint(
             ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
     for (uint i = 0; i < 16 - clear_byte_start; i++) {
-        texel_weight_data[clear_byte_start + i] = uint(0U);
+        texel_weight_data[clear_byte_start + i] = 0U;
     }
     texel_flag = true; // use texel "vector" and bit stream in integer decoding
     DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
-    uint weights[2][144];
-    UnquantizeTexelWeights(weights, params.dual_plane, params.size);
+
+    UnquantizeTexelWeights(params.dual_plane, params.size);
+
     for (uint j = 0; j < block_dims.y; j++) {
         for (uint i = 0; i < block_dims.x; i++) {
             uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
@@ -1257,9 +1301,9 @@ void DecompressBlock(ivec3 coord, uint block_index) {
                 if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
                     plane_vec[c] = 1;
                 }
-                weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i];
+                weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i];
             }
-            vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6);
+            vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
             p = (Cf / 65535.0);
             imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
         }
@@ -1267,7 +1311,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
 }
 
 void main() {
-    uvec3 pos = gl_GlobalInvocationID + origin;
+    uvec3 pos = gl_GlobalInvocationID;
     pos.x <<= bytes_per_block_log2;
 
     // Read as soon as possible due to its latency
@@ -1282,9 +1326,10 @@ void main() {
     offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
     offset += swizzle;
 
-    const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1.0));
+    const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1));
     uint block_index =
-        pos.z * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;
+        pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x;
+
     current_index = 0;
     bitsread = 0;
     for (int i = 0; i < 16; i++) {
-- 
cgit v1.2.3