aboutsummaryrefslogtreecommitdiff
path: root/src/Ryujinx.Graphics.Texture
diff options
context:
space:
mode:
Diffstat (limited to 'src/Ryujinx.Graphics.Texture')
-rw-r--r--src/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs1621
-rw-r--r--src/Ryujinx.Graphics.Texture/Astc/AstcDecoderException.cs9
-rw-r--r--src/Ryujinx.Graphics.Texture/Astc/AstcPixel.cs68
-rw-r--r--src/Ryujinx.Graphics.Texture/Astc/BitStream128.cs72
-rw-r--r--src/Ryujinx.Graphics.Texture/Astc/Bits.cs66
-rw-r--r--src/Ryujinx.Graphics.Texture/Astc/EndPointSet.cs23
-rw-r--r--src/Ryujinx.Graphics.Texture/Astc/IntegerEncoded.cs345
-rw-r--r--src/Ryujinx.Graphics.Texture/Astc/IntegerSequence.cs31
-rw-r--r--src/Ryujinx.Graphics.Texture/BC6Decoder.cs819
-rw-r--r--src/Ryujinx.Graphics.Texture/BC7Decoder.cs220
-rw-r--r--src/Ryujinx.Graphics.Texture/BCnDecoder.cs894
-rw-r--r--src/Ryujinx.Graphics.Texture/BCnEncoder.cs60
-rw-r--r--src/Ryujinx.Graphics.Texture/BlockLinearConstants.cs10
-rw-r--r--src/Ryujinx.Graphics.Texture/BlockLinearLayout.cs195
-rw-r--r--src/Ryujinx.Graphics.Texture/Bpp12Pixel.cs11
-rw-r--r--src/Ryujinx.Graphics.Texture/ETC2Decoder.cs682
-rw-r--r--src/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs1005
-rw-r--r--src/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs10
-rw-r--r--src/Ryujinx.Graphics.Texture/LayoutConverter.cs591
-rw-r--r--src/Ryujinx.Graphics.Texture/OffsetCalculator.cs141
-rw-r--r--src/Ryujinx.Graphics.Texture/PixelConverter.cs216
-rw-r--r--src/Ryujinx.Graphics.Texture/Region.cs14
-rw-r--r--src/Ryujinx.Graphics.Texture/Ryujinx.Graphics.Texture.csproj11
-rw-r--r--src/Ryujinx.Graphics.Texture/Size.cs16
-rw-r--r--src/Ryujinx.Graphics.Texture/SizeCalculator.cs287
-rw-r--r--src/Ryujinx.Graphics.Texture/SizeInfo.cs119
-rw-r--r--src/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs297
-rw-r--r--src/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs1327
-rw-r--r--src/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs37
-rw-r--r--src/Ryujinx.Graphics.Texture/Utils/Block.cs55
-rw-r--r--src/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs229
-rw-r--r--src/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs84
32 files changed, 9565 insertions, 0 deletions
diff --git a/src/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs b/src/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs
new file mode 100644
index 00000000..08738583
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs
@@ -0,0 +1,1621 @@
+using Ryujinx.Common.Utilities;
+using System;
+using System.Diagnostics;
+using System.Linq;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture.Astc
+{
+ // https://github.com/GammaUNC/FasTC/blob/master/ASTCEncoder/src/Decompressor.cpp
+ public class AstcDecoder
+ {
+ private ReadOnlyMemory<byte> InputBuffer { get; } // Compressed input; ProcessBlock views it as an array of Buffer16 blocks.
+ private Memory<byte> OutputBuffer { get; } // Destination for decoded pixels, written at 4 bytes per pixel.
+
+ private int BlockSizeX { get; } // Block footprint width in pixels (the constructor rejects values above 12).
+ private int BlockSizeY { get; } // Block footprint height in pixels (the constructor rejects values above 12).
+
+ private AstcLevel[] Levels { get; } // One entry per (mip level, layer) pair, stored at index level * layers + layer.
+
+ private bool Success { get; set; } // Starts as true; ProcessBlock clears it when decoding a block throws.
+
+ public int TotalBlockCount { get; } // Sum of the block counts of all Levels entries; block indices run from 0 to TotalBlockCount - 1.
+
+ // Creates a decoder over a compressed input buffer and pre-computes, for every
+ // (mip level, layer) pair, where its blocks start in the input and where its
+ // pixels start in the output.
+ //
+ // inputBuffer:  compressed data, consumed as consecutive blocks.
+ // outputBuffer: receives the decoded image data, 4 bytes per pixel.
+ // blockWidth / blockHeight: block footprint in pixels, each in the range [1, 12].
+ // width / height / depth:   dimensions of mip level 0.
+ // levels / layers:          number of mip levels and array layers.
+ //
+ // Throws ArgumentOutOfRangeException when a block dimension is outside [1, 12].
+ public AstcDecoder(
+     ReadOnlyMemory<byte> inputBuffer,
+     Memory<byte> outputBuffer,
+     int blockWidth,
+     int blockHeight,
+     int width,
+     int height,
+     int depth,
+     int levels,
+     int layers)
+ {
+     // Zero has to be rejected together with negative and oversized values:
+     // the block counts below divide by the block dimensions.
+     if (blockWidth < 1 || blockWidth > 12)
+     {
+         throw new ArgumentOutOfRangeException(nameof(blockWidth));
+     }
+
+     if (blockHeight < 1 || blockHeight > 12)
+     {
+         throw new ArgumentOutOfRangeException(nameof(blockHeight));
+     }
+
+     InputBuffer = inputBuffer;
+     OutputBuffer = outputBuffer;
+
+     BlockSizeX = blockWidth;
+     BlockSizeY = blockHeight;
+
+     Levels = new AstcLevel[levels * layers];
+
+     Success = true;
+
+     TotalBlockCount = 0;
+
+     int currentInputBlock = 0;
+     int currentOutputOffset = 0;
+
+     for (int i = 0; i < levels; i++)
+     {
+         for (int j = 0; j < layers; j++)
+         {
+             ref AstcLevel level = ref Levels[i * layers + j];
+
+             // Each mip level halves every dimension, but never goes below one pixel.
+             level.ImageSizeX = Math.Max(1, width >> i);
+             level.ImageSizeY = Math.Max(1, height >> i);
+             level.ImageSizeZ = Math.Max(1, depth >> i);
+
+             // Round up so partially covered blocks at the right/bottom edge are counted.
+             level.BlockCountX = (level.ImageSizeX + blockWidth - 1) / blockWidth;
+             level.BlockCountY = (level.ImageSizeY + blockHeight - 1) / blockHeight;
+
+             level.StartBlock = currentInputBlock;
+             level.OutputByteOffset = currentOutputOffset;
+
+             currentInputBlock += level.TotalBlockCount;
+             currentOutputOffset += level.PixelCount * 4;
+         }
+     }
+
+     TotalBlockCount = currentInputBlock;
+ }
+
+ // Geometry and buffer placement of one (mip level, layer) image.
+ private struct AstcLevel
+ {
+     // Image dimensions in pixels.
+     public int ImageSizeX { get; set; }
+     public int ImageSizeY { get; set; }
+     public int ImageSizeZ { get; set; }
+
+     // Number of blocks needed to cover the image horizontally / vertically.
+     public int BlockCountX { get; set; }
+     public int BlockCountY { get; set; }
+
+     // Index of this image's first block in the input, and byte offset of its first pixel in the output.
+     public int StartBlock { get; set; }
+     public int OutputByteOffset { get; set; }
+
+     // Blocks in this image: one 2D grid of blocks per depth slice.
+     public int TotalBlockCount
+     {
+         get { return BlockCountX * BlockCountY * ImageSizeZ; }
+     }
+
+     // Pixels in this image across all depth slices.
+     public int PixelCount
+     {
+         get { return ImageSizeX * ImageSizeY * ImageSizeZ; }
+     }
+ }
+
+ // Returns the number of bytes needed to hold the fully decoded texture
+ // (4 bytes per pixel), summed over every mip level and layer.
+ public static int QueryDecompressedSize(int sizeX, int sizeY, int sizeZ, int levelCount, int layerCount)
+ {
+     int totalPixels = 0;
+     int level = 0;
+
+     while (level < levelCount)
+     {
+         // Every mip level halves each dimension, clamped to at least one pixel.
+         int w = Math.Max(1, sizeX >> level);
+         int h = Math.Max(1, sizeY >> level);
+         int d = Math.Max(1, sizeZ >> level);
+
+         totalPixels += w * h * d * layerCount;
+         level++;
+     }
+
+     return totalPixels * 4;
+ }
+
+ // Decodes the compressed block at the given index and copies its pixels into
+ // the output buffer. A decoding failure does not propagate: it only clears
+ // Success, and whatever is in the scratch buffer is still written out.
+ public void ProcessBlock(int index)
+ {
+     ReadOnlySpan<Buffer16> compressedBlocks = MemoryMarshal.Cast<byte, Buffer16>(InputBuffer.Span);
+     Buffer16 compressed = compressedBlocks[index];
+
+     // Scratch space for one decoded block: 144 packed pixels.
+     Span<int> pixels = stackalloc int[144];
+
+     try
+     {
+         DecompressBlock(compressed, pixels, BlockSizeX, BlockSizeY);
+     }
+     catch (Exception)
+     {
+         Success = false;
+     }
+
+     Span<byte> pixelBytes = MemoryMarshal.Cast<int, byte>(pixels);
+
+     AstcLevel level = GetLevelInfo(index);
+     Span<byte> levelOutput = OutputBuffer.Span.Slice(level.OutputByteOffset);
+
+     // Block indices are global; the writer expects one relative to the level's first block.
+     WriteDecompressedBlock(pixelBytes, levelOutput, index - level.StartBlock, level);
+ }
+
+ // Finds the level/layer entry that contains the given global block index.
+ // Entries are scanned in storage order and the first one whose block range
+ // ends after blockIndex is returned.
+ // Throws AstcDecoderException when no entry contains the index.
+ private AstcLevel GetLevelInfo(int blockIndex)
+ {
+     for (int i = 0; i < Levels.Length; i++)
+     {
+         AstcLevel candidate = Levels[i];
+         int endBlock = candidate.StartBlock + candidate.TotalBlockCount;
+
+         if (blockIndex < endBlock)
+         {
+             return candidate;
+         }
+     }
+
+     throw new AstcDecoderException("Invalid block index.");
+ }
+
+ // Copies one decoded block into its place inside a level's output image,
+ // cropping the block where it extends past the right or bottom edge.
+ //
+ // block:        decoded block pixels, 4 bytes each, rows of BlockSizeX pixels.
+ // outputBuffer: output span starting at the first byte of the level.
+ // blockIndex:   index of the block relative to the first block of the level.
+ // level:        geometry of the level being written.
+ private void WriteDecompressedBlock(ReadOnlySpan<byte> block, Span<byte> outputBuffer, int blockIndex, AstcLevel level)
+ {
+     int imagePitch = level.ImageSizeX * 4;
+     int blockPitch = BlockSizeX * 4;
+
+     // Position of the block's top-left pixel inside the image.
+     int startX = (blockIndex % level.BlockCountX) * BlockSizeX;
+     int startY = (blockIndex / level.BlockCountX) * BlockSizeY;
+
+     // Depth slices are stacked vertically, so the usable height is ImageSizeY * ImageSizeZ.
+     int copyWidth = Math.Min(startX + BlockSizeX, level.ImageSizeX) - startX;
+     int copyHeight = Math.Min(startY + BlockSizeY, level.ImageSizeY * level.ImageSizeZ) - startY;
+
+     int rowBytes = copyWidth * 4;
+
+     int source = 0;
+     int destination = startX * 4 + startY * imagePitch;
+
+     for (int row = 0; row < copyHeight; row++)
+     {
+         block.Slice(source, rowBytes).CopyTo(outputBuffer.Slice(destination));
+
+         source += blockPitch;
+         destination += imagePitch;
+     }
+ }
+
+ // Weight-grid description decoded from a block's mode bits.
+ struct TexelWeightParams
+ {
+     public int Width;
+     public int Height;
+     public int MaxWeight;
+     public bool DualPlane;
+     public bool Error;
+     public bool VoidExtentLdr;
+     public bool VoidExtentHdr;
+
+     // Number of bits taken by the encoded weights of this block.
+     public int GetPackedBitSize()
+     {
+         int indexCount = GetNumWeightValues();
+
+         return IntegerEncoded.CreateEncoding(MaxWeight).GetBitLength(indexCount);
+     }
+
+     // Number of stored weight values: one per grid cell, doubled in dual-plane mode.
+     public int GetNumWeightValues()
+     {
+         int cells = Width * Height;
+
+         return DualPlane ? cells * 2 : cells;
+     }
+ }
+
+ // Decodes a whole ASTC texture on the calling thread into a newly allocated
+ // buffer of 4 bytes per pixel.
+ // Returns false when at least one block failed to decode; "decoded" is
+ // assigned in either case.
+ public static bool TryDecodeToRgba8(
+     ReadOnlyMemory<byte> data,
+     int blockWidth,
+     int blockHeight,
+     int width,
+     int height,
+     int depth,
+     int levels,
+     int layers,
+     out Span<byte> decoded)
+ {
+     int outputSize = QueryDecompressedSize(width, height, depth, levels, layers);
+     byte[] rgba = new byte[outputSize];
+
+     AstcDecoder decoder = new AstcDecoder(data, rgba, blockWidth, blockHeight, width, height, depth, levels, layers);
+
+     int blockIndex = 0;
+
+     while (blockIndex < decoder.TotalBlockCount)
+     {
+         decoder.ProcessBlock(blockIndex);
+         blockIndex++;
+     }
+
+     decoded = rgba;
+
+     return decoder.Success;
+ }
+
+ // Decodes a whole ASTC texture on the calling thread into a caller-provided
+ // output buffer (4 bytes per pixel).
+ // Returns false when at least one block failed to decode.
+ public static bool TryDecodeToRgba8(
+     ReadOnlyMemory<byte> data,
+     Memory<byte> outputBuffer,
+     int blockWidth,
+     int blockHeight,
+     int width,
+     int height,
+     int depth,
+     int levels,
+     int layers)
+ {
+     AstcDecoder decoder = new AstcDecoder(data, outputBuffer, blockWidth, blockHeight, width, height, depth, levels, layers);
+
+     int blockIndex = 0;
+
+     while (blockIndex < decoder.TotalBlockCount)
+     {
+         decoder.ProcessBlock(blockIndex);
+         blockIndex++;
+     }
+
+     return decoder.Success;
+ }
+
+ // Parallel variant: decodes a whole ASTC texture into a caller-provided
+ // output buffer, handing block indices to PLINQ.
+ // Returns false when at least one block failed to decode.
+ public static bool TryDecodeToRgba8P(
+     ReadOnlyMemory<byte> data,
+     Memory<byte> outputBuffer,
+     int blockWidth,
+     int blockHeight,
+     int width,
+     int height,
+     int depth,
+     int levels,
+     int layers)
+ {
+     AstcDecoder decoder = new AstcDecoder(data, outputBuffer, blockWidth, blockHeight, width, height, depth, levels, layers);
+
+     // Lazy parallelism: every block index becomes one work item.
+     ParallelQuery<int> blockIndices = Enumerable.Range(0, decoder.TotalBlockCount).AsParallel();
+
+     blockIndices.ForAll(blockIndex => decoder.ProcessBlock(blockIndex));
+
+     return decoder.Success;
+ }
+
+ // Parallel variant: decodes a whole ASTC texture into a newly allocated
+ // byte array of 4 bytes per pixel, handing block indices to PLINQ.
+ // Returns false when at least one block failed to decode; "decoded" is
+ // assigned in either case.
+ public static bool TryDecodeToRgba8P(
+     ReadOnlyMemory<byte> data,
+     int blockWidth,
+     int blockHeight,
+     int width,
+     int height,
+     int depth,
+     int levels,
+     int layers,
+     out byte[] decoded)
+ {
+     int outputSize = QueryDecompressedSize(width, height, depth, levels, layers);
+     byte[] rgba = new byte[outputSize];
+
+     AstcDecoder decoder = new AstcDecoder(data, rgba, blockWidth, blockHeight, width, height, depth, levels, layers);
+
+     ParallelQuery<int> blockIndices = Enumerable.Range(0, decoder.TotalBlockCount).AsParallel();
+
+     blockIndices.ForAll(blockIndex => decoder.ProcessBlock(blockIndex));
+
+     decoded = rgba;
+
+     return decoder.Success;
+ }
+
+ public static bool DecompressBlock( // Decodes one compressed block into packed pixels; throws AstcDecoderException for invalid or unsupported blocks.
+ Buffer16 inputBlock, // The compressed block (bytes 0..15 are read below).
+ Span<int> outputBuffer, // Receives one packed pixel per texel at index y * blockWidth + x.
+ int blockWidth, // Block footprint width in texels.
+ int blockHeight) // Block footprint height in texels.
+ {
+ BitStream128 bitStream = new BitStream128(inputBlock);
+
+ DecodeBlockInfo(ref bitStream, out TexelWeightParams texelParams);
+
+ if (texelParams.Error)
+ {
+ throw new AstcDecoderException("Invalid block mode");
+ }
+
+ if (texelParams.VoidExtentLdr)
+ {
+ FillVoidExtentLdr(ref bitStream, outputBuffer, blockWidth, blockHeight);
+
+ return true; // The void-extent path handles the block on its own; the regular decode below is skipped.
+ }
+
+ if (texelParams.VoidExtentHdr)
+ {
+ throw new AstcDecoderException("HDR void extent blocks are not supported.");
+ }
+
+ if (texelParams.Width > blockWidth) // Only a grid wider than the block is rejected; an equal width is accepted.
+ {
+ throw new AstcDecoderException("Texel weight grid width should be smaller than block width.");
+ }
+
+ if (texelParams.Height > blockHeight) // Only a grid taller than the block is rejected; an equal height is accepted.
+ {
+ throw new AstcDecoderException("Texel weight grid height should be smaller than block height.");
+ }
+
+ // Read num partitions
+ int numberPartitions = bitStream.ReadBits(2) + 1; // Two bits give 0..3, so the count is 1..4.
+ Debug.Assert(numberPartitions <= 4);
+
+ if (numberPartitions == 4 && texelParams.DualPlane)
+ {
+ throw new AstcDecoderException("Dual plane mode is incompatible with four partition blocks.");
+ }
+
+ // Based on the number of partitions, read the color endpoint mode for
+ // each partition.
+
+ // Determine partitions, partition index, and color endpoint modes
+ int planeIndices;
+ int partitionIndex;
+
+ Span<uint> colorEndpointMode = stackalloc uint[4];
+
+ BitStream128 colorEndpointStream = new BitStream128(); // Collects the color endpoint bits copied out of the block below.
+
+ // Read extra config data...
+ uint baseColorEndpointMode = 0;
+
+ if (numberPartitions == 1)
+ {
+ colorEndpointMode[0] = (uint)bitStream.ReadBits(4); // A single partition stores its endpoint mode directly.
+ partitionIndex = 0;
+ }
+ else
+ {
+ partitionIndex = bitStream.ReadBits(10); // Seed later passed to Select2dPartition.
+ baseColorEndpointMode = (uint)bitStream.ReadBits(6);
+ }
+
+ uint baseMode = (baseColorEndpointMode & 3); // Low two bits; zero means all partitions share one mode (see below).
+
+ // Remaining bits are color endpoint data...
+ int numberWeightBits = texelParams.GetPackedBitSize();
+ int remainingBits = bitStream.BitsLeft - numberWeightBits;
+
+ // Consider extra bits prior to texel data...
+ uint extraColorEndpointModeBits = 0;
+
+ if (baseMode != 0)
+ {
+ switch (numberPartitions)
+ {
+ case 2: extraColorEndpointModeBits += 2; break;
+ case 3: extraColorEndpointModeBits += 5; break;
+ case 4: extraColorEndpointModeBits += 8; break;
+ default: Debug.Assert(false); break;
+ }
+ }
+
+ remainingBits -= (int)extraColorEndpointModeBits;
+
+ // Do we have a dual plane situation?
+ int planeSelectorBits = 0;
+
+ if (texelParams.DualPlane)
+ {
+ planeSelectorBits = 2;
+ }
+
+ remainingBits -= planeSelectorBits;
+
+ // Read color data...
+ int colorDataBits = remainingBits; // Saved because the copy loop below consumes remainingBits.
+
+ while (remainingBits > 0)
+ {
+ int numberBits = Math.Min(remainingBits, 8); // Copy at most one byte worth of bits per iteration.
+ int bits = bitStream.ReadBits(numberBits);
+ colorEndpointStream.WriteBits(bits, numberBits);
+ remainingBits -= 8; // May go negative on the final partial chunk, which ends the loop.
+ }
+
+ // Read the plane selection bits
+ planeIndices = bitStream.ReadBits(planeSelectorBits); // Zero bits are requested when the block is not dual plane.
+
+ // Read the rest of the CEM
+ if (baseMode != 0)
+ {
+ uint extraColorEndpointMode = (uint)bitStream.ReadBits((int)extraColorEndpointModeBits);
+ uint tempColorEndpointMode = (extraColorEndpointMode << 6) | baseColorEndpointMode;
+ tempColorEndpointMode >>= 2; // Drop the two base-mode bits already extracted into baseMode.
+
+ Span<bool> c = stackalloc bool[4];
+
+ for (int i = 0; i < numberPartitions; i++)
+ {
+ c[i] = (tempColorEndpointMode & 1) != 0; // One class-select bit per partition.
+ tempColorEndpointMode >>= 1;
+ }
+
+ Span<byte> m = stackalloc byte[4];
+
+ for (int i = 0; i < numberPartitions; i++)
+ {
+ m[i] = (byte)(tempColorEndpointMode & 3); // Two low mode bits per partition.
+ tempColorEndpointMode >>= 2;
+ Debug.Assert(m[i] <= 3);
+ }
+
+ for (int i = 0; i < numberPartitions; i++)
+ {
+ colorEndpointMode[i] = baseMode;
+ if (!(c[i])) colorEndpointMode[i] -= 1; // A clear class bit selects the class one below baseMode.
+ colorEndpointMode[i] <<= 2;
+ colorEndpointMode[i] |= m[i];
+ }
+ }
+ else if (numberPartitions > 1)
+ {
+ uint tempColorEndpointMode = baseColorEndpointMode >> 2; // All partitions share the mode held in the upper four bits.
+
+ for (int i = 0; i < numberPartitions; i++)
+ {
+ colorEndpointMode[i] = tempColorEndpointMode;
+ }
+ }
+
+ // Make sure everything up till here is sane.
+ for (int i = 0; i < numberPartitions; i++)
+ {
+ Debug.Assert(colorEndpointMode[i] < 16);
+ }
+ Debug.Assert(bitStream.BitsLeft == texelParams.GetPackedBitSize());
+
+ // Decode both color data and texel weight data
+ Span<int> colorValues = stackalloc int[32]; // Four values * two endpoints * four maximum partitions
+ DecodeColorValues(colorValues, ref colorEndpointStream, colorEndpointMode, numberPartitions, colorDataBits);
+
+ EndPointSet endPoints;
+ unsafe { _ = &endPoints; } // Skip struct initialization
+
+ int colorValuesPosition = 0; // Cursor into colorValues, advanced by ComputeEndpoints for each partition.
+
+ for (int i = 0; i < numberPartitions; i++)
+ {
+ ComputeEndpoints(endPoints.Get(i), colorValues, colorEndpointMode[i], ref colorValuesPosition);
+ }
+
+ // Read the texel weight data.
+ Buffer16 texelWeightData = inputBlock;
+
+ // Reverse everything
+ for (int i = 0; i < 8; i++) // Swaps byte i with byte 15 - i and bit-reverses both, i.e. reverses all 128 bits.
+ {
+ byte a = ReverseByte(texelWeightData[i]);
+ byte b = ReverseByte(texelWeightData[15 - i]);
+
+ texelWeightData[i] = b;
+ texelWeightData[15 - i] = a;
+ }
+
+ // Make sure that higher non-texel bits are set to zero
+ int clearByteStart = (texelParams.GetPackedBitSize() >> 3) + 1; // First byte that holds no weight bits at all.
+ texelWeightData[clearByteStart - 1] &= (byte)((1 << (texelParams.GetPackedBitSize() % 8)) - 1); // Keep only the weight bits of the last partially used byte.
+
+ int cLen = 16 - clearByteStart;
+ for (int i = clearByteStart; i < clearByteStart + cLen; i++) texelWeightData[i] = 0;
+
+ IntegerSequence texelWeightValues;
+ unsafe { _ = &texelWeightValues; } // Skip struct initialization
+ texelWeightValues.Reset();
+
+ BitStream128 weightBitStream = new BitStream128(texelWeightData);
+
+ IntegerEncoded.DecodeIntegerSequence(ref texelWeightValues, ref weightBitStream, texelParams.MaxWeight, texelParams.GetNumWeightValues());
+
+ // Blocks can be at most 12x12, so we can have as many as 144 weights
+ Weights weights;
+ unsafe { _ = &weights; } // Skip struct initialization
+
+ UnquantizeTexelWeights(ref weights, ref texelWeightValues, ref texelParams, blockWidth, blockHeight);
+
+ ushort[] table = Bits.Replicate8_16Table; // Lookup used below to widen each endpoint component before interpolation.
+
+ // Now that we have endpoints and weights, we can interpolate and generate
+ // the proper decoding...
+ for (int j = 0; j < blockHeight; j++)
+ {
+ for (int i = 0; i < blockWidth; i++)
+ {
+ int partition = Select2dPartition(partitionIndex, i, j, numberPartitions, ((blockHeight * blockWidth) < 32)); // Blocks with fewer than 32 texels are flagged as small.
+ Debug.Assert(partition < numberPartitions);
+
+ AstcPixel pixel = new AstcPixel();
+ for (int component = 0; component < 4; component++)
+ {
+ int component0 = endPoints.Get(partition)[0].GetComponent(component);
+ component0 = table[component0];
+ int component1 = endPoints.Get(partition)[1].GetComponent(component);
+ component1 = table[component1];
+
+ int plane = 0;
+
+ if (texelParams.DualPlane && (((planeIndices + 1) & 3) == component)) // Exactly one component uses the second weight plane.
+ {
+ plane = 1;
+ }
+
+ int weight = weights.Get(plane)[j * blockWidth + i];
+ int finalComponent = (component0 * (64 - weight) + component1 * weight + 32) / 64; // Weighted blend with rounding; the two weights sum to 64.
+
+ if (finalComponent == 65535) // Full scale maps straight to 255.
+ {
+ pixel.SetComponent(component, 255);
+ }
+ else
+ {
+ double finalComponentFloat = finalComponent;
+ pixel.SetComponent(component, (int)(255.0 * (finalComponentFloat / 65536.0) + 0.5)); // Scale down to 8 bits with rounding.
+ }
+ }
+
+ outputBuffer[j * blockWidth + i] = pixel.Pack();
+ }
+ }
+
+ return true; // Every failure path above throws, so reaching this point means success.
+ }
+
+ // Inline storage for two planes of texel weights. A block is at most 12x12
+ // texels, so each plane holds up to 144 weights.
+ [StructLayout(LayoutKind.Sequential, Size = 144 * sizeof(int) * Count)]
+ private struct Weights
+ {
+     // First integer of the storage; the remaining ones follow it thanks to the explicit struct size.
+     private int _start;
+
+     public const int Count = 2;
+
+     // Bounds-checked access to one weight plane.
+     public Span<int> this[int index]
+     {
+         get
+         {
+             if (index < 0 || index >= Count)
+             {
+                 throw new ArgumentOutOfRangeException();
+             }
+
+             return Get(index);
+         }
+     }
+
+     // Unchecked access to one weight plane; the caller must pass an index below Count.
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public Span<int> Get(int index)
+     {
+         return MemoryMarshal.CreateSpan(ref Unsafe.Add(ref _start, index * 144), 144);
+     }
+ }
+
+ // 2D convenience wrapper: selects the partition of texel (x, y) with z fixed at 0.
+ private static int Select2dPartition(int seed, int x, int y, int partitionCount, bool isSmallBlock)
+     => SelectPartition(seed, x, y, 0, partitionCount, isSmallBlock);
+
+ // Returns the partition (0..3) that texel (x, y, z) belongs to, for the given
+ // partition seed and partition count.
+ //
+ // The hash is kept unsigned. With a signed value, the ">> 30" used for seed12
+ // is an arithmetic shift and leaks sign bits into bits 2..3 of that nibble
+ // whenever bit 31 of the hash is set. seed12 only scales z, so 2D results are
+ // unaffected by this correction.
+ private static int SelectPartition(int seed, int x, int y, int z, int partitionCount, bool isSmallBlock)
+ {
+     if (partitionCount == 1)
+     {
+         return 0;
+     }
+
+     // Small blocks double their coordinates before hashing.
+     if (isSmallBlock)
+     {
+         x <<= 1;
+         y <<= 1;
+         z <<= 1;
+     }
+
+     seed += (partitionCount - 1) * 1024;
+
+     uint rightNum = (uint)Hash52((uint)seed);
+
+     // Twelve 4-bit values taken from different positions of the hash.
+     byte seed01 = (byte)(rightNum & 0xF);
+     byte seed02 = (byte)((rightNum >> 4) & 0xF);
+     byte seed03 = (byte)((rightNum >> 8) & 0xF);
+     byte seed04 = (byte)((rightNum >> 12) & 0xF);
+     byte seed05 = (byte)((rightNum >> 16) & 0xF);
+     byte seed06 = (byte)((rightNum >> 20) & 0xF);
+     byte seed07 = (byte)((rightNum >> 24) & 0xF);
+     byte seed08 = (byte)((rightNum >> 28) & 0xF);
+     byte seed09 = (byte)((rightNum >> 18) & 0xF);
+     byte seed10 = (byte)((rightNum >> 22) & 0xF);
+     byte seed11 = (byte)((rightNum >> 26) & 0xF);
+     byte seed12 = (byte)(((rightNum >> 30) | (rightNum << 2)) & 0xF);
+
+     // Square each value (at most 15 * 15 = 225, which still fits in a byte).
+     seed01 *= seed01; seed02 *= seed02;
+     seed03 *= seed03; seed04 *= seed04;
+     seed05 *= seed05; seed06 *= seed06;
+     seed07 *= seed07; seed08 *= seed08;
+     seed09 *= seed09; seed10 *= seed10;
+     seed11 *= seed11; seed12 *= seed12;
+
+     int seedHash1, seedHash2, seedHash3;
+
+     if ((seed & 1) != 0)
+     {
+         seedHash1 = (seed & 2) != 0 ? 4 : 5;
+         seedHash2 = (partitionCount == 3) ? 6 : 5;
+     }
+     else
+     {
+         seedHash1 = (partitionCount == 3) ? 6 : 5;
+         seedHash2 = (seed & 2) != 0 ? 4 : 5;
+     }
+
+     seedHash3 = (seed & 0x10) != 0 ? seedHash1 : seedHash2;
+
+     seed01 >>= seedHash1; seed02 >>= seedHash2; seed03 >>= seedHash1; seed04 >>= seedHash2;
+     seed05 >>= seedHash1; seed06 >>= seedHash2; seed07 >>= seedHash1; seed08 >>= seedHash2;
+     seed09 >>= seedHash3; seed10 >>= seedHash3; seed11 >>= seedHash3; seed12 >>= seedHash3;
+
+     // Only the low six bits of each sum are kept, so the unsigned shifts give
+     // the same a..d as the previous signed shifts did.
+     int a = seed01 * x + seed02 * y + seed11 * z + (int)(rightNum >> 14);
+     int b = seed03 * x + seed04 * y + seed12 * z + (int)(rightNum >> 10);
+     int c = seed05 * x + seed06 * y + seed09 * z + (int)(rightNum >> 6);
+     int d = seed07 * x + seed08 * y + seed10 * z + (int)(rightNum >> 2);
+
+     a &= 0x3F; b &= 0x3F; c &= 0x3F; d &= 0x3F;
+
+     // Partitions that do not exist must never win the comparison below.
+     if (partitionCount < 4) d = 0;
+     if (partitionCount < 3) c = 0;
+
+     if (a >= b && a >= c && a >= d) return 0;
+     else if (b >= c && b >= d) return 1;
+     else if (c >= d) return 2;
+     return 3;
+ }
+
+ // Scrambles a 32-bit value with a fixed sequence of shift/add/xor steps
+ // (unsigned wrap-around arithmetic) and returns the result reinterpreted as int.
+ static int Hash52(uint val)
+ {
+     uint h = val;
+
+     h ^= h >> 15;
+     h -= h << 17;
+     h += h << 7;
+     h += h << 4;
+     h ^= h >> 5;
+     h += h << 16;
+     h ^= h >> 7;
+     h ^= h >> 3;
+     h ^= h << 6;
+     h ^= h >> 17;
+
+     return (int)h;
+ }
+
+ // Converts the decoded weight sequence into one weight per texel of the block.
+ //
+ // Step 1 unquantizes the stored grid weights; in dual-plane mode the values
+ // of the two planes are interleaved in the sequence.
+ // Step 2 ("infill") bilinearly interpolates the weight grid up to the
+ // blockWidth x blockHeight footprint and writes the result to outputBuffer.
+ //
+ // NOTE(review): a block dimension of 1 makes the ds/dt computation divide by
+ // zero; the visible callers are not shown to exclude it, so confirm.
+ static void UnquantizeTexelWeights(
+     ref Weights outputBuffer,
+     ref IntegerSequence weights,
+     ref TexelWeightParams texelParams,
+     int blockWidth,
+     int blockHeight)
+ {
+     int weightIndices = 0;
+     Weights unquantized;
+     unsafe { _ = &unquantized; } // Skip struct initialization
+
+     Span<IntegerEncoded> weightsList = weights.List;
+     Span<int> unquantized0 = unquantized[0];
+     Span<int> unquantized1 = unquantized[1];
+
+     for (int i = 0; i < weightsList.Length; i++)
+     {
+         unquantized0[weightIndices] = UnquantizeTexelWeight(weightsList[i]);
+
+         if (texelParams.DualPlane)
+         {
+             i++;
+
+             // Stop before indexing when the sequence ends on a plane-0 value.
+             // (Checking after the access could never prevent the out-of-range read.)
+             if (i >= weightsList.Length)
+             {
+                 break;
+             }
+
+             unquantized1[weightIndices] = UnquantizeTexelWeight(weightsList[i]);
+         }
+
+         if (++weightIndices >= texelParams.Width * texelParams.Height) break;
+     }
+
+     // Do infill if necessary (Section C.2.18) ...
+     int ds = (1024 + blockWidth / 2) / (blockWidth - 1);
+     int dt = (1024 + blockHeight / 2) / (blockHeight - 1);
+
+     int planeScale = texelParams.DualPlane ? 2 : 1;
+
+     for (int plane = 0; plane < planeScale; plane++)
+     {
+         Span<int> unquantizedSpan = unquantized.Get(plane);
+         Span<int> outputSpan = outputBuffer.Get(plane);
+
+         for (int t = 0; t < blockHeight; t++)
+         {
+             for (int s = 0; s < blockWidth; s++)
+             {
+                 // Texel position scaled to the weight grid, in 1/16 grid steps.
+                 int cs = ds * s;
+                 int ct = dt * t;
+
+                 int gs = (cs * (texelParams.Width - 1) + 32) >> 6;
+                 int gt = (ct * (texelParams.Height - 1) + 32) >> 6;
+
+                 // Integer grid cell (js, jt) and fractional position (fs, ft) inside it.
+                 int js = gs >> 4;
+                 int fs = gs & 0xF;
+
+                 int jt = gt >> 4;
+                 int ft = gt & 0x0F;
+
+                 int w11 = (fs * ft + 8) >> 4;
+
+                 int v0 = js + jt * texelParams.Width;
+
+                 // Start at 8 so the final ">> 4" rounds to nearest.
+                 int weight = 8;
+
+                 int wxh = texelParams.Width * texelParams.Height;
+
+                 // Neighbours outside the grid contribute nothing.
+                 if (v0 < wxh)
+                 {
+                     weight += unquantizedSpan[v0] * (16 - fs - ft + w11);
+
+                     if (v0 + 1 < wxh)
+                     {
+                         weight += unquantizedSpan[v0 + 1] * (fs - w11);
+                     }
+                 }
+
+                 if (v0 + texelParams.Width < wxh)
+                 {
+                     weight += unquantizedSpan[v0 + texelParams.Width] * (ft - w11);
+
+                     if (v0 + texelParams.Width + 1 < wxh)
+                     {
+                         weight += unquantizedSpan[v0 + texelParams.Width + 1] * w11;
+                     }
+                 }
+
+                 outputSpan[t * blockWidth + s] = weight >> 4;
+             }
+         }
+     }
+ }
+
+ static int UnquantizeTexelWeight(IntegerEncoded intEncoded) // Maps one encoded weight to the range [0, 64]; throws AstcDecoderException for unsupported trit/quint bit counts.
+ {
+ int bitValue = intEncoded.BitValue;
+ int bitLength = intEncoded.NumberBits;
+
+ int a = Bits.Replicate1_7(bitValue & 1); // Derived from the lowest bit only; used by the trit/quint path below.
+ int b = 0, c = 0, d = 0;
+
+ int result = 0;
+
+ switch (intEncoded.GetEncoding())
+ {
+ case IntegerEncoded.EIntegerEncoding.JustBits:
+ result = Bits.Replicate(bitValue, bitLength, 6); // Plain bits are expanded to six bits.
+ break;
+
+ case IntegerEncoded.EIntegerEncoding.Trit:
+ {
+ d = intEncoded.TritValue;
+ Debug.Assert(d < 3);
+
+ switch (bitLength)
+ {
+ case 0: // Trit only: the result is taken directly from a small table.
+ {
+ result = d switch
+ {
+ 0 => 0,
+ 1 => 32,
+ 2 => 63,
+ _ => 0
+ };
+
+ break;
+ }
+
+ case 1:
+ {
+ c = 50;
+ break;
+ }
+
+ case 2:
+ {
+ c = 23;
+ int b2 = (bitValue >> 1) & 1;
+ b = (b2 << 6) | (b2 << 2) | b2;
+
+ break;
+ }
+
+ case 3:
+ {
+ c = 11;
+ int cb = (bitValue >> 1) & 3;
+ b = (cb << 5) | cb;
+
+ break;
+ }
+
+ default:
+ throw new AstcDecoderException("Invalid trit encoding for texel weight.");
+ }
+
+ break;
+ }
+
+ case IntegerEncoded.EIntegerEncoding.Quint:
+ {
+ d = intEncoded.QuintValue;
+ Debug.Assert(d < 5);
+
+ switch (bitLength)
+ {
+ case 0: // Quint only: the result is taken directly from a small table.
+ {
+ result = d switch
+ {
+ 0 => 0,
+ 1 => 16,
+ 2 => 32,
+ 3 => 47,
+ 4 => 63,
+ _ => 0
+ };
+
+ break;
+ }
+
+ case 1:
+ {
+ c = 28;
+
+ break;
+ }
+
+ case 2:
+ {
+ c = 13;
+ int b2 = (bitValue >> 1) & 1;
+ b = (b2 << 6) | (b2 << 1);
+
+ break;
+ }
+
+ default:
+ throw new AstcDecoderException("Invalid quint encoding for texel weight.");
+ }
+
+ break;
+ }
+ }
+
+ if (intEncoded.GetEncoding() != IntegerEncoded.EIntegerEncoding.JustBits && bitLength > 0) // Trits/quints with extra bits combine a, b, c and d; the zero-bit cases were resolved by the tables above.
+ {
+ // Decode the value...
+ result = d * c + b;
+ result ^= a;
+ result = (a & 0x20) | (result >> 2);
+ }
+
+ Debug.Assert(result < 64);
+
+ // Change from [0,63] to [0,64]
+ if (result > 32) // Values above 32 are shifted up by one so that 63 becomes 64.
+ {
+ result += 1;
+ }
+
+ return result;
+ }
+
+ // Returns the input byte with its bit order reversed (bit 0 <-> bit 7, bit 1 <-> bit 6, ...).
+ static byte ReverseByte(byte b)
+ {
+     int v = b;
+
+     // Swap progressively smaller groups: nibbles, then bit pairs, then single bits.
+     v = ((v & 0xF0) >> 4) | ((v & 0x0F) << 4);
+     v = ((v & 0xCC) >> 2) | ((v & 0x33) << 2);
+     v = ((v & 0xAA) >> 1) | ((v & 0x55) << 1);
+
+     return (byte)v;
+ }
+
+ // Takes the next "number" color values starting at colorValuesPosition,
+ // advances the position past them, and returns them viewed as unsigned
+ // integers (same memory, no copy).
+ static Span<uint> ReadUintColorValues(int number, Span<int> colorValues, ref int colorValuesPosition)
+ {
+     int first = colorValuesPosition;
+     Span<int> window = colorValues.Slice(first, number);
+
+     colorValuesPosition = first + number;
+
+     return MemoryMarshal.Cast<int, uint>(window);
+ }
+
+ // Takes the next "number" color values starting at colorValuesPosition and
+ // advances the position past them. The returned span aliases colorValues.
+ static Span<int> ReadIntColorValues(int number, Span<int> colorValues, ref int colorValuesPosition)
+ {
+     int first = colorValuesPosition;
+     Span<int> window = colorValues.Slice(first, number);
+
+     colorValuesPosition = first + number;
+
+     return window;
+ }
+
+ // Builds the two color endpoints of one partition from its unquantized color
+ // values, according to the partition's color endpoint mode.
+ //
+ // endPoints:           receives endpoint 0 and endpoint 1 (AstcPixel arguments are A, R, G, B).
+ // colorValues:         unquantized color values of the whole block.
+ // colorEndpointMode:   endpoint mode of this partition; only the modes handled below are supported.
+ // colorValuesPosition: cursor into colorValues, advanced by the number of values the mode consumes.
+ //
+ // Throws AstcDecoderException for any other mode.
+ static void ComputeEndpoints(
+     Span<AstcPixel> endPoints,
+     Span<int> colorValues,
+     uint colorEndpointMode,
+     ref int colorValuesPosition)
+ {
+     switch (colorEndpointMode)
+     {
+         // Luminance, direct.
+         case 0:
+         {
+             Span<uint> val = ReadUintColorValues(2, colorValues, ref colorValuesPosition);
+
+             endPoints[0] = new AstcPixel(0xFF, (short)val[0], (short)val[0], (short)val[0]);
+             endPoints[1] = new AstcPixel(0xFF, (short)val[1], (short)val[1], (short)val[1]);
+
+             break;
+         }
+
+         // Luminance, base + offset.
+         case 1:
+         {
+             Span<uint> val = ReadUintColorValues(2, colorValues, ref colorValuesPosition);
+             int l0 = (int)((val[0] >> 2) | (val[1] & 0xC0));
+
+             // The second luminance is the base plus a 6-bit offset, clamped to 255.
+             // (Math.Max here would always produce at least 255.)
+             int l1 = (int)Math.Min(l0 + (val[1] & 0x3F), 0xFFU);
+
+             endPoints[0] = new AstcPixel(0xFF, (short)l0, (short)l0, (short)l0);
+             endPoints[1] = new AstcPixel(0xFF, (short)l1, (short)l1, (short)l1);
+
+             break;
+         }
+
+         // Luminance + alpha, direct.
+         case 4:
+         {
+             Span<uint> val = ReadUintColorValues(4, colorValues, ref colorValuesPosition);
+
+             endPoints[0] = new AstcPixel((short)val[2], (short)val[0], (short)val[0], (short)val[0]);
+             endPoints[1] = new AstcPixel((short)val[3], (short)val[1], (short)val[1], (short)val[1]);
+
+             break;
+         }
+
+         // Luminance + alpha, base + offset.
+         case 5:
+         {
+             Span<int> val = ReadIntColorValues(4, colorValues, ref colorValuesPosition);
+
+             Bits.BitTransferSigned(ref val[1], ref val[0]);
+             Bits.BitTransferSigned(ref val[3], ref val[2]);
+
+             endPoints[0] = new AstcPixel((short)val[2], (short)val[0], (short)val[0], (short)val[0]);
+             endPoints[1] = new AstcPixel((short)(val[2] + val[3]), (short)(val[0] + val[1]), (short)(val[0] + val[1]), (short)(val[0] + val[1]));
+
+             // Offsets are signed, so the sums may leave the byte range.
+             endPoints[0].ClampByte();
+             endPoints[1].ClampByte();
+
+             break;
+         }
+
+         // RGB, base + scale.
+         case 6:
+         {
+             Span<uint> val = ReadUintColorValues(4, colorValues, ref colorValuesPosition);
+
+             endPoints[0] = new AstcPixel(0xFF, (short)(val[0] * val[3] >> 8), (short)(val[1] * val[3] >> 8), (short)(val[2] * val[3] >> 8));
+             endPoints[1] = new AstcPixel(0xFF, (short)val[0], (short)val[1], (short)val[2]);
+
+             break;
+         }
+
+         // RGB, direct.
+         case 8:
+         {
+             Span<uint> val = ReadUintColorValues(6, colorValues, ref colorValuesPosition);
+
+             if (val[1] + val[3] + val[5] >= val[0] + val[2] + val[4])
+             {
+                 endPoints[0] = new AstcPixel(0xFF, (short)val[0], (short)val[2], (short)val[4]);
+                 endPoints[1] = new AstcPixel(0xFF, (short)val[1], (short)val[3], (short)val[5]);
+             }
+             else
+             {
+                 // Second endpoint is the darker one: swap the endpoints and apply blue contraction.
+                 endPoints[0] = AstcPixel.BlueContract(0xFF, (short)val[1], (short)val[3], (short)val[5]);
+                 endPoints[1] = AstcPixel.BlueContract(0xFF, (short)val[0], (short)val[2], (short)val[4]);
+             }
+
+             break;
+         }
+
+         // RGB, base + offset.
+         case 9:
+         {
+             Span<int> val = ReadIntColorValues(6, colorValues, ref colorValuesPosition);
+
+             Bits.BitTransferSigned(ref val[1], ref val[0]);
+             Bits.BitTransferSigned(ref val[3], ref val[2]);
+             Bits.BitTransferSigned(ref val[5], ref val[4]);
+
+             if (val[1] + val[3] + val[5] >= 0)
+             {
+                 endPoints[0] = new AstcPixel(0xFF, (short)val[0], (short)val[2], (short)val[4]);
+                 endPoints[1] = new AstcPixel(0xFF, (short)(val[0] + val[1]), (short)(val[2] + val[3]), (short)(val[4] + val[5]));
+             }
+             else
+             {
+                 // Negative total offset: swap the endpoints and apply blue contraction.
+                 endPoints[0] = AstcPixel.BlueContract(0xFF, val[0] + val[1], val[2] + val[3], val[4] + val[5]);
+                 endPoints[1] = AstcPixel.BlueContract(0xFF, val[0], val[2], val[4]);
+             }
+
+             endPoints[0].ClampByte();
+             endPoints[1].ClampByte();
+
+             break;
+         }
+
+         // RGB base + scale, plus two alpha values.
+         case 10:
+         {
+             Span<uint> val = ReadUintColorValues(6, colorValues, ref colorValuesPosition);
+
+             endPoints[0] = new AstcPixel((short)val[4], (short)(val[0] * val[3] >> 8), (short)(val[1] * val[3] >> 8), (short)(val[2] * val[3] >> 8));
+             endPoints[1] = new AstcPixel((short)val[5], (short)val[0], (short)val[1], (short)val[2]);
+
+             break;
+         }
+
+         // RGBA, direct.
+         case 12:
+         {
+             Span<uint> val = ReadUintColorValues(8, colorValues, ref colorValuesPosition);
+
+             if (val[1] + val[3] + val[5] >= val[0] + val[2] + val[4])
+             {
+                 endPoints[0] = new AstcPixel((short)val[6], (short)val[0], (short)val[2], (short)val[4]);
+                 endPoints[1] = new AstcPixel((short)val[7], (short)val[1], (short)val[3], (short)val[5]);
+             }
+             else
+             {
+                 endPoints[0] = AstcPixel.BlueContract((short)val[7], (short)val[1], (short)val[3], (short)val[5]);
+                 endPoints[1] = AstcPixel.BlueContract((short)val[6], (short)val[0], (short)val[2], (short)val[4]);
+             }
+
+             break;
+         }
+
+         // RGBA, base + offset.
+         case 13:
+         {
+             Span<int> val = ReadIntColorValues(8, colorValues, ref colorValuesPosition);
+
+             Bits.BitTransferSigned(ref val[1], ref val[0]);
+             Bits.BitTransferSigned(ref val[3], ref val[2]);
+             Bits.BitTransferSigned(ref val[5], ref val[4]);
+             Bits.BitTransferSigned(ref val[7], ref val[6]);
+
+             if (val[1] + val[3] + val[5] >= 0)
+             {
+                 endPoints[0] = new AstcPixel((short)val[6], (short)val[0], (short)val[2], (short)val[4]);
+                 endPoints[1] = new AstcPixel((short)(val[7] + val[6]), (short)(val[0] + val[1]), (short)(val[2] + val[3]), (short)(val[4] + val[5]));
+             }
+             else
+             {
+                 endPoints[0] = AstcPixel.BlueContract(val[6] + val[7], val[0] + val[1], val[2] + val[3], val[4] + val[5]);
+                 endPoints[1] = AstcPixel.BlueContract(val[6], val[0], val[2], val[4]);
+             }
+
+             endPoints[0].ClampByte();
+             endPoints[1].ClampByte();
+
+             break;
+         }
+
+         default:
+             throw new AstcDecoderException("Unsupported color endpoint mode (is it HDR?)");
+     }
+ }
+
+ static void DecodeColorValues( // Decodes the color endpoint bit stream into unquantized color values; throws AstcDecoderException for unsupported trit/quint bit counts.
+ Span<int> outputValues, // Receives the unquantized values in decode order.
+ ref BitStream128 colorBitStream, // Stream holding only the color endpoint bits.
+ Span<uint> modes, // Color endpoint mode of each partition.
+ int numberPartitions, // Number of entries of "modes" that are used.
+ int numberBitsForColorData) // Number of bits available for color data in the block.
+ {
+ // First figure out how many color values we have
+ int numberValues = 0;
+
+ for (int i = 0; i < numberPartitions; i++)
+ {
+ numberValues += (int)((modes[i] >> 2) + 1) << 1; // Each partition contributes 2 * ((mode >> 2) + 1) values.
+ }
+
+ // Then based on the number of values and the remaining number of bits,
+ // figure out the max value for each of them...
+ int range = 256;
+
+ while (--range > 0) // Try ranges from 255 downward until the encoded values fit in the available bits.
+ {
+ IntegerEncoded intEncoded = IntegerEncoded.CreateEncoding(range);
+ int bitLength = intEncoded.GetBitLength(numberValues);
+
+ if (bitLength <= numberBitsForColorData)
+ {
+ // Find the smallest possible range that matches the given encoding
+ while (--range > 0)
+ {
+ IntegerEncoded newIntEncoded = IntegerEncoded.CreateEncoding(range);
+ if (!newIntEncoded.MatchesEncoding(intEncoded))
+ {
+ break;
+ }
+ }
+
+ // Return to last matching range.
+ range++;
+ break;
+ }
+ }
+
+ // We now have enough to decode our integer sequence.
+ IntegerSequence integerEncodedSequence;
+ unsafe { _ = &integerEncodedSequence; } // Skip struct initialization
+ integerEncodedSequence.Reset();
+
+ IntegerEncoded.DecodeIntegerSequence(ref integerEncodedSequence, ref colorBitStream, range, numberValues);
+
+ // Once we have the decoded values, we need to dequantize them to the 0-255 range
+ // This procedure is outlined in ASTC spec C.2.13
+ int outputIndices = 0; // Next free slot in outputValues.
+
+ foreach (ref IntegerEncoded intEncoded in integerEncodedSequence.List)
+ {
+ int bitLength = intEncoded.NumberBits;
+ int bitValue = intEncoded.BitValue;
+
+ Debug.Assert(bitLength >= 1); // Unlike texel weights, color values are expected to carry at least one plain bit.
+
+ int a = 0, b = 0, c = 0, d = 0;
+ // A is just the lsb replicated 9 times.
+ a = Bits.Replicate(bitValue & 1, 1, 9);
+
+ switch (intEncoded.GetEncoding())
+ {
+ case IntegerEncoded.EIntegerEncoding.JustBits:
+ {
+ outputValues[outputIndices++] = Bits.Replicate(bitValue, bitLength, 8); // Plain bits are expanded to eight bits.
+
+ break;
+ }
+
+ case IntegerEncoded.EIntegerEncoding.Trit:
+ {
+ d = intEncoded.TritValue;
+
+ switch (bitLength) // Picks the constant C and the bit pattern B for this bit count.
+ {
+ case 1:
+ {
+ c = 204;
+
+ break;
+ }
+
+ case 2:
+ {
+ c = 93;
+ // B = b000b0bb0
+ int b2 = (bitValue >> 1) & 1;
+ b = (b2 << 8) | (b2 << 4) | (b2 << 2) | (b2 << 1);
+
+ break;
+ }
+
+ case 3:
+ {
+ c = 44;
+ // B = cb000cbcb
+ int cb = (bitValue >> 1) & 3;
+ b = (cb << 7) | (cb << 2) | cb;
+
+ break;
+ }
+
+
+ case 4:
+ {
+ c = 22;
+ // B = dcb000dcb
+ int dcb = (bitValue >> 1) & 7;
+ b = (dcb << 6) | dcb;
+
+ break;
+ }
+
+ case 5:
+ {
+ c = 11;
+ // B = edcb000ed
+ int edcb = (bitValue >> 1) & 0xF;
+ b = (edcb << 5) | (edcb >> 2);
+
+ break;
+ }
+
+ case 6:
+ {
+ c = 5;
+ // B = fedcb000f
+ int fedcb = (bitValue >> 1) & 0x1F;
+ b = (fedcb << 4) | (fedcb >> 4);
+
+ break;
+ }
+
+ default:
+ throw new AstcDecoderException("Unsupported trit encoding for color values.");
+ }
+
+ break;
+ }
+
+ case IntegerEncoded.EIntegerEncoding.Quint:
+ {
+ d = intEncoded.QuintValue;
+
+ switch (bitLength) // Picks the constant C and the bit pattern B for this bit count.
+ {
+ case 1:
+ {
+ c = 113;
+
+ break;
+ }
+
+ case 2:
+ {
+ c = 54;
+ // B = b0000bb00
+ int b2 = (bitValue >> 1) & 1;
+ b = (b2 << 8) | (b2 << 3) | (b2 << 2);
+
+ break;
+ }
+
+ case 3:
+ {
+ c = 26;
+ // B = cb0000cbc
+ int cb = (bitValue >> 1) & 3;
+ b = (cb << 7) | (cb << 1) | (cb >> 1);
+
+ break;
+ }
+
+ case 4:
+ {
+ c = 13;
+ // B = dcb0000dc
+ int dcb = (bitValue >> 1) & 7;
+ b = (dcb << 6) | (dcb >> 1);
+
+ break;
+ }
+
+ case 5:
+ {
+ c = 6;
+ // B = edcb0000e
+ int edcb = (bitValue >> 1) & 0xF;
+ b = (edcb << 5) | (edcb >> 3);
+
+ break;
+ }
+
+ default:
+ throw new AstcDecoderException("Unsupported quint encoding for color values.");
+ }
+ break;
+ }
+ }
+
+ if (intEncoded.GetEncoding() != IntegerEncoded.EIntegerEncoding.JustBits) // Trit and quint values share this final combination step.
+ {
+ int T = d * c + b;
+ T ^= a;
+ T = (a & 0x80) | (T >> 2);
+
+ outputValues[outputIndices++] = T;
+ }
+ }
+
+ // Make sure that each of our values is in the proper range...
+ for (int i = 0; i < numberValues; i++)
+ {
+ Debug.Assert(outputValues[i] <= 255);
+ }
+ }
+
+ // Fills a block with the single color stored in an LDR void-extent block.
+ // The four 13-bit extent coordinates are consumed and discarded. Each of the
+ // four 16-bit channels that follow is reduced to its high byte, and the bytes
+ // are packed with R lowest and A highest. The packed value is written to the
+ // first blockWidth * blockHeight entries of outputBuffer, row by row.
+ static void FillVoidExtentLdr(ref BitStream128 bitStream, Span<int> outputBuffer, int blockWidth, int blockHeight)
+ {
+     // The extent itself is irrelevant here, but its bits still have to be skipped.
+     for (int skipped = 0; skipped < 4; skipped++)
+     {
+         bitStream.ReadBits(13);
+     }
+
+     ushort red = (ushort)bitStream.ReadBits(16);
+     ushort green = (ushort)bitStream.ReadBits(16);
+     ushort blue = (ushort)bitStream.ReadBits(16);
+     ushort alpha = (ushort)bitStream.ReadBits(16);
+
+     // Keep the top 8 bits of every channel and move them into place.
+     int packed = red >> 8;
+     packed |= green & 0xFF00;
+     packed |= (blue & 0xFF00) << 8;
+     packed |= (alpha & 0xFF00) << 16;
+
+     int texel = 0;
+
+     for (int row = 0; row < blockHeight; row++)
+     {
+         for (int column = 0; column < blockWidth; column++)
+         {
+             outputBuffer[texel++] = packed;
+         }
+     }
+ }
+
+ // Parses the block mode at the start of an ASTC block into texelParams.
+ //
+ // Reads 11 bits from bitStream (and one more for void-extent blocks) and sets:
+ //  - VoidExtentHdr or VoidExtentLdr for void-extent blocks,
+ //  - Error for reserved encodings,
+ //  - Width, Height, MaxWeight and DualPlane for every other block.
+ // Layout numbers 0-9 follow the rows of table C.2.8 of the ASTC spec.
+ static void DecodeBlockInfo(ref BitStream128 bitStream, out TexelWeightParams texelParams)
+ {
+     texelParams = new TexelWeightParams();
+
+     // The whole block mode is read in one go.
+     int modeBits = (ushort)bitStream.ReadBits(11);
+
+     // Void-extent blocks have their low nine bits equal to 0x1FC.
+     if ((modeBits & 0x01FF) == 0x1FC)
+     {
+         if ((modeBits & 0x200) == 0)
+         {
+             texelParams.VoidExtentLdr = true;
+         }
+         else
+         {
+             texelParams.VoidExtentHdr = true;
+         }
+
+         // Bit 10 and the stream bit after it must both be one. The extra bit
+         // is only consumed when bit 10 is set.
+         bool reservedBitsSet = (modeBits & 0x400) != 0 && bitStream.ReadBits(1) != 0;
+
+         if (!reservedBitsSet)
+         {
+             texelParams.Error = true;
+         }
+
+         return;
+     }
+
+     bool lowTwoBitsClear = (modeBits & 0x3) == 0;
+
+     // Reserved: low four bits all zero, or low two bits zero with bits [6-8] all one.
+     if ((modeBits & 0xF) == 0 || (lowTwoBitsClear && (modeBits & 0x1C0) == 0x1C0))
+     {
+         texelParams.Error = true;
+
+         return;
+     }
+
+     bool bit8 = (modeBits & 0x100) != 0;
+     int layout;
+
+     if (!lowTwoBitsClear)
+     {
+         // Layouts 0-4 are selected by bits [2-3], with bit 8 as tie breaker.
+         switch ((modeBits >> 2) & 0x3)
+         {
+             case 0:
+                 layout = 0;
+                 break;
+             case 1:
+                 layout = 1;
+                 break;
+             case 2:
+                 layout = 2;
+                 break;
+             default:
+                 layout = bit8 ? 4 : 3;
+                 break;
+         }
+     }
+     else
+     {
+         // Layouts 5-9 are selected by bits [7-8], with bit 5 as tie breaker.
+         switch ((modeBits >> 7) & 0x3)
+         {
+             case 0:
+                 layout = 5;
+                 break;
+             case 1:
+                 layout = 6;
+                 break;
+             case 2:
+                 layout = 9;
+                 break;
+             default:
+                 Debug.Assert((modeBits & 0x40) == 0);
+                 layout = (modeBits & 0x20) != 0 ? 8 : 7;
+                 break;
+         }
+     }
+
+     Debug.Assert(layout < 10);
+
+     // R: bit 4 is the low bit; the upper two bits come from bits [0-1] for
+     // layouts 0-4 and from bits [2-3] for layouts 5-9.
+     int upperR = layout < 5 ? (modeBits & 0x3) << 1 : (modeBits & 0xC) >> 1;
+     int r = ((modeBits >> 4) & 1) | upperR;
+
+     Debug.Assert(2 <= r && r <= 7);
+
+     // Raw size fields; which ones apply (and how wide B is) depends on the layout.
+     int a = (modeBits >> 5) & 0x3;
+     int b2 = (modeBits >> 7) & 0x3;
+     int b1 = (modeBits >> 7) & 0x1;
+
+     switch (layout)
+     {
+         case 0:
+             texelParams.Width = b2 + 4;
+             texelParams.Height = a + 2;
+             break;
+
+         case 1:
+             texelParams.Width = b2 + 8;
+             texelParams.Height = a + 2;
+             break;
+
+         case 2:
+             texelParams.Width = a + 2;
+             texelParams.Height = b2 + 8;
+             break;
+
+         case 3:
+             texelParams.Width = a + 2;
+             texelParams.Height = b1 + 6;
+             break;
+
+         case 4:
+             texelParams.Width = b1 + 2;
+             texelParams.Height = a + 2;
+             break;
+
+         case 5:
+             texelParams.Width = 12;
+             texelParams.Height = a + 2;
+             break;
+
+         case 6:
+             texelParams.Width = a + 2;
+             texelParams.Height = 12;
+             break;
+
+         case 7:
+             texelParams.Width = 6;
+             texelParams.Height = 10;
+             break;
+
+         case 8:
+             texelParams.Width = 10;
+             texelParams.Height = 6;
+             break;
+
+         case 9:
+             // Layout 9 takes B from bits [9-10] instead of bits [7-8].
+             texelParams.Width = a + 6;
+             texelParams.Height = ((modeBits >> 9) & 0x3) + 6;
+             break;
+
+         default:
+             // Don't know this layout...
+             texelParams.Error = true;
+             break;
+     }
+
+     // Layout 9 reuses bits 9 and 10 for its size, so it never has
+     // dual planes or the high precision weight range.
+     bool usesFlagBits = layout != 9;
+     bool dualPlane = usesFlagBits && (modeBits & 0x400) != 0;
+     bool highPrecision = usesFlagBits && (modeBits & 0x200) != 0;
+
+     ReadOnlySpan<byte> lowPrecisionMax = new byte[] { 1, 2, 3, 4, 5, 7 };
+     ReadOnlySpan<byte> highPrecisionMax = new byte[] { 9, 11, 15, 19, 23, 31 };
+
+     texelParams.MaxWeight = (highPrecision ? highPrecisionMax : lowPrecisionMax)[r - 2];
+     texelParams.DualPlane = dualPlane;
+ }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Astc/AstcDecoderException.cs b/src/Ryujinx.Graphics.Texture/Astc/AstcDecoderException.cs
new file mode 100644
index 00000000..fdc48267
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Astc/AstcDecoderException.cs
@@ -0,0 +1,9 @@
+using System;
+
+namespace Ryujinx.Graphics.Texture.Astc
+{
+ // Exception type used to report ASTC data that cannot be decoded.
+ // It adds nothing to Exception beyond the message passed in.
+ public class AstcDecoderException : Exception
+ {
+     // exMsg: text describing why decoding failed; forwarded to Exception.
+     public AstcDecoderException(string exMsg)
+         : base(exMsg)
+     {
+     }
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/Astc/AstcPixel.cs b/src/Ryujinx.Graphics.Texture/Astc/AstcPixel.cs
new file mode 100644
index 00000000..13197714
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Astc/AstcPixel.cs
@@ -0,0 +1,68 @@
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture.Astc
+{
+ // A pixel with four 16-bit channels, laid out sequentially so that the
+ // channels can be addressed by index (0 = A, 1 = R, 2 = G, 3 = B).
+ [StructLayout(LayoutKind.Sequential)]
+ struct AstcPixel
+ {
+     // Four 2-byte channels followed by the 4-byte bit depth word.
+     internal const int StructSize = 12;
+
+     public short A;
+     public short R;
+     public short G;
+     public short B;
+
+     private uint _bitDepthInt;
+
+     // Byte view over _bitDepthInt (one byte per channel).
+     private Span<byte> BitDepth => MemoryMarshal.CreateSpan(ref Unsafe.As<uint, byte>(ref _bitDepthInt), 4);
+
+     // Indexed view over the four channel fields, starting at A.
+     private Span<short> Components => MemoryMarshal.CreateSpan(ref A, 4);
+
+     // Creates a pixel from its channels; every bit depth byte is set to 8.
+     public AstcPixel(short a, short r, short g, short b)
+     {
+         _bitDepthInt = 0x08080808;
+
+         A = a;
+         R = r;
+         G = g;
+         B = b;
+     }
+
+     // Limits every channel to the 0-255 range.
+     public void ClampByte()
+     {
+         A = ToByteRange(A);
+         R = ToByteRange(R);
+         G = ToByteRange(G);
+         B = ToByteRange(B);
+     }
+
+     // Returns the channel at index (0 = A, 1 = R, 2 = G, 3 = B).
+     public short GetComponent(int index) => Components[index];
+
+     // Stores value, truncated to 16 bits, in the channel at index.
+     public void SetComponent(int index, int value) => Components[index] = (short)value;
+
+     // Packs the channels into one int: R in bits 0-7, G in 8-15, B in 16-23
+     // and A in 24-31. Channels are not masked, so they are expected to
+     // already fit in a byte.
+     public int Pack()
+     {
+         int packed = R;
+         packed |= G << 8;
+         packed |= B << 16;
+         packed |= A << 24;
+
+         return packed;
+     }
+
+     // Adds more precision to the blue channel as described
+     // in C.2.14: red and green are averaged with blue.
+     public static AstcPixel BlueContract(int a, int r, int g, int b)
+     {
+         short red = (short)((r + b) >> 1);
+         short green = (short)((g + b) >> 1);
+
+         return new AstcPixel((short)a, red, green, (short)b);
+     }
+
+     private static short ToByteRange(short value)
+     {
+         if (value < 0)
+         {
+             return 0;
+         }
+
+         return value > 255 ? (short)255 : value;
+     }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Astc/BitStream128.cs b/src/Ryujinx.Graphics.Texture/Astc/BitStream128.cs
new file mode 100644
index 00000000..3bf9769f
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Astc/BitStream128.cs
@@ -0,0 +1,72 @@
+using Ryujinx.Common.Utilities;
+using System;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Texture.Astc
+{
+ // Bit stream over a 16-byte buffer, handled as two 64-bit words.
+ // Reading takes bits from the bottom of the buffer and shifts the rest down.
+ // Writing ORs bits into the buffer at bit position BitsLeft.
+ public struct BitStream128
+ {
+     private Buffer16 _data;
+
+     // Decremented by every read and incremented by every write.
+     public int BitsLeft { get; set; }
+
+     // Wraps data and starts with BitsLeft at 128.
+     public BitStream128(Buffer16 data)
+     {
+         _data = data;
+         BitsLeft = 128;
+     }
+
+     // Removes the lowest bitCount bits (fewer than 32) from the stream and returns them.
+     public int ReadBits(int bitCount)
+     {
+         Debug.Assert(bitCount < 32);
+
+         if (bitCount == 0)
+         {
+             return 0;
+         }
+
+         int result = _data.As<int>() & ((1 << bitCount) - 1);
+
+         Span<ulong> words = _data.AsSpan<ulong>();
+
+         // Bits leaving the bottom of the upper word enter the top of the lower word.
+         ulong movedDown = words[1] << (64 - bitCount);
+
+         words[0] = (words[0] >> bitCount) | movedDown;
+         words[1] >>= bitCount;
+
+         BitsLeft -= bitCount;
+
+         return result;
+     }
+
+     // ORs the lowest bitCount bits (fewer than 32) of value into the stream
+     // at bit position BitsLeft, then advances BitsLeft.
+     public void WriteBits(int value, int bitCount)
+     {
+         Debug.Assert(bitCount < 32);
+
+         if (bitCount == 0)
+         {
+             return;
+         }
+
+         int position = BitsLeft;
+         ulong bits = (uint)(value & ((1 << bitCount) - 1));
+
+         Span<ulong> words = _data.AsSpan<ulong>();
+
+         if (position < 64)
+         {
+             // Part (or all) of the value lands in the lower word.
+             words[0] |= bits << position;
+         }
+
+         if (position + bitCount > 64)
+         {
+             // Part (or all) of the value lands in the upper word.
+             words[1] |= position > 64
+                 ? bits << (position - 64)
+                 : bits >> (64 - position);
+         }
+
+         BitsLeft = position + bitCount;
+     }
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/Astc/Bits.cs b/src/Ryujinx.Graphics.Texture/Astc/Bits.cs
new file mode 100644
index 00000000..b140a20a
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Astc/Bits.cs
@@ -0,0 +1,66 @@
+namespace Ryujinx.Graphics.Texture.Astc
+{
+ // Bit manipulation helpers used by the ASTC decoder.
+ internal static class Bits
+ {
+     // Precomputed Replicate(i, 8, 16) and Replicate(i, 1, 7) for i in 0..0x1FF.
+     public static readonly ushort[] Replicate8_16Table;
+     public static readonly byte[] Replicate1_7Table;
+
+     static Bits()
+     {
+         const int TableSize = 0x200;
+
+         ushort[] wide = new ushort[TableSize];
+         byte[] narrow = new byte[TableSize];
+
+         for (int value = 0; value < TableSize; value++)
+         {
+             wide[value] = (ushort)Replicate(value, 8, 16);
+             narrow[value] = (byte)Replicate(value, 1, 7);
+         }
+
+         Replicate8_16Table = wide;
+         Replicate1_7Table = narrow;
+     }
+
+     // Table lookup for Replicate(value, 8, 16); value must be below 0x200.
+     public static int Replicate8_16(int value) => Replicate8_16Table[value];
+
+     // Table lookup for Replicate(value, 1, 7); value must be below 0x200.
+     public static int Replicate1_7(int value) => Replicate1_7Table[value];
+
+     // Repeats the low numberBits bits of value until toBit bits are filled,
+     // starting from the most significant end. The last repetition is cut
+     // short, keeping its upper bits, when it does not fit. Returns 0 when
+     // either count is 0.
+     public static int Replicate(int value, int numberBits, int toBit)
+     {
+         if (numberBits == 0 || toBit == 0)
+         {
+             return 0;
+         }
+
+         int pattern = value & ((1 << numberBits) - 1);
+         int result = pattern;
+         int chunk = numberBits;
+
+         for (int produced = numberBits; produced < toBit; produced += chunk)
+         {
+             int remaining = toBit - produced;
+             int dropped = 0;
+
+             if (chunk > remaining)
+             {
+                 // Only the top 'remaining' bits of the pattern still fit.
+                 dropped = chunk - remaining;
+                 chunk = remaining;
+             }
+
+             result = (result << chunk) | (pattern >> dropped);
+         }
+
+         return result;
+     }
+
+     // Transfers a bit as described in C.2.14: bit 7 of a moves into b
+     // (which is halved first), and what is left of a becomes a signed
+     // 6-bit value.
+     public static void BitTransferSigned(ref int a, ref int b)
+     {
+         b = (b >> 1) | (a & 0x80);
+         a = (a >> 1) & 0x3F;
+
+         if ((a & 0x20) != 0)
+         {
+             a -= 0x40;
+         }
+     }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Astc/EndPointSet.cs b/src/Ryujinx.Graphics.Texture/Astc/EndPointSet.cs
new file mode 100644
index 00000000..45e61ca2
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Astc/EndPointSet.cs
@@ -0,0 +1,23 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture.Astc
+{
+ // Inline storage sized for eight AstcPixel values, handed out as pairs.
+ [StructLayout(LayoutKind.Sequential, Size = AstcPixel.StructSize * 8)]
+ internal struct EndPointSet
+ {
+     // First pixel of the storage; the remaining ones live in the padding
+     // reserved by the explicit struct size.
+     private AstcPixel _start;
+
+     // Returns the two pixels of pair number index (0 to 3).
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public Span<AstcPixel> Get(int index)
+     {
+         Debug.Assert(index < 4);
+
+         return MemoryMarshal.CreateSpan(ref Unsafe.Add(ref _start, index * 2), 2);
+     }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Astc/IntegerEncoded.cs b/src/Ryujinx.Graphics.Texture/Astc/IntegerEncoded.cs
new file mode 100644
index 00000000..065de46b
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Astc/IntegerEncoded.cs
@@ -0,0 +1,345 @@
+using System;
+using System.Numerics;
+
+namespace Ryujinx.Graphics.Texture.Astc
+{
+ internal struct IntegerEncoded
+ {
+ // Size of this struct in bytes, used to size IntegerSequence's inline storage.
+ internal const int StructSize = 8;
+ // Encoding for each maximum value 0..255, filled in by the static constructor.
+ private static readonly IntegerEncoded[] Encodings;
+
+ // How a value is stored: plain bits only, or bits combined with a quint or a trit.
+ public enum EIntegerEncoding : byte
+ {
+ JustBits,
+ Quint,
+ Trit
+ }
+
+ EIntegerEncoding _encoding;
+ // Number of plain bits stored per value.
+ public byte NumberBits { get; private set; }
+ // Trit part of the value; only set by DecodeTritBlock.
+ public byte TritValue { get; private set; }
+ // Quint part of the value; only set by DecodeQuintBlock.
+ public byte QuintValue { get; private set; }
+ // Plain-bit part of the value as read from the stream.
+ public int BitValue { get; private set; }
+
+ // Precomputes the encoding of every maximum value from 0 to 255 so that
+ // CreateEncoding is a plain table lookup.
+ static IntegerEncoded()
+ {
+     IntegerEncoded[] table = new IntegerEncoded[0x100];
+
+     for (int maxVal = 0; maxVal < table.Length; maxVal++)
+     {
+         table[maxVal] = CreateEncodingCalc(maxVal);
+     }
+
+     Encodings = table;
+ }
+
+ // Creates an encoding descriptor with the given kind and bit count.
+ // numBits is truncated to a byte; all value fields start at zero.
+ public IntegerEncoded(EIntegerEncoding encoding, int numBits)
+ {
+     BitValue = 0;
+     TritValue = 0;
+     QuintValue = 0;
+
+     NumberBits = (byte)numBits;
+     _encoding = encoding;
+ }
+
+ // True when other uses the same encoding kind and the same number of bits.
+ // The decoded values themselves are not compared.
+ public bool MatchesEncoding(IntegerEncoded other) =>
+     NumberBits == other.NumberBits && _encoding == other._encoding;
+
+ // Returns the encoding kind of this value.
+ public EIntegerEncoding GetEncoding() => _encoding;
+
+ // Returns how many bits numberVals values take with this encoding:
+ // NumberBits per value, plus 8 bits per 5 values for trits or 7 bits
+ // per 3 values for quints (both rounded up).
+ public int GetBitLength(int numberVals)
+ {
+     int sharedBits;
+
+     switch (_encoding)
+     {
+         case EIntegerEncoding.Trit:
+             sharedBits = (numberVals * 8 + 4) / 5;
+             break;
+
+         case EIntegerEncoding.Quint:
+             sharedBits = (numberVals * 7 + 2) / 3;
+             break;
+
+         default:
+             sharedBits = 0;
+             break;
+     }
+
+     return NumberBits * numberVals + sharedBits;
+ }
+
+ // Returns the precomputed encoding for values up to maxVal.
+ // maxVal indexes the 256-entry table, so it must be in 0..255.
+ public static IntegerEncoded CreateEncoding(int maxVal) => Encodings[maxVal];
+
+ // Finds the encoding for the largest representable maximum that does not
+ // exceed maxVal. A maximum is representable when the number of values
+ // (maximum + 1) is 2^n, 3 * 2^n or 5 * 2^n, with n the plain bit count.
+ // Falls back to zero plain bits when maxVal is 0 or negative.
+ private static IntegerEncoded CreateEncodingCalc(int maxVal)
+ {
+     for (int candidate = maxVal; candidate > 0; candidate--)
+     {
+         int valueCount = candidate + 1;
+
+         // valueCount == 2^n: plain bits are enough.
+         if ((valueCount & (valueCount - 1)) == 0)
+         {
+             int bits = BitOperations.PopCount((uint)candidate);
+
+             return new IntegerEncoded(EIntegerEncoding.JustBits, bits);
+         }
+
+         // valueCount == 3 * 2^n: one trit plus n bits.
+         int third = valueCount / 3;
+
+         if (valueCount % 3 == 0 && (third & (third - 1)) == 0)
+         {
+             int bits = BitOperations.PopCount((uint)(third - 1));
+
+             return new IntegerEncoded(EIntegerEncoding.Trit, bits);
+         }
+
+         // valueCount == 5 * 2^n: one quint plus n bits.
+         int fifth = valueCount / 5;
+
+         if (valueCount % 5 == 0 && (fifth & (fifth - 1)) == 0)
+         {
+             int bits = BitOperations.PopCount((uint)(fifth - 1));
+
+             return new IntegerEncoded(EIntegerEncoding.Quint, bits);
+         }
+
+         // Not representable; try the next smaller maximum.
+     }
+
+     return new IntegerEncoded(EIntegerEncoding.JustBits, 0);
+ }
+
+ // Reads one block of five trit-encoded values (section C.2.12) and appends
+ // them to listIntegerEncoded. Every value is numberBitsPerValue plain bits,
+ // and each one is followed by a piece of the 8-bit packed trit field.
+ public static void DecodeTritBlock(
+     ref BitStream128 bitStream,
+     ref IntegerSequence listIntegerEncoded,
+     int numberBitsPerValue)
+ {
+     // Size of the packed trit piece that follows each of the five values.
+     ReadOnlySpan<byte> interleavedBits = new byte[] { 2, 2, 1, 2, 1 };
+
+     Span<int> values = stackalloc int[5];
+     int packedTrits = 0;
+     int packedBitsRead = 0;
+
+     for (int i = 0; i < values.Length; i++)
+     {
+         values[i] = bitStream.ReadBits(numberBitsPerValue);
+
+         int piece = bitStream.ReadBits(interleavedBits[i]);
+
+         packedTrits |= piece << packedBitsRead;
+         packedBitsRead += interleavedBits[i];
+     }
+
+     ReadOnlySpan<byte> trits = GetTritEncoding(packedTrits);
+
+     for (int i = 0; i < 5; i++)
+     {
+         IntegerEncoded decoded = new IntegerEncoded(EIntegerEncoding.Trit, numberBitsPerValue)
+         {
+             BitValue = values[i],
+             TritValue = trits[i]
+         };
+
+         listIntegerEncoded.Add(ref decoded);
+     }
+ }
+
+ // Reads one block of three quint-encoded values (section C.2.12) and appends
+ // them to listIntegerEncoded. Every value is numberBitsPerValue plain bits,
+ // and each one is followed by a piece (3, 2 and 2 bits) of the 7-bit packed
+ // quint field.
+ public static void DecodeQuintBlock(
+     ref BitStream128 bitStream,
+     ref IntegerSequence listIntegerEncoded,
+     int numberBitsPerValue)
+ {
+     Span<int> values = stackalloc int[3];
+
+     values[0] = bitStream.ReadBits(numberBitsPerValue);
+     int packedQuints = bitStream.ReadBits(3);
+     values[1] = bitStream.ReadBits(numberBitsPerValue);
+     packedQuints |= bitStream.ReadBits(2) << 3;
+     values[2] = bitStream.ReadBits(numberBitsPerValue);
+     packedQuints |= bitStream.ReadBits(2) << 5;
+
+     ReadOnlySpan<byte> quints = GetQuintEncoding(packedQuints);
+
+     // The list stores copies, so a single instance can be reused.
+     IntegerEncoded decoded = new IntegerEncoded(EIntegerEncoding.Quint, numberBitsPerValue);
+
+     for (int i = 0; i < values.Length; i++)
+     {
+         decoded.BitValue = values[i];
+         decoded.QuintValue = quints[i];
+
+         listIntegerEncoded.Add(ref decoded);
+     }
+ }
+
+ // Decodes at least numberValues values from bitStream into
+ // decodeIntegerSequence, using the encoding selected by maxRange.
+ // Trits and quints are read in whole blocks of 5 and 3 values, so more
+ // than numberValues entries may be appended.
+ public static void DecodeIntegerSequence(
+     ref IntegerSequence decodeIntegerSequence,
+     ref BitStream128 bitStream,
+     int maxRange,
+     int numberValues)
+ {
+     // Determine encoding parameters
+     IntegerEncoded template = CreateEncoding(maxRange);
+     EIntegerEncoding kind = template.GetEncoding();
+
+     int decodedCount = 0;
+
+     while (decodedCount < numberValues)
+     {
+         if (kind == EIntegerEncoding.Quint)
+         {
+             DecodeQuintBlock(ref bitStream, ref decodeIntegerSequence, template.NumberBits);
+             decodedCount += 3;
+         }
+         else if (kind == EIntegerEncoding.Trit)
+         {
+             DecodeTritBlock(ref bitStream, ref decodeIntegerSequence, template.NumberBits);
+             decodedCount += 5;
+         }
+         else if (kind == EIntegerEncoding.JustBits)
+         {
+             // Plain values are read one at a time; the list stores a copy.
+             template.BitValue = bitStream.ReadBits(template.NumberBits);
+             decodeIntegerSequence.Add(ref template);
+             decodedCount++;
+         }
+     }
+ }
+
+ // Returns the five trits stored for the packed value index.
+ private static ReadOnlySpan<byte> GetTritEncoding(int index) => TritEncodings.Slice(index * 5, 5);
+
+ // Returns the three quints stored for the packed value index.
+ private static ReadOnlySpan<byte> GetQuintEncoding(int index) => QuintEncodings.Slice(index * 3, 3);
+
+ private static ReadOnlySpan<byte> TritEncodings => new byte[]
+ {
+ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0,
+ 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
+ 2, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0,
+ 1, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 0, 0,
+ 0, 2, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0, 0,
+ 2, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
+ 2, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 1, 0, 0,
+ 1, 1, 1, 0, 0, 2, 1, 1, 0, 0, 1, 1, 2, 0, 0,
+ 0, 2, 1, 0, 0, 1, 2, 1, 0, 0, 2, 2, 1, 0, 0,
+ 2, 1, 2, 0, 0, 0, 0, 0, 2, 2, 1, 0, 0, 2, 2,
+ 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0,
+ 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 0, 2, 1, 0,
+ 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 2, 1, 0, 1, 0,
+ 1, 0, 2, 1, 0, 0, 2, 0, 1, 0, 1, 2, 0, 1, 0,
+ 2, 2, 0, 1, 0, 2, 0, 2, 1, 0, 0, 2, 2, 1, 0,
+ 1, 2, 2, 1, 0, 2, 2, 2, 1, 0, 2, 0, 2, 1, 0,
+ 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 2, 0, 1, 1, 0,
+ 0, 1, 2, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
+ 2, 1, 1, 1, 0, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0,
+ 1, 2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 1, 2, 1, 0,
+ 0, 1, 0, 2, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2,
+ 1, 0, 2, 2, 2, 0, 0, 0, 2, 0, 1, 0, 0, 2, 0,
+ 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 1, 0, 2, 0,
+ 1, 1, 0, 2, 0, 2, 1, 0, 2, 0, 1, 0, 2, 2, 0,
+ 0, 2, 0, 2, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2, 0,
+ 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 1, 2, 2, 2, 0,
+ 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 0, 1, 2, 0,
+ 1, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0, 1, 2, 2, 0,
+ 0, 1, 1, 2, 0, 1, 1, 1, 2, 0, 2, 1, 1, 2, 0,
+ 1, 1, 2, 2, 0, 0, 2, 1, 2, 0, 1, 2, 1, 2, 0,
+ 2, 2, 1, 2, 0, 2, 1, 2, 2, 0, 0, 2, 0, 2, 2,
+ 1, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2,
+ 0, 0, 0, 0, 2, 1, 0, 0, 0, 2, 2, 0, 0, 0, 2,
+ 0, 0, 2, 0, 2, 0, 1, 0, 0, 2, 1, 1, 0, 0, 2,
+ 2, 1, 0, 0, 2, 1, 0, 2, 0, 2, 0, 2, 0, 0, 2,
+ 1, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2,
+ 0, 2, 2, 0, 2, 1, 2, 2, 0, 2, 2, 2, 2, 0, 2,
+ 2, 0, 2, 0, 2, 0, 0, 1, 0, 2, 1, 0, 1, 0, 2,
+ 2, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 1, 1, 0, 2,
+ 1, 1, 1, 0, 2, 2, 1, 1, 0, 2, 1, 1, 2, 0, 2,
+ 0, 2, 1, 0, 2, 1, 2, 1, 0, 2, 2, 2, 1, 0, 2,
+ 2, 1, 2, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1,
+ 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 2, 0, 1,
+ 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 2, 1, 0, 0, 1,
+ 1, 0, 2, 0, 1, 0, 2, 0, 0, 1, 1, 2, 0, 0, 1,
+ 2, 2, 0, 0, 1, 2, 0, 2, 0, 1, 0, 2, 2, 0, 1,
+ 1, 2, 2, 0, 1, 2, 2, 2, 0, 1, 2, 0, 2, 0, 1,
+ 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1,
+ 0, 1, 2, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
+ 2, 1, 1, 0, 1, 1, 1, 2, 0, 1, 0, 2, 1, 0, 1,
+ 1, 2, 1, 0, 1, 2, 2, 1, 0, 1, 2, 1, 2, 0, 1,
+ 0, 0, 1, 2, 2, 1, 0, 1, 2, 2, 2, 0, 1, 2, 2,
+ 0, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
+ 2, 0, 0, 1, 1, 0, 0, 2, 1, 1, 0, 1, 0, 1, 1,
+ 1, 1, 0, 1, 1, 2, 1, 0, 1, 1, 1, 0, 2, 1, 1,
+ 0, 2, 0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 1,
+ 2, 0, 2, 1, 1, 0, 2, 2, 1, 1, 1, 2, 2, 1, 1,
+ 2, 2, 2, 1, 1, 2, 0, 2, 1, 1, 0, 0, 1, 1, 1,
+ 1, 0, 1, 1, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
+ 1, 1, 2, 1, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 1,
+ 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 1, 2, 2,
+ 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2,
+ 0, 0, 0, 2, 1, 1, 0, 0, 2, 1, 2, 0, 0, 2, 1,
+ 0, 0, 2, 2, 1, 0, 1, 0, 2, 1, 1, 1, 0, 2, 1,
+ 2, 1, 0, 2, 1, 1, 0, 2, 2, 1, 0, 2, 0, 2, 1,
+ 1, 2, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 2, 2, 1,
+ 0, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1,
+ 2, 0, 2, 2, 1, 0, 0, 1, 2, 1, 1, 0, 1, 2, 1,
+ 2, 0, 1, 2, 1, 0, 1, 2, 2, 1, 0, 1, 1, 2, 1,
+ 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1,
+ 0, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1,
+ 2, 1, 2, 2, 1, 0, 2, 1, 2, 2, 1, 2, 1, 2, 2,
+ 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 0, 0, 0, 1, 2,
+ 1, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 0, 2, 1, 2,
+ 0, 1, 0, 1, 2, 1, 1, 0, 1, 2, 2, 1, 0, 1, 2,
+ 1, 0, 2, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 2,
+ 2, 2, 0, 1, 2, 2, 0, 2, 1, 2, 0, 2, 2, 1, 2,
+ 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 0, 2, 1, 2,
+ 0, 0, 1, 1, 2, 1, 0, 1, 1, 2, 2, 0, 1, 1, 2,
+ 0, 1, 2, 1, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 2,
+ 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 0, 2, 1, 1, 2,
+ 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2,
+ 0, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 1, 2, 2, 2
+ };
+
+ private static ReadOnlySpan<byte> QuintEncodings => new byte[]
+ {
+ 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0,
+ 0, 4, 0, 4, 4, 0, 4, 4, 4, 0, 1, 0, 1, 1, 0,
+ 2, 1, 0, 3, 1, 0, 4, 1, 0, 1, 4, 0, 4, 4, 1,
+ 4, 4, 4, 0, 2, 0, 1, 2, 0, 2, 2, 0, 3, 2, 0,
+ 4, 2, 0, 2, 4, 0, 4, 4, 2, 4, 4, 4, 0, 3, 0,
+ 1, 3, 0, 2, 3, 0, 3, 3, 0, 4, 3, 0, 3, 4, 0,
+ 4, 4, 3, 4, 4, 4, 0, 0, 1, 1, 0, 1, 2, 0, 1,
+ 3, 0, 1, 4, 0, 1, 0, 4, 1, 4, 0, 4, 0, 4, 4,
+ 0, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1,
+ 1, 4, 1, 4, 1, 4, 1, 4, 4, 0, 2, 1, 1, 2, 1,
+ 2, 2, 1, 3, 2, 1, 4, 2, 1, 2, 4, 1, 4, 2, 4,
+ 2, 4, 4, 0, 3, 1, 1, 3, 1, 2, 3, 1, 3, 3, 1,
+ 4, 3, 1, 3, 4, 1, 4, 3, 4, 3, 4, 4, 0, 0, 2,
+ 1, 0, 2, 2, 0, 2, 3, 0, 2, 4, 0, 2, 0, 4, 2,
+ 2, 0, 4, 3, 0, 4, 0, 1, 2, 1, 1, 2, 2, 1, 2,
+ 3, 1, 2, 4, 1, 2, 1, 4, 2, 2, 1, 4, 3, 1, 4,
+ 0, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 4, 2, 2,
+ 2, 4, 2, 2, 2, 4, 3, 2, 4, 0, 3, 2, 1, 3, 2,
+ 2, 3, 2, 3, 3, 2, 4, 3, 2, 3, 4, 2, 2, 3, 4,
+ 3, 3, 4, 0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 3,
+ 4, 0, 3, 0, 4, 3, 0, 0, 4, 1, 0, 4, 0, 1, 3,
+ 1, 1, 3, 2, 1, 3, 3, 1, 3, 4, 1, 3, 1, 4, 3,
+ 0, 1, 4, 1, 1, 4, 0, 2, 3, 1, 2, 3, 2, 2, 3,
+ 3, 2, 3, 4, 2, 3, 2, 4, 3, 0, 2, 4, 1, 2, 4,
+ 0, 3, 3, 1, 3, 3, 2, 3, 3, 3, 3, 3, 4, 3, 3,
+ 3, 4, 3, 0, 3, 4, 1, 3, 4
+ };
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Astc/IntegerSequence.cs b/src/Ryujinx.Graphics.Texture/Astc/IntegerSequence.cs
new file mode 100644
index 00000000..367b6809
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Astc/IntegerSequence.cs
@@ -0,0 +1,31 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture.Astc
+{
+ // Fixed-capacity list of IntegerEncoded values stored inline in the struct.
+ // The explicit size reserves room for Capacity entries after the length.
+ [StructLayout(LayoutKind.Sequential, Size = IntegerEncoded.StructSize * Capacity + sizeof(int))]
+ internal struct IntegerSequence
+ {
+     private const int Capacity = 100;
+
+     private int _length;
+
+     // First entry; the remaining ones live in the space reserved by the struct size.
+     private IntegerEncoded _start;
+
+     // The entries added so far.
+     public Span<IntegerEncoded> List
+     {
+         get { return MemoryMarshal.CreateSpan(ref _start, _length); }
+     }
+
+     // Empties the list without touching the storage.
+     public void Reset()
+     {
+         _length = 0;
+     }
+
+     // Appends a copy of item. The capacity is only checked in debug builds.
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public void Add(ref IntegerEncoded item)
+     {
+         Debug.Assert(_length < Capacity);
+
+         // Grow first so that the new slot falls inside the span List returns.
+         int slot = _length++;
+
+         List[slot] = item;
+     }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/BC6Decoder.cs b/src/Ryujinx.Graphics.Texture/BC6Decoder.cs
new file mode 100644
index 00000000..819bf022
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/BC6Decoder.cs
@@ -0,0 +1,819 @@
+using Ryujinx.Graphics.Texture.Utils;
+using System;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture
+{
+ static class BC6Decoder
+ {
+ // 1.0 encoded as a 16-bit half-precision float; written as the alpha of every texel.
+ private const int HalfOne = 0x3C00;
+
+ // Decodes an image made of 4x4 blocks.
+ //
+ // output: destination, viewed as 64-bit texels in rows of width texels.
+ // data:   source blocks, stored row by row.
+ // width, height: image size in texels; blocks on the right and bottom edges
+ //         are clipped to the image.
+ // signed: forwarded to the block decoder to select the signed variant.
+ public static void Decode(Span<byte> output, ReadOnlySpan<byte> data, int width, int height, bool signed)
+ {
+     ReadOnlySpan<Block> blocks = MemoryMarshal.Cast<byte, Block>(data);
+     Span<ulong> texels = MemoryMarshal.Cast<byte, ulong>(output);
+
+     int blocksPerRow = (width + 3) / 4;
+     int blockRows = (height + 3) / 4;
+     int blockIndex = 0;
+
+     for (int blockY = 0; blockY < blockRows; blockY++)
+     {
+         int top = blockY * 4;
+         int rowsInside = Math.Min(4, height - top);
+
+         for (int blockX = 0; blockX < blocksPerRow; blockX++, blockIndex++)
+         {
+             int left = blockX * 4;
+             int columnsInside = Math.Min(4, width - left);
+
+             Span<ulong> destination = texels.Slice(top * width + left);
+
+             DecodeBlock(blocks[blockIndex], destination, columnsInside, rowsInside, width, signed);
+         }
+     }
+ }
+
+ // Decodes one block into the part of it that lies inside the image.
+ //
+ // block:  the encoded block.
+ // output: destination starting at the block's top-left texel. Each texel is
+ //         written as R | G << 16 | B << 32 with HalfOne in the top 16 bits.
+ // w, h:   number of columns and rows of the block to write (1 to 4).
+ // width:  distance between rows of output, in texels.
+ // signed: selects the signed variant when decoding end points and texels.
+ private static void DecodeBlock(Block block, Span<ulong> output, int w, int h, int width, bool signed)
+ {
+     // The mode is 2 bits wide when bit 1 is clear and 5 bits wide otherwise.
+     int mode = (int)(block.Low & 3);
+     if ((mode & 2) != 0)
+     {
+         mode = (int)(block.Low & 0x1f);
+     }
+
+     Span<RgbaColor32> endPoints = stackalloc RgbaColor32[4];
+     int subsetCount = DecodeEndPoints(ref block, endPoints, mode, signed);
+     if (subsetCount == 0)
+     {
+         // Mode is invalid, the spec mandates that hardware fills the block with
+         // a opaque black color.
+         for (int ty = 0; ty < h; ty++)
+         {
+             int baseOffs = ty * width;
+
+             for (int tx = 0; tx < w; tx++)
+             {
+                 output[baseOffs + tx] = (ulong)HalfOne << 48;
+             }
+         }
+
+         return;
+     }
+
+     int partition;
+     int indexBitCount;
+     ulong indices;
+
+     if (subsetCount > 1)
+     {
+         partition = (int)((block.High >> 13) & 0x1F);
+         indexBitCount = 3;
+
+         // Texel 0 and the fix-up texel of the second subset are stored with one
+         // bit less than the others. The three pieces below are shifted by
+         // different amounts so that every texel ends up at a multiple of
+         // indexBitCount.
+         int fixUpIndex = BC67Tables.FixUpIndices[subsetCount - 1][partition][1] * 3;
+         ulong lowMask = (ulong.MaxValue >> (65 - fixUpIndex)) << 3;
+         ulong highMask = ulong.MaxValue << (fixUpIndex + 3);
+
+         indices = ((block.High >> 16) & highMask) | ((block.High >> 17) & lowMask) | ((block.High >> 18) & 3);
+     }
+     else
+     {
+         partition = 0;
+         indexBitCount = 4;
+
+         // Texel 0 is stored with 3 bits; widen it to 4 so that all texels line up.
+         indices = (block.High & ~0xFUL) | ((block.High >> 1) & 7);
+     }
+
+     ulong indexMask = (1UL << indexBitCount) - 1;
+
+     for (int ty = 0; ty < h; ty++)
+     {
+         int baseOffs = ty * width;
+
+         for (int tx = 0; tx < w; tx++)
+         {
+             int offs = baseOffs + tx;
+             int texel = ty * 4 + tx;
+
+             // Indices exist for all 16 texels of the block in row-major order, so
+             // they are addressed by position. Consuming them one after another
+             // would pick the wrong index on every row after the first when w < 4.
+             int index = (int)((indices >> (texel * indexBitCount)) & indexMask);
+             int endPointBase = BC67Tables.PartitionTable[subsetCount - 1][partition][texel] << 1;
+
+             RgbaColor32 color1 = endPoints[endPointBase];
+             RgbaColor32 color2 = endPoints[endPointBase + 1];
+
+             RgbaColor32 color = BC67Utils.Interpolate(color1, color2, index, indexBitCount);
+
+             output[offs] =
+                 (ulong)FinishUnquantize(color.R, signed) |
+                 ((ulong)FinishUnquantize(color.G, signed) << 16) |
+                 ((ulong)FinishUnquantize(color.B, signed) << 32) |
+                 ((ulong)HalfOne << 48);
+         }
+     }
+ }
+
+ private static int DecodeEndPoints(ref Block block, Span<RgbaColor32> endPoints, int mode, bool signed)
+ {
+ ulong low = block.Low;
+ ulong high = block.High;
+
+ int r0 = 0, g0 = 0, b0 = 0, r1 = 0, g1 = 0, b1 = 0, r2 = 0, g2 = 0, b2 = 0, r3 = 0, g3 = 0, b3 = 0;
+ int subsetCount;
+
+ switch (mode)
+ {
+ case 0:
+ r0 = (int)(low >> 5) & 0x3FF;
+ g0 = (int)(low >> 15) & 0x3FF;
+ b0 = (int)(low >> 25) & 0x3FF;
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 10);
+ g0 = SignExtend(g0, 10);
+ b0 = SignExtend(b0, 10);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 5);
+ g1 = g0 + SignExtend((int)(low >> 45), 5);
+ b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 5);
+ g2 = g0 + SignExtend((int)(((low << 2) & 0x10) | ((low >> 41) & 0xF)), 5);
+ b2 = b0 + SignExtend((int)(((low << 1) & 0x10) | ((high << 3) & 0x08) | (low >> 61)), 5);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 5);
+ g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5);
+ b3 = b0 + SignExtend((int)(
+ ((low) & 0x10) |
+ ((high >> 9) & 0x08) |
+ ((high >> 4) & 0x04) |
+ ((low >> 59) & 0x02) |
+ ((low >> 50) & 0x01)), 5);
+
+ r0 = Unquantize(r0, 10, signed);
+ g0 = Unquantize(g0, 10, signed);
+ b0 = Unquantize(b0, 10, signed);
+
+ r1 = Unquantize(r1 & 0x3FF, 10, signed);
+ g1 = Unquantize(g1 & 0x3FF, 10, signed);
+ b1 = Unquantize(b1 & 0x3FF, 10, signed);
+
+ r2 = Unquantize(r2 & 0x3FF, 10, signed);
+ g2 = Unquantize(g2 & 0x3FF, 10, signed);
+ b2 = Unquantize(b2 & 0x3FF, 10, signed);
+
+ r3 = Unquantize(r3 & 0x3FF, 10, signed);
+ g3 = Unquantize(g3 & 0x3FF, 10, signed);
+ b3 = Unquantize(b3 & 0x3FF, 10, signed);
+
+ subsetCount = 2;
+ break;
+ case 1:
+ r0 = (int)(low >> 5) & 0x7F;
+ g0 = (int)(low >> 15) & 0x7F;
+ b0 = (int)(low >> 25) & 0x7F;
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 7);
+ g0 = SignExtend(g0, 7);
+ b0 = SignExtend(b0, 7);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 6);
+ g1 = g0 + SignExtend((int)(low >> 45), 6);
+ b1 = b0 + SignExtend((int)(low >> 55), 6);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 6);
+ g2 = g0 + SignExtend((int)(((low << 3) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0x0F)), 6);
+ b2 = b0 + SignExtend((int)(
+ ((low >> 17) & 0x20) |
+ ((low >> 10) & 0x10) |
+ ((high << 3) & 0x08) |
+ (low >> 61)), 6);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 6);
+ g3 = g0 + SignExtend((int)(((low << 1) & 0x30) | ((low >> 51) & 0xF)), 6);
+ b3 = b0 + SignExtend((int)(
+ ((low >> 28) & 0x20) |
+ ((low >> 30) & 0x10) |
+ ((low >> 29) & 0x08) |
+ ((low >> 21) & 0x04) |
+ ((low >> 12) & 0x03)), 6);
+
+ r0 = Unquantize(r0, 7, signed);
+ g0 = Unquantize(g0, 7, signed);
+ b0 = Unquantize(b0, 7, signed);
+
+ r1 = Unquantize(r1 & 0x7F, 7, signed);
+ g1 = Unquantize(g1 & 0x7F, 7, signed);
+ b1 = Unquantize(b1 & 0x7F, 7, signed);
+
+ r2 = Unquantize(r2 & 0x7F, 7, signed);
+ g2 = Unquantize(g2 & 0x7F, 7, signed);
+ b2 = Unquantize(b2 & 0x7F, 7, signed);
+
+ r3 = Unquantize(r3 & 0x7F, 7, signed);
+ g3 = Unquantize(g3 & 0x7F, 7, signed);
+ b3 = Unquantize(b3 & 0x7F, 7, signed);
+
+ subsetCount = 2;
+ break;
+ case 2:
+ r0 = (int)(((low >> 30) & 0x400) | ((low >> 5) & 0x3FF));
+ g0 = (int)(((low >> 39) & 0x400) | ((low >> 15) & 0x3FF));
+ b0 = (int)(((low >> 49) & 0x400) | ((low >> 25) & 0x3FF));
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 11);
+ g0 = SignExtend(g0, 11);
+ b0 = SignExtend(b0, 11);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 5);
+ g1 = g0 + SignExtend((int)(low >> 45), 4);
+ b1 = b0 + SignExtend((int)(low >> 55), 4);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 5);
+ g2 = g0 + SignExtend((int)(low >> 41), 4);
+ b2 = b0 + SignExtend((int)(((high << 3) & 8) | (low >> 61)), 4);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 5);
+ g3 = g0 + SignExtend((int)(low >> 51), 4);
+ b3 = b0 + SignExtend((int)(
+ ((high >> 9) & 8) |
+ ((high >> 4) & 4) |
+ ((low >> 59) & 2) |
+ ((low >> 50) & 1)), 4);
+
+ r0 = Unquantize(r0, 11, signed);
+ g0 = Unquantize(g0, 11, signed);
+ b0 = Unquantize(b0, 11, signed);
+
+ r1 = Unquantize(r1 & 0x7FF, 11, signed);
+ g1 = Unquantize(g1 & 0x7FF, 11, signed);
+ b1 = Unquantize(b1 & 0x7FF, 11, signed);
+
+ r2 = Unquantize(r2 & 0x7FF, 11, signed);
+ g2 = Unquantize(g2 & 0x7FF, 11, signed);
+ b2 = Unquantize(b2 & 0x7FF, 11, signed);
+
+ r3 = Unquantize(r3 & 0x7FF, 11, signed);
+ g3 = Unquantize(g3 & 0x7FF, 11, signed);
+ b3 = Unquantize(b3 & 0x7FF, 11, signed);
+
+ subsetCount = 2;
+ break;
+ case 3:
+ r0 = (int)(low >> 5) & 0x3FF;
+ g0 = (int)(low >> 15) & 0x3FF;
+ b0 = (int)(low >> 25) & 0x3FF;
+
+ r1 = (int)(low >> 35) & 0x3FF;
+ g1 = (int)(low >> 45) & 0x3FF;
+ b1 = (int)(((high << 9) & 0x200) | (low >> 55));
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 10);
+ g0 = SignExtend(g0, 10);
+ b0 = SignExtend(b0, 10);
+
+ r1 = SignExtend(r1, 10);
+ g1 = SignExtend(g1, 10);
+ b1 = SignExtend(b1, 10);
+ }
+
+ r0 = Unquantize(r0, 10, signed);
+ g0 = Unquantize(g0, 10, signed);
+ b0 = Unquantize(b0, 10, signed);
+
+ r1 = Unquantize(r1, 10, signed);
+ g1 = Unquantize(g1, 10, signed);
+ b1 = Unquantize(b1, 10, signed);
+
+ subsetCount = 1;
+ break;
+ case 6:
+ r0 = (int)(((low >> 29) & 0x400) | ((low >> 5) & 0x3FF));
+ g0 = (int)(((low >> 40) & 0x400) | ((low >> 15) & 0x3FF));
+ b0 = (int)(((low >> 49) & 0x400) | ((low >> 25) & 0x3FF));
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 11);
+ g0 = SignExtend(g0, 11);
+ b0 = SignExtend(b0, 11);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 4);
+ g1 = g0 + SignExtend((int)(low >> 45), 5);
+ b1 = b0 + SignExtend((int)(low >> 55), 4);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 4);
+ g2 = g0 + SignExtend((int)(((high >> 7) & 0x10) | ((low >> 41) & 0x0F)), 5);
+ b2 = b0 + SignExtend((int)(((high << 3) & 0x08) | ((low >> 61))), 4);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 4);
+ g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0x0F)), 5);
+ b3 = b0 + SignExtend((int)(
+ ((high >> 9) & 8) |
+ ((high >> 4) & 4) |
+ ((low >> 59) & 2) |
+ ((high >> 5) & 1)), 4);
+
+ r0 = Unquantize(r0, 11, signed);
+ g0 = Unquantize(g0, 11, signed);
+ b0 = Unquantize(b0, 11, signed);
+
+ r1 = Unquantize(r1 & 0x7FF, 11, signed);
+ g1 = Unquantize(g1 & 0x7FF, 11, signed);
+ b1 = Unquantize(b1 & 0x7FF, 11, signed);
+
+ r2 = Unquantize(r2 & 0x7FF, 11, signed);
+ g2 = Unquantize(g2 & 0x7FF, 11, signed);
+ b2 = Unquantize(b2 & 0x7FF, 11, signed);
+
+ r3 = Unquantize(r3 & 0x7FF, 11, signed);
+ g3 = Unquantize(g3 & 0x7FF, 11, signed);
+ b3 = Unquantize(b3 & 0x7FF, 11, signed);
+
+ subsetCount = 2;
+ break;
+ case 7:
+ r0 = (int)(((low >> 34) & 0x400) | ((low >> 5) & 0x3FF));
+ g0 = (int)(((low >> 44) & 0x400) | ((low >> 15) & 0x3FF));
+ b0 = (int)(((high << 10) & 0x400) | ((low >> 25) & 0x3FF));
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 11);
+ g0 = SignExtend(g0, 11);
+ b0 = SignExtend(b0, 11);
+ }
+
+ r1 = (r0 + SignExtend((int)(low >> 35), 9)) & 0x7FF;
+ g1 = (g0 + SignExtend((int)(low >> 45), 9)) & 0x7FF;
+ b1 = (b0 + SignExtend((int)(low >> 55), 9)) & 0x7FF;
+
+ r0 = Unquantize(r0, 11, signed);
+ g0 = Unquantize(g0, 11, signed);
+ b0 = Unquantize(b0, 11, signed);
+
+ r1 = Unquantize(r1, 11, signed);
+ g1 = Unquantize(g1, 11, signed);
+ b1 = Unquantize(b1, 11, signed);
+
+ subsetCount = 1;
+ break;
+ case 10:
+ r0 = (int)(((low >> 29) & 0x400) | ((low >> 5) & 0x3FF));
+ g0 = (int)(((low >> 39) & 0x400) | ((low >> 15) & 0x3FF));
+ b0 = (int)(((low >> 50) & 0x400) | ((low >> 25) & 0x3FF));
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 11);
+ g0 = SignExtend(g0, 11);
+ b0 = SignExtend(b0, 11);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 4);
+ g1 = g0 + SignExtend((int)(low >> 45), 4);
+ b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 4);
+ g2 = g0 + SignExtend((int)(low >> 41), 4);
+ b2 = b0 + SignExtend((int)(((low >> 36) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 4);
+ g3 = g0 + SignExtend((int)(low >> 51), 4);
+ b3 = b0 + SignExtend((int)(
+ ((high >> 7) & 0x10) |
+ ((high >> 9) & 0x08) |
+ ((high >> 4) & 0x06) |
+ ((low >> 50) & 0x01)), 5);
+
+ r0 = Unquantize(r0, 11, signed);
+ g0 = Unquantize(g0, 11, signed);
+ b0 = Unquantize(b0, 11, signed);
+
+ r1 = Unquantize(r1 & 0x7FF, 11, signed);
+ g1 = Unquantize(g1 & 0x7FF, 11, signed);
+ b1 = Unquantize(b1 & 0x7FF, 11, signed);
+
+ r2 = Unquantize(r2 & 0x7FF, 11, signed);
+ g2 = Unquantize(g2 & 0x7FF, 11, signed);
+ b2 = Unquantize(b2 & 0x7FF, 11, signed);
+
+ r3 = Unquantize(r3 & 0x7FF, 11, signed);
+ g3 = Unquantize(g3 & 0x7FF, 11, signed);
+ b3 = Unquantize(b3 & 0x7FF, 11, signed);
+
+ subsetCount = 2;
+ break;
+ case 11:
+ r0 = (int)(((low >> 32) & 0x800) | ((low >> 34) & 0x400) | ((low >> 5) & 0x3FF));
+ g0 = (int)(((low >> 42) & 0x800) | ((low >> 44) & 0x400) | ((low >> 15) & 0x3FF));
+ b0 = (int)(((low >> 52) & 0x800) | ((high << 10) & 0x400) | ((low >> 25) & 0x3FF));
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 12);
+ g0 = SignExtend(g0, 12);
+ b0 = SignExtend(b0, 12);
+ }
+
+ r1 = (r0 + SignExtend((int)(low >> 35), 8)) & 0xFFF;
+ g1 = (g0 + SignExtend((int)(low >> 45), 8)) & 0xFFF;
+ b1 = (b0 + SignExtend((int)(low >> 55), 8)) & 0xFFF;
+
+ r0 = Unquantize(r0, 12, signed);
+ g0 = Unquantize(g0, 12, signed);
+ b0 = Unquantize(b0, 12, signed);
+
+ r1 = Unquantize(r1, 12, signed);
+ g1 = Unquantize(g1, 12, signed);
+ b1 = Unquantize(b1, 12, signed);
+
+ subsetCount = 1;
+ break;
+ case 14:
+ r0 = (int)(low >> 5) & 0x1FF;
+ g0 = (int)(low >> 15) & 0x1FF;
+ b0 = (int)(low >> 25) & 0x1FF;
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 9);
+ g0 = SignExtend(g0, 9);
+ b0 = SignExtend(b0, 9);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 5);
+ g1 = g0 + SignExtend((int)(low >> 45), 5);
+ b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 5);
+ g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5);
+ b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 5);
+ g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5);
+ b3 = b0 + SignExtend((int)(
+ ((low >> 30) & 0x10) |
+ ((high >> 9) & 0x08) |
+ ((high >> 4) & 0x04) |
+ ((low >> 59) & 0x02) |
+ ((low >> 50) & 0x01)), 5);
+
+ r0 = Unquantize(r0, 9, signed);
+ g0 = Unquantize(g0, 9, signed);
+ b0 = Unquantize(b0, 9, signed);
+
+ r1 = Unquantize(r1 & 0x1FF, 9, signed);
+ g1 = Unquantize(g1 & 0x1FF, 9, signed);
+ b1 = Unquantize(b1 & 0x1FF, 9, signed);
+
+ r2 = Unquantize(r2 & 0x1FF, 9, signed);
+ g2 = Unquantize(g2 & 0x1FF, 9, signed);
+ b2 = Unquantize(b2 & 0x1FF, 9, signed);
+
+ r3 = Unquantize(r3 & 0x1FF, 9, signed);
+ g3 = Unquantize(g3 & 0x1FF, 9, signed);
+ b3 = Unquantize(b3 & 0x1FF, 9, signed);
+
+ subsetCount = 2;
+ break;
+ case 15:
+ r0 = (BitReverse6((int)(low >> 39) & 0x3F) << 10) | ((int)(low >> 5) & 0x3FF);
+ g0 = (BitReverse6((int)(low >> 49) & 0x3F) << 10) | ((int)(low >> 15) & 0x3FF);
+ b0 = ((BitReverse6((int)(low >> 59)) | (int)(high & 1)) << 10) | ((int)(low >> 25) & 0x3FF);
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 16);
+ g0 = SignExtend(g0, 16);
+ b0 = SignExtend(b0, 16);
+ }
+
+ r1 = (r0 + SignExtend((int)(low >> 35), 4)) & 0xFFFF;
+ g1 = (g0 + SignExtend((int)(low >> 45), 4)) & 0xFFFF;
+ b1 = (b0 + SignExtend((int)(low >> 55), 4)) & 0xFFFF;
+
+ subsetCount = 1;
+ break;
+ case 18:
+ r0 = (int)(low >> 5) & 0xFF;
+ g0 = (int)(low >> 15) & 0xFF;
+ b0 = (int)(low >> 25) & 0xFF;
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 8);
+ g0 = SignExtend(g0, 8);
+ b0 = SignExtend(b0, 8);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 6);
+ g1 = g0 + SignExtend((int)(low >> 45), 5);
+ b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 6);
+ g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5);
+ b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 6);
+ g3 = g0 + SignExtend((int)(((low >> 9) & 0x10) | ((low >> 51) & 0xF)), 5);
+ b3 = b0 + SignExtend((int)(
+ ((low >> 30) & 0x18) |
+ ((low >> 21) & 0x04) |
+ ((low >> 59) & 0x02) |
+ ((low >> 50) & 0x01)), 5);
+
+ r0 = Unquantize(r0, 8, signed);
+ g0 = Unquantize(g0, 8, signed);
+ b0 = Unquantize(b0, 8, signed);
+
+ r1 = Unquantize(r1 & 0xFF, 8, signed);
+ g1 = Unquantize(g1 & 0xFF, 8, signed);
+ b1 = Unquantize(b1 & 0xFF, 8, signed);
+
+ r2 = Unquantize(r2 & 0xFF, 8, signed);
+ g2 = Unquantize(g2 & 0xFF, 8, signed);
+ b2 = Unquantize(b2 & 0xFF, 8, signed);
+
+ r3 = Unquantize(r3 & 0xFF, 8, signed);
+ g3 = Unquantize(g3 & 0xFF, 8, signed);
+ b3 = Unquantize(b3 & 0xFF, 8, signed);
+
+ subsetCount = 2;
+ break;
+ case 22:
+ r0 = (int)(low >> 5) & 0xFF;
+ g0 = (int)(low >> 15) & 0xFF;
+ b0 = (int)(low >> 25) & 0xFF;
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 8);
+ g0 = SignExtend(g0, 8);
+ b0 = SignExtend(b0, 8);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 5);
+ g1 = g0 + SignExtend((int)(low >> 45), 6);
+ b1 = b0 + SignExtend((int)(low >> 55), 5);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 5);
+ g2 = g0 + SignExtend((int)(((low >> 18) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 6);
+ b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 0x08) | (low >> 61)), 5);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 5);
+ g3 = g0 + SignExtend((int)(((low >> 28) & 0x20) | ((low >> 36) & 0x10) | ((low >> 51) & 0x0F)), 6);
+ b3 = b0 + SignExtend((int)(
+ ((low >> 30) & 0x10) |
+ ((high >> 9) & 0x08) |
+ ((high >> 4) & 0x04) |
+ ((low >> 59) & 0x02) |
+ ((low >> 13) & 0x01)), 5);
+
+ r0 = Unquantize(r0, 8, signed);
+ g0 = Unquantize(g0, 8, signed);
+ b0 = Unquantize(b0, 8, signed);
+
+ r1 = Unquantize(r1 & 0xFF, 8, signed);
+ g1 = Unquantize(g1 & 0xFF, 8, signed);
+ b1 = Unquantize(b1 & 0xFF, 8, signed);
+
+ r2 = Unquantize(r2 & 0xFF, 8, signed);
+ g2 = Unquantize(g2 & 0xFF, 8, signed);
+ b2 = Unquantize(b2 & 0xFF, 8, signed);
+
+ r3 = Unquantize(r3 & 0xFF, 8, signed);
+ g3 = Unquantize(g3 & 0xFF, 8, signed);
+ b3 = Unquantize(b3 & 0xFF, 8, signed);
+
+ subsetCount = 2;
+ break;
+ case 26:
+ r0 = (int)(low >> 5) & 0xFF;
+ g0 = (int)(low >> 15) & 0xFF;
+ b0 = (int)(low >> 25) & 0xFF;
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 8);
+ g0 = SignExtend(g0, 8);
+ b0 = SignExtend(b0, 8);
+ }
+
+ r1 = r0 + SignExtend((int)(low >> 35), 5);
+ g1 = g0 + SignExtend((int)(low >> 45), 5);
+ b1 = b0 + SignExtend((int)(low >> 55), 6);
+
+ r2 = r0 + SignExtend((int)(high >> 1), 5);
+ g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5);
+ b2 = b0 + SignExtend((int)(
+ ((low >> 18) & 0x20) |
+ ((low >> 10) & 0x10) |
+ ((high << 3) & 0x08) |
+ (low >> 61)), 6);
+
+ r3 = r0 + SignExtend((int)(high >> 7), 5);
+ g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5);
+ b3 = b0 + SignExtend((int)(
+ ((low >> 28) & 0x20) |
+ ((low >> 30) & 0x10) |
+ ((high >> 9) & 0x08) |
+ ((high >> 4) & 0x04) |
+ ((low >> 12) & 0x02) |
+ ((low >> 50) & 0x01)), 6);
+
+ r0 = Unquantize(r0, 8, signed);
+ g0 = Unquantize(g0, 8, signed);
+ b0 = Unquantize(b0, 8, signed);
+
+ r1 = Unquantize(r1 & 0xFF, 8, signed);
+ g1 = Unquantize(g1 & 0xFF, 8, signed);
+ b1 = Unquantize(b1 & 0xFF, 8, signed);
+
+ r2 = Unquantize(r2 & 0xFF, 8, signed);
+ g2 = Unquantize(g2 & 0xFF, 8, signed);
+ b2 = Unquantize(b2 & 0xFF, 8, signed);
+
+ r3 = Unquantize(r3 & 0xFF, 8, signed);
+ g3 = Unquantize(g3 & 0xFF, 8, signed);
+ b3 = Unquantize(b3 & 0xFF, 8, signed);
+
+ subsetCount = 2;
+ break;
+ case 30:
+ r0 = (int)(low >> 5) & 0x3F;
+ g0 = (int)(low >> 15) & 0x3F;
+ b0 = (int)(low >> 25) & 0x3F;
+
+ r1 = (int)(low >> 35) & 0x3F;
+ g1 = (int)(low >> 45) & 0x3F;
+ b1 = (int)(low >> 55) & 0x3F;
+
+ r2 = (int)(high >> 1) & 0x3F;
+ g2 = (int)(((low >> 16) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0xF));
+ b2 = (int)(((low >> 17) & 0x20) | ((low >> 10) & 0x10) | ((high << 3) & 0x08) | (low >> 61));
+
+ r3 = (int)(high >> 7) & 0x3F;
+ g3 = (int)(((low >> 26) & 0x20) | ((low >> 7) & 0x10) | ((low >> 51) & 0xF));
+ b3 = (int)(
+ ((low >> 28) & 0x20) |
+ ((low >> 30) & 0x10) |
+ ((low >> 29) & 0x08) |
+ ((low >> 21) & 0x04) |
+ ((low >> 12) & 0x03));
+
+ if (signed)
+ {
+ r0 = SignExtend(r0, 6);
+ g0 = SignExtend(g0, 6);
+ b0 = SignExtend(b0, 6);
+
+ r1 = SignExtend(r1, 6);
+ g1 = SignExtend(g1, 6);
+ b1 = SignExtend(b1, 6);
+
+ r2 = SignExtend(r2, 6);
+ g2 = SignExtend(g2, 6);
+ b2 = SignExtend(b2, 6);
+
+ r3 = SignExtend(r3, 6);
+ g3 = SignExtend(g3, 6);
+ b3 = SignExtend(b3, 6);
+ }
+
+ r0 = Unquantize(r0, 6, signed);
+ g0 = Unquantize(g0, 6, signed);
+ b0 = Unquantize(b0, 6, signed);
+
+ r1 = Unquantize(r1, 6, signed);
+ g1 = Unquantize(g1, 6, signed);
+ b1 = Unquantize(b1, 6, signed);
+
+ r2 = Unquantize(r2, 6, signed);
+ g2 = Unquantize(g2, 6, signed);
+ b2 = Unquantize(b2, 6, signed);
+
+ r3 = Unquantize(r3, 6, signed);
+ g3 = Unquantize(g3, 6, signed);
+ b3 = Unquantize(b3, 6, signed);
+
+ subsetCount = 2;
+ break;
+ default:
+ subsetCount = 0;
+ break;
+ }
+
+ if (subsetCount > 0)
+ {
+ endPoints[0] = new RgbaColor32(r0, g0, b0, HalfOne);
+ endPoints[1] = new RgbaColor32(r1, g1, b1, HalfOne);
+
+ if (subsetCount > 1)
+ {
+ endPoints[2] = new RgbaColor32(r2, g2, b2, HalfOne);
+ endPoints[3] = new RgbaColor32(r3, g3, b3, HalfOne);
+ }
+ }
+
+ return subsetCount;
+ }
+
+ /// <summary>
+ /// Interprets the low <paramref name="bits"/> bits of <paramref name="value"/> as a two's complement
+ /// number and widens it to a 32-bit signed integer. Bits above the field are discarded.
+ /// </summary>
+ /// <param name="value">Value holding the field in its least significant bits</param>
+ /// <param name="bits">Width of the field, in bits</param>
+ /// <returns>The sign-extended field</returns>
+ private static int SignExtend(int value, int bits)
+ {
+ // Park the field's top bit in bit 31, then let the arithmetic right shift replicate it downwards.
+ int unusedBits = 32 - bits;
+ int alignedToTop = value << unusedBits;
+
+ return alignedToTop >> unusedBits;
+ }
+
+ /// <summary>
+ /// Expands a quantized endpoint component of <paramref name="bits"/> bits to the decoder's working range.
+ /// For unsigned data the largest code maps to 0xFFFF; for signed data codes at or beyond the
+ /// largest magnitude map to +/-0x7FFF. Zero always maps to zero.
+ /// </summary>
+ /// <param name="value">Quantized component (already sign-extended when <paramref name="signed"/> is true)</param>
+ /// <param name="bits">Precision of the quantized component, in bits</param>
+ /// <param name="signed">True to use the signed expansion, false for the unsigned one</param>
+ /// <returns>The expanded component</returns>
+ private static int Unquantize(int value, int bits, bool signed)
+ {
+ if (!signed)
+ {
+ // Components of 15 bits or more are passed through untouched, and zero stays zero.
+ if (bits >= 15 || value == 0)
+ {
+ return value;
+ }
+
+ int maxCode = (1 << bits) - 1;
+
+ // The top code saturates; everything else is rescaled with rounding.
+ return value == maxCode ? 0xFFFF : ((value << 16) + 0x8000) >> bits;
+ }
+
+ // Signed components of 16 bits or more are passed through untouched, and zero stays zero.
+ if (bits >= 16 || value == 0)
+ {
+ return value;
+ }
+
+ // Work on the magnitude and restore the sign at the end.
+ bool negative = value < 0;
+ int magnitude = negative ? -value : value;
+ int saturation = (1 << (bits - 1)) - 1;
+ int expanded = magnitude >= saturation ? 0x7FFF : ((magnitude << 15) + 0x4000) >> (bits - 1);
+
+ return negative ? -expanded : expanded;
+ }
+
+ /// <summary>
+ /// Applies the final 31/32 (signed) or 31/64 (unsigned) scale to an interpolated component and
+ /// packs it into 16 bits. Signed results are stored as sign bit (0x8000) plus magnitude.
+ /// </summary>
+ /// <param name="value">Interpolated component in the decoder's working range</param>
+ /// <param name="signed">True for the signed variant, false for the unsigned one</param>
+ /// <returns>The packed 16-bit result</returns>
+ private static ushort FinishUnquantize(int value, bool signed)
+ {
+ if (!signed)
+ {
+ return (ushort)((value * 31) >> 6);
+ }
+
+ // Scale the magnitude so that negative inputs round towards zero, then re-apply the sign.
+ int magnitude = value < 0 ? (-value * 31) >> 5 : (value * 31) >> 5;
+ int scaled = value < 0 ? -magnitude : magnitude;
+
+ if (scaled < 0)
+ {
+ // Sign-magnitude encoding: set the top bit and store the absolute value below it.
+ return (ushort)(0x8000 | -scaled);
+ }
+
+ return (ushort)scaled;
+ }
+
+ /// <summary>
+ /// Reverses the bit order of a 6-bit value (bit 0 swaps with bit 5, bit 1 with bit 4, and so on).
+ /// </summary>
+ /// <param name="value">Value whose low 6 bits are to be reversed</param>
+ /// <returns>The reversed 6-bit value</returns>
+ private static int BitReverse6(int value)
+ {
+ int mirrored = 0;
+
+ // Mirror the low byte bit by bit...
+ for (int bit = 0; bit < 8; bit++)
+ {
+ mirrored |= ((value >> bit) & 1) << (7 - bit);
+ }
+
+ // ...then drop the two lowest result bits, which came from input bits 6 and 7.
+ return mirrored >> 2;
+ }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/BC7Decoder.cs b/src/Ryujinx.Graphics.Texture/BC7Decoder.cs
new file mode 100644
index 00000000..b865a559
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/BC7Decoder.cs
@@ -0,0 +1,220 @@
+using Ryujinx.Graphics.Texture.Utils;
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture
+{
+ static class BC7Decoder
+ {
+ /// <summary>
+ /// Decodes a BC7 compressed image into <paramref name="output"/>, one 32-bit texel per pixel.
+ /// Blocks are read in row-major order and each covers a 4x4 texel area; blocks on the right
+ /// and bottom edges are clipped to the image size.
+ /// </summary>
+ /// <param name="output">Destination buffer, reinterpreted as tightly packed rows of <paramref name="width"/> uints</param>
+ /// <param name="data">Compressed block data</param>
+ /// <param name="width">Image width, in texels</param>
+ /// <param name="height">Image height, in texels</param>
+ public static void Decode(Span<byte> output, ReadOnlySpan<byte> data, int width, int height)
+ {
+ ReadOnlySpan<Block> compressed = MemoryMarshal.Cast<byte, Block>(data);
+ Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output);
+
+ // Walks the blocks in the same row-major order as the texel loops below.
+ int blockIndex = 0;
+
+ for (int top = 0; top < height; top += 4)
+ {
+ // Rows of the block that actually fall inside the image.
+ int rows = Math.Min(4, height - top);
+
+ for (int left = 0; left < width; left += 4)
+ {
+ // Columns of the block that actually fall inside the image.
+ int columns = Math.Min(4, width - left);
+
+ DecodeBlock(compressed[blockIndex], texels.Slice(top * width + left), columns, rows, width);
+
+ blockIndex++;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Decodes a single BC7 block and writes up to 4x4 texels into <paramref name="output"/>.
+ /// </summary>
+ /// <param name="block">Compressed block to decode</param>
+ /// <param name="output">Destination texels, starting at the block's top-left texel; one uint per texel</param>
+ /// <param name="w">Number of texel columns to write (the tile is indexed as 4 texels wide, so at most 4)</param>
+ /// <param name="h">Number of texel rows to write (the tile is indexed as 4 texels high, so at most 4)</param>
+ /// <param name="width">Distance between consecutive rows of <paramref name="output"/>, in texels</param>
+ private static void DecodeBlock(Block block, Span<uint> output, int w, int h, int width)
+ {
+ // The mode is the number of zero bits before the first set bit of the low byte.
+ // OR-ing in bit 8 acts as a sentinel, so an all-zero low byte yields 8.
+ int mode = BitOperations.TrailingZeroCount((byte)block.Low | 0x100);
+ if (mode == 8)
+ {
+ // Mode is invalid, the spec mandates that hardware fills the block with
+ // a transparent black color.
+ for (int ty = 0; ty < h; ty++)
+ {
+ int baseOffs = ty * width;
+
+ for (int tx = 0; tx < w; tx++)
+ {
+ int offs = baseOffs + tx;
+
+ output[offs] = 0;
+ }
+ }
+
+ return;
+ }
+
+ BC7ModeInfo modeInfo = BC67Tables.BC7ModeInfos[mode];
+
+ // Skip the mode prefix: 'mode' zero bits plus the terminating one bit.
+ // NOTE(review): Block.Decode is defined elsewhere; presumably it reads the requested number of
+ // bits at 'offset' and advances 'offset' - confirm against Utils/Block.cs.
+ int offset = mode + 1;
+ int partition = (int)block.Decode(ref offset, modeInfo.PartitionBitCount);
+ int rotation = (int)block.Decode(ref offset, modeInfo.RotationBitCount);
+ int indexMode = (int)block.Decode(ref offset, modeInfo.IndexModeBitCount);
+
+ Debug.Assert(partition < 64);
+ Debug.Assert(rotation < 4);
+ Debug.Assert(indexMode < 2);
+
+ // Two endpoints per subset.
+ int endPointCount = modeInfo.SubsetCount * 2;
+
+ Span<RgbaColor32> endPoints = stackalloc RgbaColor32[endPointCount];
+ Span<byte> pValues = stackalloc byte[modeInfo.PBits];
+
+ endPoints.Fill(new RgbaColor32(0, 0, 0, 255));
+
+ // Endpoint components are stored channel by channel: all reds, then all greens,
+ // then all blues, then (if the mode has alpha) all alphas.
+ for (int i = 0; i < endPointCount; i++)
+ {
+ endPoints[i].R = (int)block.Decode(ref offset, modeInfo.ColorDepth);
+ }
+
+ for (int i = 0; i < endPointCount; i++)
+ {
+ endPoints[i].G = (int)block.Decode(ref offset, modeInfo.ColorDepth);
+ }
+
+ for (int i = 0; i < endPointCount; i++)
+ {
+ endPoints[i].B = (int)block.Decode(ref offset, modeInfo.ColorDepth);
+ }
+
+ if (modeInfo.AlphaDepth != 0)
+ {
+ for (int i = 0; i < endPointCount; i++)
+ {
+ endPoints[i].A = (int)block.Decode(ref offset, modeInfo.AlphaDepth);
+ }
+ }
+
+ // The P-bits follow the endpoints, one bit each.
+ for (int i = 0; i < modeInfo.PBits; i++)
+ {
+ pValues[i] = (byte)block.Decode(ref offset, 1);
+ }
+
+ for (int i = 0; i < endPointCount; i++)
+ {
+ // -1 tells Unquantize that this mode has no P-bit.
+ int pBit = -1;
+
+ if (modeInfo.PBits != 0)
+ {
+ // Spread the available P-bits evenly over the endpoints, so a P-bit is either
+ // unique to an endpoint or shared by consecutive ones when there are fewer P-bits.
+ int pIndex = (i * modeInfo.PBits) / endPointCount;
+ pBit = pValues[pIndex];
+ }
+
+ Unquantize(ref endPoints[i], modeInfo.ColorDepth, modeInfo.AlphaDepth, pBit);
+ }
+
+ // Per-texel subset assignment and per-subset fix-up texel, selected by subset count and partition.
+ byte[] partitionTable = BC67Tables.PartitionTable[modeInfo.SubsetCount - 1][partition];
+ byte[] fixUpTable = BC67Tables.FixUpIndices[modeInfo.SubsetCount - 1][partition];
+
+ Span<byte> colorIndices = stackalloc byte[16];
+
+ for (int i = 0; i < 16; i++)
+ {
+ byte subset = partitionTable[i];
+ // The fix-up texel of each subset is stored with one bit less than the other texels.
+ int bitCount = i == fixUpTable[subset] ? modeInfo.ColorIndexBitCount - 1 : modeInfo.ColorIndexBitCount;
+
+ colorIndices[i] = (byte)block.Decode(ref offset, bitCount);
+ Debug.Assert(colorIndices[i] < 16);
+ }
+
+ Span<byte> alphaIndices = stackalloc byte[16];
+
+ if (modeInfo.AlphaIndexBitCount != 0)
+ {
+ for (int i = 0; i < 16; i++)
+ {
+ // For the second index set only texel 0 is stored with one bit less.
+ int bitCount = i != 0 ? modeInfo.AlphaIndexBitCount : modeInfo.AlphaIndexBitCount - 1;
+
+ alphaIndices[i] = (byte)block.Decode(ref offset, bitCount);
+ Debug.Assert(alphaIndices[i] < 16);
+ }
+ }
+
+ // Produce only the w x h texels that are inside the image; the tile itself is always indexed as 4 wide.
+ for (int ty = 0; ty < h; ty++)
+ {
+ int baseOffs = ty * width;
+
+ for (int tx = 0; tx < w; tx++)
+ {
+ int i = ty * 4 + tx;
+
+ RgbaColor32 color;
+
+ byte subset = partitionTable[i];
+
+ RgbaColor32 color1 = endPoints[subset * 2];
+ RgbaColor32 color2 = endPoints[subset * 2 + 1];
+
+ if (modeInfo.AlphaIndexBitCount != 0)
+ {
+ // Two index sets: indexMode selects which set (and bit count) is passed first.
+ // NOTE(review): presumably the first index/bit-count pair drives color and the second alpha -
+ // confirm against BC67Utils.Interpolate.
+ if (indexMode == 0)
+ {
+ color = BC67Utils.Interpolate(color1, color2, colorIndices[i], alphaIndices[i], modeInfo.ColorIndexBitCount, modeInfo.AlphaIndexBitCount);
+ }
+ else
+ {
+ color = BC67Utils.Interpolate(color1, color2, alphaIndices[i], colorIndices[i], modeInfo.AlphaIndexBitCount, modeInfo.ColorIndexBitCount);
+ }
+ }
+ else
+ {
+ // Single index set: the same index and bit count are used for both arguments.
+ color = BC67Utils.Interpolate(color1, color2, colorIndices[i], colorIndices[i], modeInfo.ColorIndexBitCount, modeInfo.ColorIndexBitCount);
+ }
+
+ if (rotation != 0)
+ {
+ // Rotation swaps the alpha channel with one of the color channels.
+ int a = color.A;
+
+ switch (rotation)
+ {
+ case 1: color.A = color.R; color.R = a; break;
+ case 2: color.A = color.G; color.G = a; break;
+ case 3: color.A = color.B; color.B = a; break;
+ }
+ }
+
+ RgbaColor8 color8 = color.GetColor8();
+
+ output[baseOffs + tx] = color8.ToUInt32();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Expands every channel of a decoded endpoint to 8 bits in place.
+ /// The alpha channel is forced to 255 when the mode stores no alpha (<paramref name="alphaDepth"/> is 0).
+ /// </summary>
+ /// <param name="color">Endpoint holding the raw quantized channels; overwritten with the expanded values</param>
+ /// <param name="colorDepth">Stored precision of the R, G and B channels, in bits</param>
+ /// <param name="alphaDepth">Stored precision of the alpha channel, in bits, or 0 if there is none</param>
+ /// <param name="pBit">P-bit value (0 or 1) for this endpoint, or a negative value if the mode has no P-bits</param>
+ private static void Unquantize(ref RgbaColor32 color, int colorDepth, int alphaDepth, int pBit)
+ {
+ // Opaque unless the mode actually carries alpha bits.
+ int expandedAlpha = 255;
+
+ if (alphaDepth != 0)
+ {
+ expandedAlpha = UnquantizeComponent(color.A, alphaDepth, pBit);
+ }
+
+ color.R = UnquantizeComponent(color.R, colorDepth, pBit);
+ color.G = UnquantizeComponent(color.G, colorDepth, pBit);
+ color.B = UnquantizeComponent(color.B, colorDepth, pBit);
+ color.A = expandedAlpha;
+ }
+
+ /// <summary>
+ /// Expands a quantized channel of <paramref name="bits"/> bits (optionally followed by a P-bit as
+ /// its new least significant bit) to 8 bits, replicating the most significant bits into the
+ /// freed low bits.
+ /// </summary>
+ /// <param name="component">Quantized channel value</param>
+ /// <param name="bits">Stored precision of the channel, in bits, not counting the P-bit</param>
+ /// <param name="pBit">P-bit value (0 or 1), or a negative value if there is no P-bit</param>
+ /// <returns>The expanded 8-bit channel value</returns>
+ private static int UnquantizeComponent(int component, int bits, int pBit)
+ {
+ int padding = 8 - bits;
+ int expanded = component << padding;
+
+ if (pBit < 0)
+ {
+ // No P-bit: fill the low bits with a copy of the top bits.
+ return expanded | (expanded >> bits);
+ }
+
+ Debug.Assert(pBit <= 1);
+
+ // With a P-bit the effective precision is bits + 1, so the replicated copy starts one bit
+ // lower and the P-bit itself lands directly below the stored bits.
+ int replicated = expanded | (expanded >> (bits + 1));
+
+ return replicated | (pBit << (padding - 1));
+ }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/BCnDecoder.cs b/src/Ryujinx.Graphics.Texture/BCnDecoder.cs
new file mode 100644
index 00000000..b21fa4d1
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/BCnDecoder.cs
@@ -0,0 +1,894 @@
+using Ryujinx.Common;
+using System;
+using System.Buffers.Binary;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Texture
+{
+ public static class BCnDecoder
+ {
+ private const int BlockWidth = 4;
+ private const int BlockHeight = 4;
+
+ /// <summary>
+ /// Decodes BC1 compressed texture data into an uncompressed buffer with 4 bytes per texel.
+ /// </summary>
+ /// <remarks>
+ /// Input blocks are 8 bytes each and are consumed sequentially in the order
+ /// level, layer, depth slice, block row, block column.
+ /// The output holds the levels back to back; inside a level every layer/slice is a tightly
+ /// packed image of (level width) x (level height) texels.
+ /// </remarks>
+ /// <param name="data">Compressed BC1 block data for all levels</param>
+ /// <param name="width">Width of the first level, in texels</param>
+ /// <param name="height">Height of the first level, in texels</param>
+ /// <param name="depth">Depth of the first level, in slices</param>
+ /// <param name="levels">Number of mipmap levels to decode</param>
+ /// <param name="layers">Number of array layers</param>
+ /// <returns>A newly allocated array with the decoded texels</returns>
+ public static byte[] DecodeBC1(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
+ {
+ int size = 0;
+
+ // Total output size: every level is halved per dimension (never below 1), 4 bytes per texel.
+ for (int l = 0; l < levels; l++)
+ {
+ size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
+ }
+
+ byte[] output = new byte[size];
+
+ // Scratch space for one decoded 4x4 tile (16 texels of 4 bytes).
+ Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
+
+ Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
+ Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+ // Viewed as 16-byte vectors, element r of the tile is its whole row r (4 texels).
+ Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
+
+ Span<Vector128<byte>> outputLine0 = default;
+ Span<Vector128<byte>> outputLine1 = default;
+ Span<Vector128<byte>> outputLine2 = default;
+ Span<Vector128<byte>> outputLine3 = default;
+
+ // Offset (in texels) of the image currently being written inside the output.
+ int imageBaseOOffs = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ // Level size in blocks.
+ int w = BitUtils.DivRoundUp(width, BlockWidth);
+ int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < h; y++)
+ {
+ int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+ // For a full-height block row, prepare vector views of the four output rows so that
+ // element x of each view is the 4-texel span covered by block column x.
+ if (copyHeight == 4)
+ {
+ outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
+ outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+ outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+ }
+
+ for (int x = 0; x < w; x++)
+ {
+ int baseX = x * BlockWidth;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
+
+ BC1DecodeTileRgb(tile, data);
+
+ // Both values are in 1..4 here and 1, 2 and 3 have low bits set,
+ // so the OR equals 4 only when the block is a complete 4x4 tile.
+ if ((copyWidth | copyHeight) == 4)
+ {
+ outputLine0[x] = tileAsVector128[0];
+ outputLine1[x] = tileAsVector128[1];
+ outputLine2[x] = tileAsVector128[2];
+ outputLine3[x] = tileAsVector128[3];
+ }
+ else
+ {
+ // Edge block: copy only the texels that are inside the image.
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+ for (int tY = 0; tY < copyHeight; tY++)
+ {
+ tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
+ }
+ }
+
+ // Advance to the next 8-byte block.
+ data = data.Slice(8);
+ }
+ }
+
+ imageBaseOOffs += width * height;
+ }
+ }
+
+ // Move on to the next (smaller) mipmap level.
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+ }
+
+ return output;
+ }
+
+ /// <summary>
+ /// Decodes BC2 compressed texture data into an uncompressed buffer with 4 bytes per texel.
+ /// </summary>
+ /// <remarks>
+ /// Input blocks are 16 bytes each: the first 8 bytes hold sixteen 4-bit alpha values and the
+ /// last 8 bytes hold the color data. Blocks are consumed sequentially in the order
+ /// level, layer, depth slice, block row, block column.
+ /// The output holds the levels back to back; inside a level every layer/slice is a tightly
+ /// packed image of (level width) x (level height) texels.
+ /// </remarks>
+ /// <param name="data">Compressed BC2 block data for all levels</param>
+ /// <param name="width">Width of the first level, in texels</param>
+ /// <param name="height">Height of the first level, in texels</param>
+ /// <param name="depth">Depth of the first level, in slices</param>
+ /// <param name="levels">Number of mipmap levels to decode</param>
+ /// <param name="layers">Number of array layers</param>
+ /// <returns>A newly allocated array with the decoded texels</returns>
+ public static byte[] DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
+ {
+ int size = 0;
+
+ // Total output size: every level is halved per dimension (never below 1), 4 bytes per texel.
+ for (int l = 0; l < levels; l++)
+ {
+ size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
+ }
+
+ byte[] output = new byte[size];
+
+ // Scratch space for one decoded 4x4 tile (16 texels of 4 bytes).
+ Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
+
+ Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
+ Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+ // Viewed as 16-byte vectors, element r of the tile is its whole row r (4 texels).
+ Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
+
+ Span<Vector128<byte>> outputLine0 = default;
+ Span<Vector128<byte>> outputLine1 = default;
+ Span<Vector128<byte>> outputLine2 = default;
+ Span<Vector128<byte>> outputLine3 = default;
+
+ // Offset (in texels) of the image currently being written inside the output.
+ int imageBaseOOffs = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ // Level size in blocks.
+ int w = BitUtils.DivRoundUp(width, BlockWidth);
+ int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < h; y++)
+ {
+ int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+ // For a full-height block row, prepare vector views of the four output rows so that
+ // element x of each view is the 4-texel span covered by block column x.
+ if (copyHeight == 4)
+ {
+ outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
+ outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+ outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+ }
+
+ for (int x = 0; x < w; x++)
+ {
+ int baseX = x * BlockWidth;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
+
+ // The color part lives in the second half of the 16-byte block.
+ BC23DecodeTileRgb(tile, data.Slice(8));
+
+ // The first half holds the explicit alpha, 4 bits per texel, lowest nibble first.
+ ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data);
+
+ // Write byte 3 of every texel: the nibble is duplicated into both halves of the
+ // byte (the byte cast discards everything above bit 7), expanding 4 bits to 8.
+ for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, block >>= 4)
+ {
+ tile[i] = (byte)((block & 0xf) | (block << 4));
+ }
+
+ // Both values are in 1..4 here and 1, 2 and 3 have low bits set,
+ // so the OR equals 4 only when the block is a complete 4x4 tile.
+ if ((copyWidth | copyHeight) == 4)
+ {
+ outputLine0[x] = tileAsVector128[0];
+ outputLine1[x] = tileAsVector128[1];
+ outputLine2[x] = tileAsVector128[2];
+ outputLine3[x] = tileAsVector128[3];
+ }
+ else
+ {
+ // Edge block: copy only the texels that are inside the image.
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+ for (int tY = 0; tY < copyHeight; tY++)
+ {
+ tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
+ }
+ }
+
+ // Advance to the next 16-byte block.
+ data = data.Slice(16);
+ }
+ }
+
+ imageBaseOOffs += width * height;
+ }
+ }
+
+ // Move on to the next (smaller) mipmap level.
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+ }
+
+ return output;
+ }
+
+ /// <summary>
+ /// Decodes BC3 compressed texture data into an uncompressed buffer with 4 bytes per texel.
+ /// </summary>
+ /// <remarks>
+ /// Input blocks are 16 bytes each: the first 8 bytes hold the alpha part (two endpoint bytes
+ /// followed by 48 bits of per-texel data) and the last 8 bytes hold the color data.
+ /// Blocks are consumed sequentially in the order level, layer, depth slice, block row, block column.
+ /// The output holds the levels back to back; inside a level every layer/slice is a tightly
+ /// packed image of (level width) x (level height) texels.
+ /// </remarks>
+ /// <param name="data">Compressed BC3 block data for all levels</param>
+ /// <param name="width">Width of the first level, in texels</param>
+ /// <param name="height">Height of the first level, in texels</param>
+ /// <param name="depth">Depth of the first level, in slices</param>
+ /// <param name="levels">Number of mipmap levels to decode</param>
+ /// <param name="layers">Number of array layers</param>
+ /// <returns>A newly allocated array with the decoded texels</returns>
+ public static byte[] DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
+ {
+ int size = 0;
+
+ // Total output size: every level is halved per dimension (never below 1), 4 bytes per texel.
+ for (int l = 0; l < levels; l++)
+ {
+ size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
+ }
+
+ byte[] output = new byte[size];
+
+ // Scratch space for one decoded 4x4 tile (16 texels of 4 bytes) and the 8-entry alpha palette.
+ Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
+ Span<byte> rPal = stackalloc byte[8];
+
+ Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
+ Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+ // Viewed as 16-byte vectors, element r of the tile is its whole row r (4 texels).
+ Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
+
+ Span<Vector128<byte>> outputLine0 = default;
+ Span<Vector128<byte>> outputLine1 = default;
+ Span<Vector128<byte>> outputLine2 = default;
+ Span<Vector128<byte>> outputLine3 = default;
+
+ // Offset (in texels) of the image currently being written inside the output.
+ int imageBaseOOffs = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ // Level size in blocks.
+ int w = BitUtils.DivRoundUp(width, BlockWidth);
+ int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < h; y++)
+ {
+ int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+ // For a full-height block row, prepare vector views of the four output rows so that
+ // element x of each view is the 4-texel span covered by block column x.
+ if (copyHeight == 4)
+ {
+ outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
+ outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+ outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+ }
+
+ for (int x = 0; x < w; x++)
+ {
+ int baseX = x * BlockWidth;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
+
+ // The color part lives in the second half of the 16-byte block.
+ BC23DecodeTileRgb(tile, data.Slice(8));
+
+ // The first half is the alpha part: two endpoint bytes, then the per-texel bits.
+ ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data);
+
+ rPal[0] = (byte)block;
+ rPal[1] = (byte)(block >> 8);
+
+ // NOTE(review): presumably fills rPal[2..7] from the two endpoints, after which the
+ // remaining 48 bits select a palette entry for each texel's alpha - confirm in the helpers.
+ BCnLerpAlphaUnorm(rPal);
+ BCnDecodeTileAlphaRgba(tile, rPal, block >> 16);
+
+ // Both values are in 1..4 here and 1, 2 and 3 have low bits set,
+ // so the OR equals 4 only when the block is a complete 4x4 tile.
+ if ((copyWidth | copyHeight) == 4)
+ {
+ outputLine0[x] = tileAsVector128[0];
+ outputLine1[x] = tileAsVector128[1];
+ outputLine2[x] = tileAsVector128[2];
+ outputLine3[x] = tileAsVector128[3];
+ }
+ else
+ {
+ // Edge block: copy only the texels that are inside the image.
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+ for (int tY = 0; tY < copyHeight; tY++)
+ {
+ tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
+ }
+ }
+
+ // Advance to the next 16-byte block.
+ data = data.Slice(16);
+ }
+ }
+
+ imageBaseOOffs += width * height;
+ }
+ }
+
+ // Move on to the next (smaller) mipmap level.
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+ }
+
+ return output;
+ }
+
+ /// <summary>
+ /// Decodes BC4 compressed texture data into an uncompressed buffer with 1 byte per texel.
+ /// </summary>
+ /// <remarks>
+ /// Input blocks are 8 bytes each (two endpoint bytes followed by 48 bits of per-texel data)
+ /// and are consumed sequentially in the order level, layer, depth slice, block row, block column.
+ /// The output holds the levels back to back; every output row is padded so that its length
+ /// is a multiple of 4 bytes.
+ /// </remarks>
+ /// <param name="data">Compressed BC4 block data for all levels</param>
+ /// <param name="width">Width of the first level, in texels</param>
+ /// <param name="height">Height of the first level, in texels</param>
+ /// <param name="depth">Depth of the first level, in slices</param>
+ /// <param name="levels">Number of mipmap levels to decode</param>
+ /// <param name="layers">Number of array layers</param>
+ /// <param name="signed">True to build the palette with the signed-normalized helper, false for the unsigned one</param>
+ /// <returns>A newly allocated array with the decoded texels</returns>
+ public static byte[] DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
+ {
+ int size = 0;
+
+ // Total output size: 1 byte per texel, with each level's row length rounded up to a multiple of 4.
+ for (int l = 0; l < levels; l++)
+ {
+ size += BitUtils.AlignUp(Math.Max(1, width >> l), 4) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
+ }
+
+ // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
+ int alignedWidth = BitUtils.AlignUp(width, 4);
+
+ byte[] output = new byte[size];
+ Span<byte> outputSpan = new Span<byte>(output);
+
+ // Each 8-byte block is read as one little chunk of 64 bits.
+ ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
+
+ // Scratch space for one decoded 4x4 tile (16 single-byte texels) and the 8-entry palette.
+ Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight];
+ Span<byte> rPal = stackalloc byte[8];
+
+ // Viewed as uints, element r of the tile is its whole row r (4 texels).
+ Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
+
+ Span<uint> outputLine0 = default;
+ Span<uint> outputLine1 = default;
+ Span<uint> outputLine2 = default;
+ Span<uint> outputLine3 = default;
+
+ // Offset (in bytes) of the image currently being written inside the output.
+ int imageBaseOOffs = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ // Level size in blocks.
+ int w = BitUtils.DivRoundUp(width, BlockWidth);
+ int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < h; y++)
+ {
+ int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth;
+
+ // For a full-height block row, prepare uint views of the four output rows so that
+ // element x of each view is the 4-texel span covered by block column x.
+ if (copyHeight == 4)
+ {
+ outputLine0 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth));
+ outputLine2 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth * 2));
+ outputLine3 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth * 3));
+ }
+
+ for (int x = 0; x < w; x++)
+ {
+ int baseX = x * BlockWidth;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
+
+ ulong block = data64[0];
+
+ // The two lowest bytes of the block are the palette endpoints.
+ rPal[0] = (byte)block;
+ rPal[1] = (byte)(block >> 8);
+
+ // NOTE(review): presumably fills rPal[2..7] from the two endpoints - confirm in the helpers.
+ if (signed)
+ {
+ BCnLerpAlphaSnorm(rPal);
+ }
+ else
+ {
+ BCnLerpAlphaUnorm(rPal);
+ }
+
+ // The remaining 48 bits carry the per-texel data.
+ BCnDecodeTileAlpha(tile, rPal, block >> 16);
+
+ // Both values are in 1..4 here and 1, 2 and 3 have low bits set,
+ // so the OR equals 4 only when the block is a complete 4x4 tile.
+ if ((copyWidth | copyHeight) == 4)
+ {
+ outputLine0[x] = tileAsUint[0];
+ outputLine1[x] = tileAsUint[1];
+ outputLine2[x] = tileAsUint[2];
+ outputLine3[x] = tileAsUint[3];
+ }
+ else
+ {
+ // Edge block: copy only the texels that are inside the image.
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+ for (int tY = 0; tY < copyHeight; tY++)
+ {
+ tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + alignedWidth * tY, copyWidth));
+ }
+ }
+
+ // Advance to the next 8-byte block.
+ data64 = data64.Slice(1);
+ }
+ }
+
+ imageBaseOOffs += alignedWidth * height;
+ }
+ }
+
+ // Move on to the next (smaller) mipmap level and recompute its padded row length.
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+
+ alignedWidth = BitUtils.AlignUp(width, 4);
+ }
+
+ return output;
+ }
+
+ public static byte[] DecodeBC5(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed) // Decodes BC5 blocks for every level, layer and depth slice into two bytes per texel; `signed` selects the SNORM palette interpolation.
+ {
+ int size = 0;
+ // Sum the decoded size of all levels: row width padded to an even texel count, 2 bytes per texel.
+ for (int l = 0; l < levels; l++)
+ {
+ size += BitUtils.AlignUp(Math.Max(1, width >> l), 2) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 2;
+ }
+
+ // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
+ int alignedWidth = BitUtils.AlignUp(width, 2);
+
+ byte[] output = new byte[size];
+
+ ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
+ // Per-block scratch: one decoded tile and one 8-entry palette for each of the two channels.
+ Span<byte> rTile = stackalloc byte[BlockWidth * BlockHeight * 2];
+ Span<byte> gTile = stackalloc byte[BlockWidth * BlockHeight * 2];
+ Span<byte> rPal = stackalloc byte[8];
+ Span<byte> gPal = stackalloc byte[8];
+ // The output is addressed in 16-bit texels from here on; all output offsets below are in ushort units.
+ Span<ushort> outputAsUshort = MemoryMarshal.Cast<byte, ushort>(output);
+ // uint views expose one 4-texel tile row per element for the fast path.
+ Span<uint> rTileAsUint = MemoryMarshal.Cast<byte, uint>(rTile);
+ Span<uint> gTileAsUint = MemoryMarshal.Cast<byte, uint>(gTile);
+ // Views of the four output rows covered by the current block row; only valid while copyHeight == 4.
+ Span<ulong> outputLine0 = default;
+ Span<ulong> outputLine1 = default;
+ Span<ulong> outputLine2 = default;
+ Span<ulong> outputLine3 = default;
+
+ int imageBaseOOffs = 0; // Start of the current 2D slice in the output, in texels.
+
+ for (int l = 0; l < levels; l++)
+ {
+ int w = BitUtils.DivRoundUp(width, BlockWidth); // Blocks per row at this level.
+ int h = BitUtils.DivRoundUp(height, BlockHeight); // Block rows at this level.
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < h; y++)
+ {
+ int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY); // Rows of this block row that lie inside the image.
+ int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth;
+ // Only a full-height block row can use the fast path, so the row views are refreshed only then.
+ if (copyHeight == 4)
+ {
+ outputLine0 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth));
+ outputLine2 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth * 2));
+ outputLine3 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth * 3));
+ }
+
+ for (int x = 0; x < w; x++)
+ {
+ int baseX = x * BlockWidth;
+ int copyWidth = Math.Min(BlockWidth, width - baseX); // Columns of this block that lie inside the image.
+ // Each block is two 64-bit words: the first feeds the low output byte, the second the high output byte.
+ ulong blockL = data64[0];
+ ulong blockH = data64[1];
+ // The low two bytes of each word are the two palette endpoints.
+ rPal[0] = (byte)blockL;
+ rPal[1] = (byte)(blockL >> 8);
+ gPal[0] = (byte)blockH;
+ gPal[1] = (byte)(blockH >> 8);
+ // Fill palette entries 2..7 from the endpoints.
+ if (signed)
+ {
+ BCnLerpAlphaSnorm(rPal);
+ BCnLerpAlphaSnorm(gPal);
+ }
+ else
+ {
+ BCnLerpAlphaUnorm(rPal);
+ BCnLerpAlphaUnorm(gPal);
+ }
+ // The remaining 48 bits of each word are the 3-bit per-texel palette indices.
+ BCnDecodeTileAlpha(rTile, rPal, blockL >> 16);
+ BCnDecodeTileAlpha(gTile, gPal, blockH >> 16);
+ // Both values are in 1..4, so the OR equals 4 only when the block is a full 4x4.
+ if ((copyWidth | copyHeight) == 4)
+ {
+ outputLine0[x] = InterleaveBytes(rTileAsUint[0], gTileAsUint[0]); // One ulong = four interleaved two-byte texels.
+ outputLine1[x] = InterleaveBytes(rTileAsUint[1], gTileAsUint[1]);
+ outputLine2[x] = InterleaveBytes(rTileAsUint[2], gTileAsUint[2]);
+ outputLine3[x] = InterleaveBytes(rTileAsUint[3], gTileAsUint[3]);
+ }
+ else
+ {
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
+ // Partial block on the right or bottom edge: copy texel by texel, clipped to the image.
+ for (int tY = 0; tY < copyHeight; tY++)
+ {
+ int line = pixelBaseOOffs + alignedWidth * tY;
+
+ for (int tX = 0; tX < copyWidth; tX++)
+ {
+ int texel = tY * BlockWidth + tX;
+
+ outputAsUshort[line + tX] = (ushort)(rTile[texel] | (gTile[texel] << 8));
+ }
+ }
+ }
+
+ data64 = data64.Slice(2); // Advance to the next 16-byte block.
+ }
+ }
+
+ imageBaseOOffs += alignedWidth * height;
+ }
+ }
+ // Move to the next mip level: halve each dimension, never going below 1.
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+
+ alignedWidth = BitUtils.AlignUp(width, 2);
+ }
+
+ return output;
+ }
+
+ /// <summary>
+ /// Decodes BC6 compressed texture data into a tightly packed buffer holding 8 bytes per texel.
+ /// </summary>
+ /// <param name="data">Compressed blocks, ordered by level, then layer, then depth slice</param>
+ /// <param name="width">Width of the base level, in texels</param>
+ /// <param name="height">Height of the base level, in texels</param>
+ /// <param name="depth">Depth of the base level, in slices</param>
+ /// <param name="levels">Number of mip levels to decode</param>
+ /// <param name="layers">Number of array layers per level</param>
+ /// <param name="signed">Signedness flag, forwarded unchanged to the block decoder</param>
+ /// <returns>The decoded slices, in the same order as the input</returns>
+ public static byte[] DecodeBC6(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
+ {
+     const int BytesPerBlock = 16;
+     const int BytesPerTexel = 8;
+
+     int totalSize = 0;
+
+     for (int level = 0; level < levels; level++)
+     {
+         int levelWidth = Math.Max(1, width >> level);
+         int levelHeight = Math.Max(1, height >> level);
+         int levelDepth = Math.Max(1, depth >> level);
+
+         totalSize += levelWidth * levelHeight * levelDepth * layers * BytesPerTexel;
+     }
+
+     byte[] output = new byte[totalSize];
+     Span<byte> decoded = output;
+
+     int srcOffset = 0;
+     int dstOffset = 0;
+
+     for (int level = 0; level < levels; level++)
+     {
+         // Every slice of a level has the same compressed and decoded size.
+         int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
+         int blocksY = BitUtils.DivRoundUp(height, BlockHeight);
+         int srcSliceSize = blocksX * blocksY * BytesPerBlock;
+         int dstSliceSize = width * height * BytesPerTexel;
+
+         for (int layer = 0; layer < layers; layer++)
+         {
+             for (int slice = 0; slice < depth; slice++)
+             {
+                 BC6Decoder.Decode(decoded.Slice(dstOffset), data.Slice(srcOffset), width, height, signed);
+
+                 srcOffset += srcSliceSize;
+                 dstOffset += dstSliceSize;
+             }
+         }
+
+         // Next mip level: halve each dimension, clamped to 1.
+         width = Math.Max(1, width >> 1);
+         height = Math.Max(1, height >> 1);
+         depth = Math.Max(1, depth >> 1);
+     }
+
+     return output;
+ }
+
+ /// <summary>
+ /// Decodes BC7 compressed texture data into a tightly packed buffer holding 4 bytes per texel.
+ /// </summary>
+ /// <param name="data">Compressed blocks, ordered by level, then layer, then depth slice</param>
+ /// <param name="width">Width of the base level, in texels</param>
+ /// <param name="height">Height of the base level, in texels</param>
+ /// <param name="depth">Depth of the base level, in slices</param>
+ /// <param name="levels">Number of mip levels to decode</param>
+ /// <param name="layers">Number of array layers per level</param>
+ /// <returns>The decoded slices, in the same order as the input</returns>
+ public static byte[] DecodeBC7(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
+ {
+     const int BytesPerBlock = 16;
+     const int BytesPerTexel = 4;
+
+     int totalSize = 0;
+
+     for (int level = 0; level < levels; level++)
+     {
+         int levelWidth = Math.Max(1, width >> level);
+         int levelHeight = Math.Max(1, height >> level);
+         int levelDepth = Math.Max(1, depth >> level);
+
+         totalSize += levelWidth * levelHeight * levelDepth * layers * BytesPerTexel;
+     }
+
+     byte[] output = new byte[totalSize];
+     Span<byte> decoded = output;
+
+     int srcOffset = 0;
+     int dstOffset = 0;
+
+     for (int level = 0; level < levels; level++)
+     {
+         // Every slice of a level has the same compressed and decoded size.
+         int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
+         int blocksY = BitUtils.DivRoundUp(height, BlockHeight);
+         int srcSliceSize = blocksX * blocksY * BytesPerBlock;
+         int dstSliceSize = width * height * BytesPerTexel;
+
+         for (int layer = 0; layer < layers; layer++)
+         {
+             for (int slice = 0; slice < depth; slice++)
+             {
+                 BC7Decoder.Decode(decoded.Slice(dstOffset), data.Slice(srcOffset), width, height);
+
+                 srcOffset += srcSliceSize;
+                 dstOffset += dstSliceSize;
+             }
+         }
+
+         // Next mip level: halve each dimension, clamped to 1.
+         width = Math.Max(1, width >> 1);
+         height = Math.Max(1, height >> 1);
+         depth = Math.Max(1, depth >> 1);
+     }
+
+     return output;
+ }
+
+ /// <summary>
+ /// Interleaves the bytes of two 32-bit values: bytes of <paramref name="left"/> land on the even
+ /// byte positions of the result and bytes of <paramref name="right"/> on the odd ones.
+ /// </summary>
+ private static ulong InterleaveBytes(uint left, uint right)
+ {
+     ulong evenBytes = InterleaveBytesWithZeros(left);
+     ulong oddBytes = InterleaveBytesWithZeros(right) << 8;
+
+     return evenBytes | oddBytes;
+ }
+
+ /// <summary>
+ /// Spreads the four bytes of <paramref name="value"/> over the even byte positions of a 64-bit
+ /// value, leaving a zero byte after each one.
+ /// </summary>
+ private static ulong InterleaveBytesWithZeros(uint value)
+ {
+     const ulong HalfWordLanes = 0x0000ffff0000ffffUL;
+     const ulong ByteLanes = 0x00ff00ff00ff00ffUL;
+
+     ulong spread = value;
+
+     // Move the upper 16 bits up to bit 32. The overlapping bits are removed by the mask,
+     // so OR gives the same result as XOR here.
+     spread |= spread << 16;
+     spread &= HalfWordLanes;
+
+     // Move the upper byte of each 16-bit lane up by 8; again the overlap is masked away.
+     spread |= spread << 8;
+     spread &= ByteLanes;
+
+     return spread;
+ }
+
+ /// <summary>
+ /// Fills entries 2..7 of an 8-entry unsigned palette from the two endpoints stored in entries 0 and 1.
+ /// </summary>
+ /// <param name="alpha">Palette; entries 0 and 1 are read, entries 2 to 7 are written</param>
+ private static void BCnLerpAlphaUnorm(Span<byte> alpha)
+ {
+     int first = alpha[0];
+     int second = alpha[1];
+
+     if (first > second)
+     {
+         // Six interpolated values in steps of 1/7.
+         for (int step = 1; step <= 6; step++)
+         {
+             alpha[step + 1] = (byte)(((7 - step) * first + step * second) / 7);
+         }
+     }
+     else
+     {
+         // Four interpolated values in steps of 1/5, followed by the two range extremes.
+         for (int step = 1; step <= 4; step++)
+         {
+             alpha[step + 1] = (byte)(((5 - step) * first + step * second) / 5);
+         }
+
+         alpha[6] = 0;
+         alpha[7] = 0xff;
+     }
+ }
+
+ /// <summary>
+ /// Fills entries 2..7 of an 8-entry signed palette from the two endpoints stored in entries 0 and 1.
+ /// The bytes are interpreted as two's complement values.
+ /// </summary>
+ /// <param name="alpha">Palette; entries 0 and 1 are read, entries 2 to 7 are written</param>
+ private static void BCnLerpAlphaSnorm(Span<byte> alpha)
+ {
+     int first = (sbyte)alpha[0];
+     int second = (sbyte)alpha[1];
+
+     if (first > second)
+     {
+         // Six interpolated values in steps of 1/7; division truncates towards zero.
+         for (int step = 1; step <= 6; step++)
+         {
+             alpha[step + 1] = (byte)(((7 - step) * first + step * second) / 7);
+         }
+     }
+     else
+     {
+         // Four interpolated values in steps of 1/5, followed by the signed range extremes.
+         for (int step = 1; step <= 4; step++)
+         {
+             alpha[step + 1] = (byte)(((5 - step) * first + step * second) / 5);
+         }
+
+         alpha[6] = 0x80;
+         alpha[7] = 0x7f;
+     }
+ }
+
+ private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI) // Expands sixteen 3-bit palette indices packed in rI into sixteen bytes of output, looked up in rPal.
+ {
+ if (Avx2.IsSupported)
+ {
+ Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);
+ // Per-lane shift amounts for four consecutive 3-bit indices, and the 3-bit mask.
+ Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
+ Vector128<uint> masks = Vector128.Create(7u);
+
+ Vector128<byte> vClut;
+
+ fixed (byte* pRPal = rPal)
+ {
+ vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte(); // Reads 8 bytes from rPal as one scalar.
+ }
+ // indices0 covers texels 0..7 (bits 0..23); indices1 starts at bit 24 and covers texels 8..15.
+ Vector128<uint> indices0 = Vector128.Create((uint)rI);
+ Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24));
+ Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts); // Texels 0..3.
+ Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts); // Texels 8..11.
+ Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12); // Texels 4..7 (four indices further on).
+ Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12); // Texels 12..15.
+ indices00 = Sse2.And(indices00, masks);
+ indices10 = Sse2.And(indices10, masks);
+ indices01 = Sse2.And(indices01, masks);
+ indices11 = Sse2.And(indices11, masks);
+ // Narrow 32 -> 16 -> 8 bits so the sixteen indices end up as sixteen bytes in texel order.
+ Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
+ Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());
+
+ Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16());
+ // Byte shuffle acts as the palette lookup; every index is at most 7, so only the 8 loaded bytes are selected.
+ outputAsVector128[0] = Ssse3.Shuffle(vClut, indices);
+ }
+ else
+ {
+ for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3) // Scalar fallback: one 3-bit index per texel, lowest bits first.
+ {
+ output[i] = rPal[(int)(rI & 7)];
+ }
+ }
+ }
+
+ private unsafe static void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI) // Writes palette values selected by sixteen 3-bit indices in rI into byte 3 of each 4-byte texel of output.
+ {
+ if (Avx2.IsSupported)
+ {
+ Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
+ // Per-lane shift amounts for eight consecutive 3-bit indices.
+ Vector256<uint> shifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u);
+
+ Vector128<uint> vClut128;
+
+ fixed (byte* pRPal = rPal)
+ {
+ vClut128 = Sse2.LoadScalarVector128((ulong*)pRPal).AsUInt32(); // Reads 8 bytes from rPal as one scalar.
+ }
+ // Widen each palette byte to its own 32-bit lane, then move it into the top byte of the lane.
+ Vector256<uint> vClut = Avx2.ConvertToVector256Int32(vClut128.AsByte()).AsUInt32();
+ vClut = Avx2.ShiftLeftLogical(vClut, 24);
+ // indices0 covers texels 0..7 (bits 0..23); indices1 starts at bit 24 and covers texels 8..15.
+ Vector256<uint> indices0 = Vector256.Create((uint)rI);
+ Vector256<uint> indices1 = Vector256.Create((uint)(rI >> 24));
+ // No masking is applied after the shift. NOTE(review): this relies on the permute using only the low 3 bits of each index, as documented for VPERMD.
+ indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
+ indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
+ // The result is OR-ed into the existing texels, so this path is only correct if their top byte is already zero.
+ outputAsVector256[0] = Avx2.Or(outputAsVector256[0], Avx2.PermuteVar8x32(vClut, indices0));
+ outputAsVector256[1] = Avx2.Or(outputAsVector256[1], Avx2.PermuteVar8x32(vClut, indices1));
+ }
+ else
+ {
+ for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, rI >>= 3) // Scalar fallback: overwrites (rather than ORs) byte 3 of each texel.
+ {
+ output[i] = rPal[(int)(rI & 7)];
+ }
+ }
+ }
+
+ /// <summary>
+ /// Decodes one BC1 block into a tile of 32-bit texels. The two endpoint colors are always opaque;
+ /// the alpha of the two derived colors is decided by the BC1 lerp helpers.
+ /// </summary>
+ /// <param name="output">Destination tile</param>
+ /// <param name="input">Block data: two RGB565 endpoints followed by the texel indices</param>
+ private unsafe static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
+ {
+     const uint OpaqueAlpha = 0xff000000;
+
+     uint endpoints = BinaryPrimitives.ReadUInt32LittleEndian(input);
+     uint packed0 = endpoints & 0xffff;
+     uint packed1 = endpoints >> 16;
+
+     uint color0 = ConvertRgb565ToRgb888(packed0) | OpaqueAlpha;
+     uint color1 = ConvertRgb565ToRgb888(packed1) | OpaqueAlpha;
+
+     Span<uint> palette = stackalloc uint[4];
+
+     palette[0] = color0;
+     palette[1] = color1;
+     palette[2] = BC1LerpRgb2(color0, color1, packed0, packed1);
+     palette[3] = BC1LerpRgb3(color0, color1, packed0, packed1);
+
+     BCnDecodeTileRgb(palette, output, input);
+ }
+
+ /// <summary>
+ /// Decodes the color part of a BC2/BC3 block into a tile of 32-bit texels whose top (alpha) byte is left zero.
+ /// </summary>
+ /// <param name="output">Destination tile</param>
+ /// <param name="input">Color block data: two RGB565 endpoints followed by the texel indices</param>
+ private unsafe static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
+ {
+     uint endpoints = BinaryPrimitives.ReadUInt32LittleEndian(input);
+
+     uint color0 = ConvertRgb565ToRgb888(endpoints & 0xffff);
+     uint color1 = ConvertRgb565ToRgb888(endpoints >> 16);
+
+     Span<uint> palette = stackalloc uint[4];
+
+     palette[0] = color0;
+     palette[1] = color1;
+     palette[2] = BC23LerpRgb2(color0, color1);
+     palette[3] = BC23LerpRgb3(color0, color1);
+
+     BCnDecodeTileRgb(palette, output, input);
+ }
+
+ private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input) // Writes sixteen 32-bit texels to output by looking up the 2-bit indices stored at input[4..7] in the 4-entry clut.
+ {
+ if (Avx2.IsSupported)
+ {
+ Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
+ // Per-lane shift amounts: shifts0 selects texels 0..7, shifts1 selects texels 8..15.
+ Vector256<uint> shifts0 = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
+ Vector256<uint> shifts1 = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u);
+ Vector256<uint> masks = Vector256.Create(3u);
+
+ Vector256<uint> vClut;
+
+ fixed (uint* pClut = &clut[0])
+ {
+ vClut = Sse2.LoadVector128(pClut).ToVector256Unsafe(); // Only the low four lanes hold palette entries; indices are masked to 0..3 below, so the upper lanes are never selected.
+ }
+
+ Vector256<uint> indices0;
+
+ fixed (byte* pInput = input)
+ {
+ indices0 = Avx2.BroadcastScalarToVector256((uint*)(pInput + 4)); // The 32 index bits follow the 4 bytes of endpoints.
+ }
+
+ Vector256<uint> indices1 = indices0;
+ // Isolate one 2-bit index per lane.
+ indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts0);
+ indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts1);
+ indices0 = Avx2.And(indices0, masks);
+ indices1 = Avx2.And(indices1, masks);
+ // The lane permute acts as the palette lookup.
+ outputAsVector256[0] = Avx2.PermuteVar8x32(vClut, indices0);
+ outputAsVector256[1] = Avx2.PermuteVar8x32(vClut, indices1);
+ }
+ else
+ {
+ Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+ uint indices = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4));
+ // Scalar fallback: one 2-bit index per texel, lowest bits first.
+ for (int i = 0; i < BlockWidth * BlockHeight; i++, indices >>= 2)
+ {
+ outputAsUint[i] = clut[(int)(indices & 3)];
+ }
+ }
+ }
+
+ /// <summary>
+ /// Computes BC1 palette entry 2, always with an opaque alpha byte.
+ /// </summary>
+ /// <param name="color0">First endpoint, expanded to 8 bits per channel</param>
+ /// <param name="color1">Second endpoint, expanded to 8 bits per channel</param>
+ /// <param name="c0">First endpoint as stored in the block</param>
+ /// <param name="c1">Second endpoint as stored in the block</param>
+ private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1)
+ {
+     const uint OpaqueAlpha = 0xff000000;
+
+     uint blended;
+
+     if (c0 > c1)
+     {
+         // Four-color mode: 2/3 of color0 plus 1/3 of color1.
+         blended = BC23LerpRgb2(color0, color1);
+     }
+     else
+     {
+         // Three-color mode: per-byte average of both endpoints, computed as
+         // (a & b) + ((a ^ b) >> 1) so no carry crosses a byte boundary.
+         uint sharedBits = color0 & color1;
+         uint halfOfDifference = ((color0 ^ color1) >> 1) & 0x7f7f7f;
+
+         blended = halfOfDifference + sharedBits;
+     }
+
+     return blended | OpaqueAlpha;
+ }
+
+ /// <summary>
+ /// Blends two colors per channel as (2 * color0 + color1) / 3. Only the low three bytes are
+ /// processed; the top byte of the result is zero.
+ /// </summary>
+ private static uint BC23LerpRgb2(uint color0, uint color1)
+ {
+     uint blended = 0;
+
+     // One iteration per 8-bit channel stored in bits 0..7, 8..15 and 16..23.
+     for (int shift = 0; shift <= 16; shift += 8)
+     {
+         uint channel0 = (color0 >> shift) & 0xff;
+         uint channel1 = (color1 >> shift) & 0xff;
+
+         blended |= ((2 * channel0 + channel1) / 3) << shift;
+     }
+
+     return blended;
+ }
+
+ /// <summary>
+ /// Computes BC1 palette entry 3: an opaque blend in four-color mode (c0 &gt; c1), otherwise zero
+ /// in every byte, including alpha.
+ /// </summary>
+ /// <param name="color0">First endpoint, expanded to 8 bits per channel</param>
+ /// <param name="color1">Second endpoint, expanded to 8 bits per channel</param>
+ /// <param name="c0">First endpoint as stored in the block</param>
+ /// <param name="c1">Second endpoint as stored in the block</param>
+ private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1)
+ {
+     if (c0 <= c1)
+     {
+         return 0u;
+     }
+
+     return BC23LerpRgb3(color0, color1) | 0xff000000;
+ }
+
+ /// <summary>
+ /// Blends two colors per channel as (color0 + 2 * color1) / 3. Only the low three bytes are
+ /// processed; the top byte of the result is zero.
+ /// </summary>
+ private static uint BC23LerpRgb3(uint color0, uint color1)
+ {
+     uint blended = 0;
+
+     // One iteration per 8-bit channel stored in bits 0..7, 8..15 and 16..23.
+     for (int shift = 0; shift <= 16; shift += 8)
+     {
+         uint channel0 = (color0 >> shift) & 0xff;
+         uint channel1 = (color1 >> shift) & 0xff;
+
+         blended |= ((2 * channel1 + channel0) / 3) << shift;
+     }
+
+     return blended;
+ }
+
+ /// <summary>
+ /// Expands a packed 5:6:5 color to 8 bits per channel by replicating the high bits of each field
+ /// into the low bits. The field held in bits 11..15 of the input ends up in the lowest output byte,
+ /// the one in bits 5..10 in the second byte and the one in bits 0..4 in the third byte.
+ /// </summary>
+ private static uint ConvertRgb565ToRgb888(uint value)
+ {
+     uint field5High = (value >> 11) & 0x1f;
+     uint field6 = (value >> 5) & 0x3f;
+     uint field5Low = value & 0x1f;
+
+     uint byte0 = (field5High << 3) | (field5High >> 2);
+     uint byte1 = (field6 << 2) | (field6 >> 4);
+     uint byte2 = (field5Low << 3) | (field5Low >> 2);
+
+     return byte0 | (byte1 << 8) | (byte2 << 16);
+ }
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/BCnEncoder.cs b/src/Ryujinx.Graphics.Texture/BCnEncoder.cs
new file mode 100644
index 00000000..02b79c1b
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/BCnEncoder.cs
@@ -0,0 +1,60 @@
+using Ryujinx.Common;
+using Ryujinx.Graphics.Texture.Encoders;
+using System;
+
+namespace Ryujinx.Graphics.Texture
+{
+ /// <summary>
+ /// Block compression encoders.
+ /// </summary>
+ public static class BCnEncoder
+ {
+     private const int BlockWidth = 4;
+     private const int BlockHeight = 4;
+
+     /// <summary>
+     /// Encodes every level, layer and depth slice of a 4-bytes-per-texel image to BC7.
+     /// </summary>
+     /// <param name="data">Uncompressed input, 4 bytes per texel, ordered by level, then layer, then depth slice</param>
+     /// <param name="width">Width of the base level, in texels</param>
+     /// <param name="height">Height of the base level, in texels</param>
+     /// <param name="depth">Depth of the base level, in slices</param>
+     /// <param name="levels">Number of mip levels to encode</param>
+     /// <param name="layers">Number of array layers per level</param>
+     /// <returns>The encoded blocks (16 bytes per 4x4 block), in the same order as the input</returns>
+     public static byte[] EncodeBC7(byte[] data, int width, int height, int depth, int levels, int layers)
+     {
+         int size = 0;
+
+         for (int l = 0; l < levels; l++)
+         {
+             int w = BitUtils.DivRoundUp(Math.Max(1, width >> l), BlockWidth);
+             int h = BitUtils.DivRoundUp(Math.Max(1, height >> l), BlockHeight);
+
+             size += w * h * 16 * Math.Max(1, depth >> l) * layers;
+         }
+
+         byte[] output = new byte[size];
+
+         int imageBaseIOffs = 0;
+         int imageBaseOOffs = 0;
+
+         for (int l = 0; l < levels; l++)
+         {
+             int w = BitUtils.DivRoundUp(width, BlockWidth);
+             int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+             for (int l2 = 0; l2 < layers; l2++)
+             {
+                 for (int z = 0; z < depth; z++)
+                 {
+                     BC7Encoder.Encode(
+                         output.AsMemory().Slice(imageBaseOOffs),
+                         data.AsMemory().Slice(imageBaseIOffs),
+                         width,
+                         height,
+                         EncodeMode.Fast | EncodeMode.Multithreaded);
+
+                     // Each slice consumes width * height texels of input and produces w * h blocks.
+                     imageBaseIOffs += width * height * 4;
+                     imageBaseOOffs += w * h * 16;
+                 }
+             }
+
+             // Next mip level: halve each dimension, clamped to 1.
+             width = Math.Max(1, width >> 1);
+             height = Math.Max(1, height >> 1);
+             depth = Math.Max(1, depth >> 1);
+         }
+
+         return output;
+     }
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/BlockLinearConstants.cs b/src/Ryujinx.Graphics.Texture/BlockLinearConstants.cs
new file mode 100644
index 00000000..d95691cf
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/BlockLinearConstants.cs
@@ -0,0 +1,10 @@
+namespace Ryujinx.Graphics.Texture
+{
+ static class BlockLinearConstants // Dimensions of a GOB, the tile unit used by the block-linear offset math.
+ {
+ public const int GobStride = 64; // Length of one GOB row; BlockLinearLayout divides byte offsets by this value, so it is in bytes.
+ public const int GobHeight = 8; // Number of rows in one GOB.
+ // Total size of one GOB: 64 * 8 = 512.
+ public const int GobSize = GobStride * GobHeight;
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/BlockLinearLayout.cs b/src/Ryujinx.Graphics.Texture/BlockLinearLayout.cs
new file mode 100644
index 00000000..e098e959
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/BlockLinearLayout.cs
@@ -0,0 +1,195 @@
+using Ryujinx.Common;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+
+using static Ryujinx.Graphics.Texture.BlockLinearConstants;
+
+namespace Ryujinx.Graphics.Texture
+{
+ /// <summary>
+ /// Converts texel coordinates into byte offsets inside a block-linear (GOB tiled) texture.
+ /// </summary>
+ /// <remarks>
+ /// The layout parameters are fixed at construction. The Y and Z parts of the offset can also be
+ /// cached with <see cref="SetY"/> and <see cref="SetZ"/> so that inner loops only recompute the X part.
+ /// </remarks>
+ class BlockLinearLayout
+ {
+     /// <summary>
+     /// Sizes, in bytes, of one row of blocks and of one slice of blocks.
+     /// </summary>
+     private readonly struct RobAndSliceSizes
+     {
+         public readonly int RobSize;
+         public readonly int SliceSize;
+
+         public RobAndSliceSizes(int robSize, int sliceSize)
+         {
+             RobSize = robSize;
+             SliceSize = sliceSize;
+         }
+     }
+
+     // Layout parameters, assigned once in the constructor.
+     private readonly int _texBpp;
+
+     private readonly int _bhMask;
+     private readonly int _bdMask;
+
+     private readonly int _bhShift;
+     private readonly int _bdShift;
+     private readonly int _bppShift;
+
+     private readonly int _xShift;
+
+     private readonly int _robSize;
+     private readonly int _sliceSize;
+
+     // Variables for built in iteration.
+     private int _yPart;
+     private int _yzPart;
+     private int _zPart;
+
+     /// <summary>
+     /// Creates a layout calculator.
+     /// </summary>
+     /// <param name="width">Texture width, in texels</param>
+     /// <param name="height">Texture height, in rows</param>
+     /// <param name="gobBlocksInY">Block height in GOBs; used as a mask and a shift, so it must be a power of two</param>
+     /// <param name="gobBlocksInZ">Block depth in GOBs; used as a mask and a shift, so it must be a power of two</param>
+     /// <param name="bpp">Bytes per texel; <see cref="GetOffset(int, int, int)"/> multiplies by it with a shift, which is exact only for powers of two</param>
+     public BlockLinearLayout(
+         int width,
+         int height,
+         int gobBlocksInY,
+         int gobBlocksInZ,
+         int bpp)
+     {
+         _texBpp = bpp;
+
+         _bppShift = BitOperations.TrailingZeroCount(bpp);
+
+         _bhMask = gobBlocksInY - 1;
+         _bdMask = gobBlocksInZ - 1;
+
+         _bhShift = BitOperations.TrailingZeroCount(gobBlocksInY);
+         _bdShift = BitOperations.TrailingZeroCount(gobBlocksInZ);
+
+         // Stepping one GOB to the right skips a whole block (GobSize * block height * block depth bytes).
+         _xShift = BitOperations.TrailingZeroCount(GobSize * gobBlocksInY * gobBlocksInZ);
+
+         RobAndSliceSizes rsSizes = GetRobAndSliceSizes(width, height, gobBlocksInY, gobBlocksInZ);
+
+         _robSize = rsSizes.RobSize;
+         _sliceSize = rsSizes.SliceSize;
+     }
+
+     /// <summary>
+     /// Computes the byte size of one row of blocks and of one full slice of blocks.
+     /// Reads <c>_texBpp</c>, so it must be called after that field has been assigned.
+     /// </summary>
+     private RobAndSliceSizes GetRobAndSliceSizes(int width, int height, int gobBlocksInY, int gobBlocksInZ)
+     {
+         int widthInGobs = BitUtils.DivRoundUp(width * _texBpp, GobStride);
+
+         int robSize = GobSize * gobBlocksInY * gobBlocksInZ * widthInGobs;
+
+         int sliceSize = BitUtils.DivRoundUp(height, gobBlocksInY * GobHeight) * robSize;
+
+         return new RobAndSliceSizes(robSize, sliceSize);
+     }
+
+     /// <summary>
+     /// Gets the byte offset of the texel at the given coordinates.
+     /// </summary>
+     /// <param name="x">X coordinate, in texels</param>
+     /// <param name="y">Y coordinate, in rows</param>
+     /// <param name="z">Z coordinate, in slices</param>
+     /// <returns>Byte offset from the start of the texture</returns>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public int GetOffset(int x, int y, int z)
+     {
+         return GetOffsetWithLineOffset(x << _bppShift, y, z);
+     }
+
+     /// <summary>
+     /// Gets the byte offset for a position whose X coordinate is already expressed in bytes.
+     /// </summary>
+     /// <param name="x">X position within the line, in bytes</param>
+     /// <param name="y">Y coordinate, in rows</param>
+     /// <param name="z">Z coordinate, in slices</param>
+     /// <returns>Byte offset from the start of the texture</returns>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public int GetOffsetWithLineOffset(int x, int y, int z)
+     {
+         int yh = y / GobHeight;
+
+         // Locate the block: slice of blocks, then row of blocks, then block within the row.
+         int offset = (z >> _bdShift) * _sliceSize + (yh >> _bhShift) * _robSize;
+
+         offset += (x / GobStride) << _xShift;
+
+         // Locate the GOB inside the block.
+         offset += (yh & _bhMask) * GobSize;
+
+         offset += ((z & _bdMask) * GobSize) << _bhShift;
+
+         // Interleave the remaining x (6 bits) and y (3 bits) bits inside the GOB.
+         offset += ((x & 0x3f) >> 5) << 8;
+         offset += ((y & 0x07) >> 1) << 6;
+         offset += ((x & 0x1f) >> 4) << 5;
+         offset += ((y & 0x01) >> 0) << 4;
+         offset += ((x & 0x0f) >> 0) << 0;
+
+         return offset;
+     }
+
+     /// <summary>
+     /// Gets the byte range that covers a rectangle of texels on slice 0.
+     /// </summary>
+     /// <param name="x">Left edge, in texels</param>
+     /// <param name="y">Top edge, in rows</param>
+     /// <param name="width">Rectangle width, in texels</param>
+     /// <param name="height">Rectangle height, in rows</param>
+     /// <returns>Start offset and size of the covering range, in bytes</returns>
+     public (int offset, int size) GetRectangleRange(int x, int y, int width, int height)
+     {
+         // Justification:
+         // The 2D offset is a combination of separate x and y parts.
+         // Both components increase with input and never overlap bits.
+         // Therefore for each component, the minimum input value is the lowest that component can go.
+         // Minimum total value is minimum X component + minimum Y component. Similar goes for maximum.
+
+         int start = GetOffset(x, y, 0);
+         int end = GetOffset(x + width - 1, y + height - 1, 0) + _texBpp; // Cover the last pixel.
+         return (start, end - start);
+     }
+
+     /// <summary>
+     /// Checks whether another layout has the same block row size, slice size, bytes per texel and block dimensions.
+     /// </summary>
+     /// <param name="other">Layout to compare with</param>
+     /// <returns>True if all compared parameters are equal</returns>
+     public bool LayoutMatches(BlockLinearLayout other)
+     {
+         return _robSize == other._robSize &&
+                _sliceSize == other._sliceSize &&
+                _texBpp == other._texBpp &&
+                _bhMask == other._bhMask &&
+                _bdMask == other._bdMask;
+     }
+
+     // Functions for built in iteration.
+     // Components of the offset can be updated separately, and combined to save some time.
+
+     /// <summary>
+     /// Caches the Y part of the offset for the single-argument offset methods.
+     /// </summary>
+     /// <param name="y">Y coordinate, in rows</param>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public void SetY(int y)
+     {
+         int yh = y / GobHeight;
+         int offset = (yh >> _bhShift) * _robSize;
+
+         offset += (yh & _bhMask) * GobSize;
+
+         offset += ((y & 0x07) >> 1) << 6;
+         offset += ((y & 0x01) >> 0) << 4;
+
+         _yPart = offset;
+         _yzPart = offset + _zPart;
+     }
+
+     /// <summary>
+     /// Caches the Z part of the offset for the single-argument offset methods.
+     /// </summary>
+     /// <param name="z">Z coordinate, in slices</param>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public void SetZ(int z)
+     {
+         int offset = (z >> _bdShift) * _sliceSize;
+
+         offset += ((z & _bdMask) * GobSize) << _bhShift;
+
+         _zPart = offset;
+         _yzPart = offset + _yPart;
+     }
+
+     /// <summary>
+     /// Optimized conversion for line offset in bytes to an absolute offset. Input x must be divisible by 16.
+     /// </summary>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public int GetOffsetWithLineOffset16(int x)
+     {
+         int offset = (x / GobStride) << _xShift;
+
+         offset += ((x & 0x3f) >> 5) << 8;
+         offset += ((x & 0x1f) >> 4) << 5;
+
+         return offset + _yzPart;
+     }
+
+     /// <summary>
+     /// Optimized conversion for line offset in bytes to an absolute offset. Input x must be divisible by 64.
+     /// </summary>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public int GetOffsetWithLineOffset64(int x)
+     {
+         int offset = (x / GobStride) << _xShift;
+
+         return offset + _yzPart;
+     }
+
+     /// <summary>
+     /// Gets the byte offset of a texel on the row and slice last selected with <see cref="SetY"/> and <see cref="SetZ"/>.
+     /// </summary>
+     /// <param name="x">X coordinate, in texels</param>
+     /// <returns>Byte offset from the start of the texture</returns>
+     [MethodImpl(MethodImplOptions.AggressiveInlining)]
+     public int GetOffset(int x)
+     {
+         x <<= _bppShift;
+         int offset = (x / GobStride) << _xShift;
+
+         offset += ((x & 0x3f) >> 5) << 8;
+         offset += ((x & 0x1f) >> 4) << 5;
+         offset += (x & 0x0f);
+
+         return offset + _yzPart;
+     }
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/Bpp12Pixel.cs b/src/Ryujinx.Graphics.Texture/Bpp12Pixel.cs
new file mode 100644
index 00000000..5a38259e
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Bpp12Pixel.cs
@@ -0,0 +1,11 @@
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture
+{
+ [StructLayout(LayoutKind.Sequential, Pack = 1, Size = 12)] // Packed so the struct occupies exactly 12 bytes (8 + 4) with no padding.
+ public struct Bpp12Pixel // Opaque 12-byte value. NOTE(review): presumably used to move 12-byte texels as a single unit; confirm against callers.
+ {
+ private ulong _elem1; // Bytes 0..7; never read in this file, present only to give the struct its size.
+ private uint _elem2; // Bytes 8..11.
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/ETC2Decoder.cs b/src/Ryujinx.Graphics.Texture/ETC2Decoder.cs
new file mode 100644
index 00000000..21ff4be4
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/ETC2Decoder.cs
@@ -0,0 +1,682 @@
+using Ryujinx.Common;
+using System;
+using System.Buffers.Binary;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Texture
+{
+ public static class ETC2Decoder
+ {
+ private const uint AlphaMask = 0xff000000u; // Top byte of a decoded 32-bit texel; OR-ed in to make a texel fully opaque.
+ // Dimensions of one compressed block, in texels.
+ private const int BlockWidth = 4;
+ private const int BlockHeight = 4;
+ // Eight rows of four signed values. NOTE(review): presumably the ETC1 intensity modifier tables; the code that indexes them is outside this view.
+ private static readonly int[][] _etc1Lut =
+ {
+ new int[] { 2, 8, -2, -8 },
+ new int[] { 5, 17, -5, -17 },
+ new int[] { 9, 29, -9, -29 },
+ new int[] { 13, 42, -13, -42 },
+ new int[] { 18, 60, -18, -60 },
+ new int[] { 24, 80, -24, -80 },
+ new int[] { 33, 106, -33, -106 },
+ new int[] { 47, 183, -47, -183 }
+ };
+ // Distance values used by the T and H mode decoders, selected by a 3-bit index taken from the block.
+ private static readonly int[] _etc2Lut =
+ {
+ 3, 6, 11, 16, 23, 32, 41, 64
+ };
+ // Sixteen tables of eight alpha modifiers; DecodeRgba picks a table with a 4-bit field and an entry with a 3-bit per-texel index.
+ private static readonly int[][] _etc2AlphaLut =
+ {
+ new int[] { -3, -6, -9, -15, 2, 5, 8, 14 },
+ new int[] { -3, -7, -10, -13, 2, 6, 9, 12 },
+ new int[] { -2, -5, -8, -13, 1, 4, 7, 12 },
+ new int[] { -2, -4, -6, -13, 1, 3, 5, 12 },
+ new int[] { -3, -6, -8, -12, 2, 5, 7, 11 },
+ new int[] { -3, -7, -9, -11, 2, 6, 8, 10 },
+ new int[] { -4, -7, -8, -11, 3, 6, 7, 10 },
+ new int[] { -3, -5, -8, -11, 2, 4, 7, 10 },
+ new int[] { -2, -6, -8, -10, 1, 5, 7, 9 },
+ new int[] { -2, -5, -8, -10, 1, 4, 7, 9 },
+ new int[] { -2, -4, -8, -10, 1, 3, 7, 9 },
+ new int[] { -2, -5, -7, -10, 1, 4, 6, 9 },
+ new int[] { -3, -4, -7, -10, 2, 3, 6, 9 },
+ new int[] { -1, -2, -3, -10, 0, 1, 2, 9 },
+ new int[] { -4, -6, -8, -9, 3, 5, 7, 8 },
+ new int[] { -3, -5, -7, -9, 2, 4, 6, 8 }
+ };
+
+ /// <summary>
+ /// Decodes ETC2 RGB data (one 8-byte block per 4x4 texels) into 32-bit texels whose top byte is forced to 0xff.
+ /// </summary>
+ /// <param name="data">Compressed blocks, ordered by level, then layer, then depth slice</param>
+ /// <param name="width">Width of the base level, in texels</param>
+ /// <param name="height">Height of the base level, in texels</param>
+ /// <param name="depth">Depth of the base level, in slices</param>
+ /// <param name="levels">Number of mip levels to decode</param>
+ /// <param name="layers">Number of array layers per level</param>
+ /// <returns>The decoded texels, tightly packed, in the same order as the input</returns>
+ public static byte[] DecodeRgb(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
+ {
+     ReadOnlySpan<ulong> blocks = MemoryMarshal.Cast<byte, ulong>(data);
+
+     byte[] output = new byte[CalculateOutputSize(width, height, depth, levels, layers)];
+
+     Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output);
+     Span<uint> decoded = stackalloc uint[BlockWidth * BlockHeight];
+
+     int blockIndex = 0;
+     int sliceStart = 0;
+
+     for (int level = 0; level < levels; level++)
+     {
+         int blocksPerRow = BitUtils.DivRoundUp(width, BlockWidth);
+         int blockRows = BitUtils.DivRoundUp(height, BlockHeight);
+
+         for (int layer = 0; layer < layers; layer++)
+         {
+             for (int slice = 0; slice < depth; slice++)
+             {
+                 for (int blockY = 0; blockY < blockRows; blockY++)
+                 {
+                     int top = blockY * BlockHeight;
+                     int rows = Math.Min(BlockHeight, height - top);
+
+                     for (int blockX = 0; blockX < blocksPerRow; blockX++)
+                     {
+                         int left = blockX * BlockWidth;
+                         int columns = Math.Min(BlockWidth, width - left);
+
+                         DecodeBlock(decoded, blocks[blockIndex]);
+                         blockIndex++;
+
+                         // Copy only the part of the block that lies inside the image.
+                         for (int row = 0; row < rows; row++)
+                         {
+                             int source = row * BlockWidth;
+                             int destination = sliceStart + (top + row) * width + left;
+
+                             for (int column = 0; column < columns; column++)
+                             {
+                                 texels[destination + column] = decoded[source + column] | AlphaMask;
+                             }
+                         }
+                     }
+                 }
+
+                 sliceStart += width * height;
+             }
+         }
+
+         // Next mip level: halve each dimension, clamped to 1.
+         width = Math.Max(1, width >> 1);
+         height = Math.Max(1, height >> 1);
+         depth = Math.Max(1, depth >> 1);
+     }
+
+     return output;
+ }
+
+ /// <summary>
+ /// Decodes ETC2 punch-through alpha data (one 8-byte block per 4x4 texels) into 32-bit texels.
+ /// The alpha byte of each texel is produced by the block decoder.
+ /// </summary>
+ /// <param name="data">Compressed blocks, ordered by level, then layer, then depth slice</param>
+ /// <param name="width">Width of the base level, in texels</param>
+ /// <param name="height">Height of the base level, in texels</param>
+ /// <param name="depth">Depth of the base level, in slices</param>
+ /// <param name="levels">Number of mip levels to decode</param>
+ /// <param name="layers">Number of array layers per level</param>
+ /// <returns>The decoded texels, tightly packed, in the same order as the input</returns>
+ public static byte[] DecodePta(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
+ {
+     ReadOnlySpan<ulong> blocks = MemoryMarshal.Cast<byte, ulong>(data);
+
+     byte[] output = new byte[CalculateOutputSize(width, height, depth, levels, layers)];
+
+     Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output);
+     Span<uint> decoded = stackalloc uint[BlockWidth * BlockHeight];
+
+     int blockIndex = 0;
+     int sliceStart = 0;
+
+     for (int level = 0; level < levels; level++)
+     {
+         int blocksPerRow = BitUtils.DivRoundUp(width, BlockWidth);
+         int blockRows = BitUtils.DivRoundUp(height, BlockHeight);
+
+         for (int layer = 0; layer < layers; layer++)
+         {
+             for (int slice = 0; slice < depth; slice++)
+             {
+                 for (int blockY = 0; blockY < blockRows; blockY++)
+                 {
+                     int top = blockY * BlockHeight;
+                     int rows = Math.Min(BlockHeight, height - top);
+
+                     for (int blockX = 0; blockX < blocksPerRow; blockX++)
+                     {
+                         int left = blockX * BlockWidth;
+                         int columns = Math.Min(BlockWidth, width - left);
+
+                         DecodeBlockPta(decoded, blocks[blockIndex]);
+                         blockIndex++;
+
+                         // Copy only the part of each block row that lies inside the image.
+                         for (int row = 0; row < rows; row++)
+                         {
+                             int destination = sliceStart + (top + row) * width + left;
+                             Span<uint> sourceRow = decoded.Slice(row * BlockWidth, columns);
+
+                             sourceRow.CopyTo(texels.Slice(destination, columns));
+                         }
+                     }
+                 }
+
+                 sliceStart += width * height;
+             }
+         }
+
+         // Next mip level: halve each dimension, clamped to 1.
+         width = Math.Max(1, width >> 1);
+         height = Math.Max(1, height >> 1);
+         depth = Math.Max(1, depth >> 1);
+     }
+
+     return output;
+ }
+
+ public static byte[] DecodeRgba(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) // Decodes ETC2 RGBA data (16 bytes per block: an alpha word followed by a color word) into 32-bit texels.
+ {
+ ReadOnlySpan<ulong> dataUlong = MemoryMarshal.Cast<byte, ulong>(data);
+
+ int inputOffset = 0; // Index of the next 64-bit word to read.
+
+ byte[] output = new byte[CalculateOutputSize(width, height, depth, levels, layers)];
+
+ Span<uint> outputUint = MemoryMarshal.Cast<byte, uint>(output);
+ Span<uint> tile = stackalloc uint[BlockWidth * BlockHeight];
+
+ int imageBaseOOffs = 0; // Start of the current 2D slice in the output, in texels.
+
+ for (int l = 0; l < levels; l++)
+ {
+ int wInBlocks = BitUtils.DivRoundUp(width, BlockWidth);
+ int hInBlocks = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < hInBlocks; y++)
+ {
+ int ty = y * BlockHeight;
+ int bh = Math.Min(BlockHeight, height - ty); // Rows of this block that lie inside the image.
+
+ for (int x = 0; x < wInBlocks; x++)
+ {
+ int tx = x * BlockWidth;
+ int bw = Math.Min(BlockWidth, width - tx); // Columns of this block that lie inside the image.
+
+ ulong alphaBlock = dataUlong[inputOffset];
+ ulong colorBlock = dataUlong[inputOffset + 1];
+
+ inputOffset += 2;
+
+ DecodeBlock(tile, colorBlock);
+ // Alpha word layout as read here: bits 0..7 base value, bits 8..11 table index, bits 12..15 multiplier.
+ byte alphaBase = (byte)alphaBlock;
+ int[] alphaTable = _etc2AlphaLut[(alphaBlock >> 8) & 0xf];
+ int alphaMultiplier = (int)(alphaBlock >> 12) & 0xf;
+ ulong alphaIndices = BinaryPrimitives.ReverseEndianness(alphaBlock); // Byte-swapped so the 48 index bits sit in the low bits of the word.
+
+ if (alphaMultiplier != 0)
+ {
+ for (int py = 0; py < bh; py++)
+ {
+ int oOffsBase = imageBaseOOffs + ((ty + py) * width) + tx;
+
+ for (int px = 0; px < bw; px++)
+ {
+ int oOffs = oOffsBase + px;
+ int alphaIndex = (int)((alphaIndices >> (((px * BlockHeight + py) ^ 0xf) * 3)) & 7); // Indices are column-major; `^ 0xf` equals 15 - n, so the first texel uses the highest 3 bits.
+
+ byte a = Saturate(alphaBase + alphaTable[alphaIndex] * alphaMultiplier); // NOTE(review): Saturate is defined outside this view; presumably clamps to 0..255.
+
+ outputUint[oOffs] = tile[py * BlockWidth + px] | ((uint)a << 24);
+ }
+ }
+ }
+ else
+ {
+ uint a = (uint)alphaBase << 24; // A zero multiplier cancels every modifier, so all texels share the base alpha.
+
+ for (int py = 0; py < bh; py++)
+ {
+ int oOffsBase = imageBaseOOffs + ((ty + py) * width) + tx;
+
+ for (int px = 0; px < bw; px++)
+ {
+ int oOffs = oOffsBase + px;
+
+ outputUint[oOffs] = tile[py * BlockWidth + px] | a;
+ }
+ }
+ }
+ }
+ }
+
+ imageBaseOOffs += width * height;
+ }
+ }
+ // Next mip level: halve each dimension, never going below 1.
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+ }
+
+ return output;
+ }
+
+ private static void DecodeBlock(Span<uint> tile, ulong block) // Decodes one 64-bit ETC2 color block into tile, dispatching to the mode-specific decoder.
+ {
+ uint blockLow = (uint)(block >> 0);
+ uint blockHigh = (uint)(block >> 32);
+
+ uint r1, g1, b1;
+ uint r2, g2, b2;
+
+ bool differentialMode = (blockLow & 0x2000000) != 0; // Bit 25 of the low word selects differential encoding.
+
+ if (differentialMode)
+ {
+ (r1, g1, b1, r2, g2, b2) = UnpackRgb555DiffEndPoints(blockLow);
+ // A second endpoint outside 0..31 (negative sums wrap to large unsigned values) selects one of the extra modes, checked in channel order.
+ if (r2 > 31)
+ {
+ DecodeBlock59T(tile, blockLow, blockHigh);
+ }
+ else if (g2 > 31)
+ {
+ DecodeBlock58H(tile, blockLow, blockHigh);
+ }
+ else if (b2 > 31)
+ {
+ DecodeBlock57P(tile, block);
+ }
+ else
+ {
+ r1 |= r1 >> 5; // First endpoint is 5 bits already in the top of the byte; replicate its high bits into the low 3.
+ g1 |= g1 >> 5;
+ b1 |= b1 >> 5;
+
+ r2 = (r2 << 3) | (r2 >> 2); // Second endpoint is a plain 5-bit value; expand it to 8 bits the same way.
+ g2 = (g2 << 3) | (g2 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+
+ DecodeBlockETC1(tile, blockLow, blockHigh, r1, g1, b1, r2, g2, b2);
+ }
+ }
+ else
+ {
+ r1 = (blockLow & 0x0000f0) >> 0; // Individual mode: two 4-bit values per channel; the first is the high nibble of each byte.
+ g1 = (blockLow & 0x00f000) >> 8;
+ b1 = (blockLow & 0xf00000) >> 16;
+
+ r2 = (blockLow & 0x00000f) << 4; // The second is the low nibble, moved into the high nibble position.
+ g2 = (blockLow & 0x000f00) >> 4;
+ b2 = (blockLow & 0x0f0000) >> 12;
+
+ r1 |= r1 >> 4; // Replicate each nibble to fill the whole byte.
+ g1 |= g1 >> 4;
+ b1 |= b1 >> 4;
+
+ r2 |= r2 >> 4;
+ g2 |= g2 >> 4;
+ b2 |= b2 >> 4;
+
+ DecodeBlockETC1(tile, blockLow, blockHigh, r1, g1, b1, r2, g2, b2);
+ }
+ }
+
+ private static void DecodeBlockPta(Span<uint> tile, ulong block) // Decodes one 64-bit punch-through alpha block into tile, including the alpha byte of every texel.
+ {
+ uint blockLow = (uint)(block >> 0);
+ uint blockHigh = (uint)(block >> 32);
+ // Endpoints are always unpacked as differential here; bit 25 is used as the opaque flag instead.
+ (uint r1, uint g1, uint b1, uint r2, uint g2, uint b2) = UnpackRgb555DiffEndPoints(blockLow);
+
+ bool fullyOpaque = (blockLow & 0x2000000) != 0;
+
+ if (fullyOpaque)
+ {
+ if (r2 > 31) // Out-of-range second endpoint selects an extra mode, checked in channel order.
+ {
+ DecodeBlock59T(tile, blockLow, blockHigh);
+ }
+ else if (g2 > 31)
+ {
+ DecodeBlock58H(tile, blockLow, blockHigh);
+ }
+ else if (b2 > 31)
+ {
+ DecodeBlock57P(tile, block);
+ }
+ else
+ {
+ r1 |= r1 >> 5; // Expand both 5-bit endpoints to 8 bits.
+ g1 |= g1 >> 5;
+ b1 |= b1 >> 5;
+
+ r2 = (r2 << 3) | (r2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+
+ DecodeBlockETC1(tile, blockLow, blockHigh, r1, g1, b1, r2, g2, b2);
+ }
+ // The decoders were called without an alpha mask, so set the alpha byte of every texel here.
+ for (int i = 0; i < tile.Length; i++)
+ {
+ tile[i] |= AlphaMask;
+ }
+ }
+ else
+ {
+ if (r2 > 31) // Non-opaque block: the decoders receive AlphaMask and decide per texel which ones stay transparent.
+ {
+ DecodeBlock59T(tile, blockLow, blockHigh, AlphaMask);
+ }
+ else if (g2 > 31)
+ {
+ DecodeBlock58H(tile, blockLow, blockHigh, AlphaMask);
+ }
+ else if (b2 > 31)
+ {
+ DecodeBlock57P(tile, block);
+ // This mode takes no alpha mask; every texel is made opaque even though the opaque flag is clear.
+ for (int i = 0; i < tile.Length; i++)
+ {
+ tile[i] |= AlphaMask;
+ }
+ }
+ else
+ {
+ r1 |= r1 >> 5; // Expand both 5-bit endpoints to 8 bits.
+ g1 |= g1 >> 5;
+ b1 |= b1 >> 5;
+
+ r2 = (r2 << 3) | (r2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+
+ DecodeBlockETC1(tile, blockLow, blockHigh, r1, g1, b1, r2, g2, b2, AlphaMask);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Unpacks the two differential endpoints from the low word of a block.
+ /// </summary>
+ /// <param name="blockLow">Low 32 bits of the block</param>
+ /// <returns>
+ /// The first endpoint as three values with the 5-bit color in the top of a byte (masked with 0xf8),
+ /// followed by the second endpoint as three 5-bit values (first endpoint plus a signed 3-bit delta).
+ /// A second endpoint value outside 0..31 means the sum overflowed; negative sums wrap to large unsigned values.
+ /// </returns>
+ private static (uint, uint, uint, uint, uint, uint) UnpackRgb555DiffEndPoints(uint blockLow)
+ {
+     // Reads the 3-bit two's complement delta stored in the low bits of the given byte.
+     int ReadDelta(int byteShift)
+     {
+         int delta = (int)((blockLow >> byteShift) & 7);
+
+         return delta >= 4 ? delta - 8 : delta;
+     }
+
+     uint r1 = blockLow & 0xf8;
+     uint g1 = (blockLow >> 8) & 0xf8;
+     uint b1 = (blockLow >> 16) & 0xf8;
+
+     uint r2 = (uint)((int)(r1 >> 3) + ReadDelta(0));
+     uint g2 = (uint)((int)(g1 >> 3) + ReadDelta(8));
+     uint b2 = (uint)((int)(b1 >> 3) + ReadDelta(16));
+
+     return (r1, g1, b1, r2, g2, b2);
+ }
+
+ private static void DecodeBlock59T(Span<uint> tile, uint blockLow, uint blockHigh, uint alphaMask = 0) // Decodes a T-mode block: two 4-bit-per-channel colors plus a distance give a 4-entry palette; a non-zero alphaMask enables punch-through handling.
+ {
+ uint r1 = (blockLow & 3) | ((blockLow >> 1) & 0xc); // The first red value is split across two bit ranges of the low byte.
+ uint g1 = (blockLow >> 12) & 0xf;
+ uint b1 = (blockLow >> 8) & 0xf;
+
+ uint r2 = (blockLow >> 20) & 0xf;
+ uint g2 = (blockLow >> 16) & 0xf;
+ uint b2 = (blockLow >> 28) & 0xf;
+ // Expand each 4-bit channel to 8 bits by duplicating the nibble.
+ r1 |= r1 << 4;
+ g1 |= g1 << 4;
+ b1 |= b1 << 4;
+
+ r2 |= r2 << 4;
+ g2 |= g2 << 4;
+ b2 |= b2 << 4;
+
+ int dist = _etc2Lut[((blockLow >> 24) & 1) | ((blockLow >> 25) & 6)]; // 3-bit distance index assembled from bit 24 and bits 26..27.
+
+ Span<uint> palette = stackalloc uint[4];
+ // Palette: color 1 as is, then color 2 brightened, unchanged and darkened by dist.
+ palette[0] = Pack(r1, g1, b1);
+ palette[1] = Pack(r2, g2, b2, dist);
+ palette[2] = Pack(r2, g2, b2);
+ palette[3] = Pack(r2, g2, b2, -dist);
+
+ blockHigh = BinaryPrimitives.ReverseEndianness(blockHigh); // Byte-swap so the index bits can be addressed by texel number.
+
+ for (int y = 0; y < BlockHeight; y++)
+ {
+ for (int x = 0; x < BlockWidth; x++)
+ {
+ int offset = (y * 4) + x; // Row-major position in the output tile.
+ int index = (x * 4) + y; // Column-major texel number used by the index bits.
+
+ int paletteIndex = (int)((blockHigh >> index) & 1) | (int)((blockHigh >> (index + 15)) & 2); // Low bit from bit `index`, high bit from bit `index + 16`.
+
+ tile[offset] = palette[paletteIndex];
+
+ if (alphaMask != 0)
+ {
+ if (paletteIndex == 2) // In punch-through mode palette entry 2 means a fully transparent (all zero) texel.
+ {
+ tile[offset] = 0;
+ }
+ else
+ {
+ tile[offset] |= alphaMask;
+ }
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Decodes a block using a 4 entries palette made of both base colors, each with a distance
+ /// added and subtracted.
+ /// </summary>
+ /// <param name="tile">Output tile, written as rows of 4 pixels</param>
+ /// <param name="blockLow">Low word of the block, holding both base colors and part of the distance table index</param>
+ /// <param name="blockHigh">High word of the block, holding the per-pixel palette indices</param>
+ /// <param name="alphaMask">When non-zero, pixels using palette index 2 become 0 and all others are ORed with this mask</param>
+ private static void DecodeBlock58H(Span<uint> tile, uint blockLow, uint blockHigh, uint alphaMask = 0)
+ {
+     // Expands a 4-bit channel to 8 bits by replicating the nibble.
+     static uint Expand4(uint value) => value | (value << 4);
+
+     uint red1 = (blockLow >> 3) & 0xf;
+     uint green1 = ((blockLow << 1) & 0xe) | ((blockLow >> 12) & 1);
+     uint blue1 = ((blockLow >> 23) & 1) | ((blockLow >> 7) & 6) | ((blockLow >> 8) & 8);
+
+     uint red2 = (blockLow >> 19) & 0xf;
+     uint green2 = ((blockLow >> 31) & 1) | ((blockLow >> 15) & 0xe);
+     uint blue2 = (blockLow >> 27) & 0xf;
+
+     // The lowest bit of the distance index is not stored, it comes from the ordering
+     // of the two base colors (compared before they are expanded to 8 bits).
+     uint orderBit = Pack4Be(red1, green1, blue1) >= Pack4Be(red2, green2, blue2) ? 1u : 0u;
+
+     int distance = _etc2Lut[orderBit | ((blockLow >> 23) & 2) | ((blockLow >> 24) & 4)];
+
+     red1 = Expand4(red1);
+     green1 = Expand4(green1);
+     blue1 = Expand4(blue1);
+
+     red2 = Expand4(red2);
+     green2 = Expand4(green2);
+     blue2 = Expand4(blue2);
+
+     Span<uint> paints = stackalloc uint[4];
+
+     paints[0] = Pack(red1, green1, blue1, distance);
+     paints[1] = Pack(red1, green1, blue1, -distance);
+     paints[2] = Pack(red2, green2, blue2, distance);
+     paints[3] = Pack(red2, green2, blue2, -distance);
+
+     // After the byte swap, the low index bits are on bits 0-15 and the high index bits on bits 16-31.
+     uint indexBits = BinaryPrimitives.ReverseEndianness(blockHigh);
+     bool hasAlphaMask = alphaMask != 0;
+
+     for (int y = 0; y < BlockHeight; y++)
+     {
+         for (int x = 0; x < BlockWidth; x++)
+         {
+             // Index bits are stored column by column.
+             int bit = (x * 4) + y;
+             int paint = (int)((indexBits >> bit) & 1) | (int)((indexBits >> (bit + 15)) & 2);
+
+             uint color = paints[paint];
+
+             if (hasAlphaMask)
+             {
+                 color = paint == 2 ? 0 : color | alphaMask;
+             }
+
+             tile[(y * 4) + x] = color;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Decodes a block where each pixel is interpolated from an origin color, a horizontal color
+ /// and a vertical color, depending on the pixel position inside the block.
+ /// </summary>
+ /// <param name="tile">Output tile, written as rows of <c>BlockWidth</c> pixels (no alpha bits are set)</param>
+ /// <param name="block">The full 64-bit block</param>
+ private static void DecodeBlock57P(Span<uint> tile, ulong block)
+ {
+     // Expands a 6-bit channel to 8 bits by replicating its top bits.
+     static int Expand6(int value) => (value << 2) | (value >> 4);
+
+     // Expands a 7-bit channel to 8 bits by replicating its top bit.
+     static int Expand7(int value) => (value << 1) | (value >> 6);
+
+     // Evaluates one channel at a given position, with rounding, clamped to 8 bits.
+     static byte Interpolate(int origin, int horizontal, int vertical, int x, int y)
+     {
+         return Saturate(((x * (horizontal - origin)) + (y * (vertical - origin)) + (origin * 4) + 2) >> 2);
+     }
+
+     // Red and blue use 6 bits, green uses 7 bits.
+     int originR = Expand6((int)((block >> 1) & 0x3f));
+     int originG = Expand7((int)(((block >> 9) & 0x3f) | ((block & 1) << 6)));
+     int originB = Expand6((int)(((block >> 31) & 1) | ((block >> 15) & 6) | ((block >> 16) & 0x18) | ((block >> 3) & 0x20)));
+
+     int horizontalR = Expand6((int)(((block >> 24) & 1) | ((block >> 25) & 0x3e)));
+     int horizontalG = Expand7((int)((block >> 33) & 0x7f));
+     int horizontalB = Expand6((int)(((block >> 43) & 0x1f) | ((block >> 27) & 0x20)));
+
+     int verticalR = Expand6((int)(((block >> 53) & 7) | ((block >> 37) & 0x38)));
+     int verticalG = Expand7((int)(((block >> 62) & 3) | ((block >> 46) & 0x7c)));
+     int verticalB = Expand6((int)((block >> 56) & 0x3f));
+
+     for (int y = 0; y < BlockHeight; y++)
+     {
+         int rowStart = y * BlockWidth;
+
+         for (int x = 0; x < BlockWidth; x++)
+         {
+             byte red = Interpolate(originR, horizontalR, verticalR, x, y);
+             byte green = Interpolate(originG, horizontalG, verticalG, x, y);
+             byte blue = Interpolate(originB, horizontalB, verticalB, x, y);
+
+             tile[rowStart + x] = Pack(red, green, blue);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Decodes a block made of two half-blocks, each with its own base color and modifier table.
+ /// </summary>
+ /// <param name="tile">Output tile, written as rows of <c>BlockWidth</c> pixels</param>
+ /// <param name="blockLow">Low word of the block, holding the two table selectors and the flip bit</param>
+ /// <param name="blockHigh">High word of the block, holding the per-pixel modifier indices</param>
+ /// <param name="r1">Red component of the first half-block base color</param>
+ /// <param name="g1">Green component of the first half-block base color</param>
+ /// <param name="b1">Blue component of the first half-block base color</param>
+ /// <param name="r2">Red component of the second half-block base color</param>
+ /// <param name="g2">Green component of the second half-block base color</param>
+ /// <param name="b2">Blue component of the second half-block base color</param>
+ /// <param name="alphaMask">Alpha mask forwarded to the per-pixel calculation</param>
+ private static void DecodeBlockETC1(
+     Span<uint> tile,
+     uint blockLow,
+     uint blockHigh,
+     uint r1,
+     uint g1,
+     uint b1,
+     uint r2,
+     uint g2,
+     uint b2,
+     uint alphaMask = 0)
+ {
+     int[] firstTable = _etc1Lut[(blockLow >> 29) & 7];
+     int[] secondTable = _etc1Lut[(blockLow >> 26) & 7];
+
+     bool splitVertically = (blockLow & 0x1000000) != 0;
+
+     if (splitVertically)
+     {
+         // Flip bit set: the first half is the top 2 rows, the second half the bottom 2 rows.
+         for (int y = 0; y < BlockHeight / 2; y++)
+         {
+             for (int x = 0; x < BlockWidth; x++)
+             {
+                 uint top = CalculatePixel(r1, g1, b1, x, y + 0, blockHigh, firstTable, alphaMask);
+                 uint bottom = CalculatePixel(r2, g2, b2, x, y + 2, blockHigh, secondTable, alphaMask);
+
+                 tile[(y * BlockWidth) + x] = top;
+                 tile[((y + 2) * BlockWidth) + x] = bottom;
+             }
+         }
+
+         return;
+     }
+
+     // Flip bit clear: the first half is the left 2 columns, the second half the right 2 columns.
+     for (int y = 0; y < BlockHeight; y++)
+     {
+         int rowStart = y * BlockWidth;
+
+         for (int x = 0; x < BlockWidth / 2; x++)
+         {
+             uint left = CalculatePixel(r1, g1, b1, x + 0, y, blockHigh, firstTable, alphaMask);
+             uint right = CalculatePixel(r2, g2, b2, x + 2, y, blockHigh, secondTable, alphaMask);
+
+             tile[rowStart + x] = left;
+             tile[rowStart + x + 2] = right;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Calculates the color of a single pixel, by applying the modifier selected by the pixel index bits to a base color.
+ /// </summary>
+ /// <param name="r">Red component of the base color</param>
+ /// <param name="g">Green component of the base color</param>
+ /// <param name="b">Blue component of the base color</param>
+ /// <param name="x">X position of the pixel inside the block</param>
+ /// <param name="y">Y position of the pixel inside the block</param>
+ /// <param name="block">Word holding the 2-bit modifier index of every pixel</param>
+ /// <param name="table">Modifier table of the half-block the pixel belongs to</param>
+ /// <param name="alphaMask">
+ /// When non-zero, index 0 uses the unmodified base color, index 2 produces 0,
+ /// and the result of every other index is ORed with this mask
+ /// </param>
+ /// <returns>The packed pixel color</returns>
+ private static uint CalculatePixel(uint r, uint g, uint b, int x, int y, uint block, int[] table, uint alphaMask)
+ {
+     // Pixels are numbered column by column.
+     int pixel = x * BlockHeight + y;
+
+     uint lowBit;
+     uint highBit;
+
+     if (pixel < 8)
+     {
+         lowBit = (block >> (pixel + 24)) & 1;
+         highBit = ((block << 1) >> (pixel + 8)) & 2;
+     }
+     else
+     {
+         lowBit = (block >> (pixel + 8)) & 1;
+         highBit = ((block << 1) >> (pixel - 8)) & 2;
+     }
+
+     uint tableIndex = lowBit + highBit;
+
+     if (alphaMask == 0)
+     {
+         return Pack(r, g, b, table[tableIndex]);
+     }
+
+     switch (tableIndex)
+     {
+         case 0:
+             return Pack(r, g, b) | alphaMask;
+         case 2:
+             return 0;
+         default:
+             return Pack(r, g, b, table[tableIndex]) | alphaMask;
+     }
+ }
+
+ /// <summary>
+ /// Adds an offset to each component, clamps the results to the 0-255 range and packs them.
+ /// </summary>
+ /// <param name="r">Red component</param>
+ /// <param name="g">Green component</param>
+ /// <param name="b">Blue component</param>
+ /// <param name="offset">Signed value added to every component</param>
+ /// <returns>The packed color, with red on the lowest byte</returns>
+ private static uint Pack(uint r, uint g, uint b, int offset)
+ {
+     byte red = Saturate((int)(r + offset));
+     byte green = Saturate((int)(g + offset));
+     byte blue = Saturate((int)(b + offset));
+
+     return Pack(red, green, blue);
+ }
+
+ /// <summary>
+ /// Packs 3 components into a single value, with red on bits 0-7, green on bits 8-15 and blue on bits 16-23.
+ /// </summary>
+ /// <param name="r">Red component</param>
+ /// <param name="g">Green component</param>
+ /// <param name="b">Blue component</param>
+ /// <returns>The packed color</returns>
+ private static uint Pack(uint r, uint g, uint b)
+ {
+     uint packed = b << 16;
+     packed |= g << 8;
+     packed |= r;
+
+     return packed;
+ }
+
+ /// <summary>
+ /// Packs 3 components of 4 bits each into a single value, with red on the most significant nibble.
+ /// </summary>
+ /// <param name="r">Red component</param>
+ /// <param name="g">Green component</param>
+ /// <param name="b">Blue component</param>
+ /// <returns>The packed value</returns>
+ private static uint Pack4Be(uint r, uint g, uint b)
+ {
+     uint packed = b;
+     packed |= g << 4;
+     packed |= r << 8;
+
+     return packed;
+ }
+
+ /// <summary>
+ /// Clamps a value to the range that can be represented by a byte.
+ /// </summary>
+ /// <param name="value">Value to clamp</param>
+ /// <returns>The value limited to the 0-255 range</returns>
+ private static byte Saturate(int value)
+ {
+     return (byte)Math.Clamp(value, byte.MinValue, byte.MaxValue);
+ }
+
+ /// <summary>
+ /// Calculates the total size in bytes of all the mipmap levels of an image with 4 bytes per pixel.
+ /// </summary>
+ /// <param name="width">Width of the first level</param>
+ /// <param name="height">Height of the first level</param>
+ /// <param name="depth">Depth of the first level</param>
+ /// <param name="levels">Number of mipmap levels</param>
+ /// <param name="layers">Number of layers</param>
+ /// <returns>The total size in bytes</returns>
+ private static int CalculateOutputSize(int width, int height, int depth, int levels, int layers)
+ {
+     int totalSize = 0;
+
+     for (int level = 0; level < levels; level++)
+     {
+         // Every dimension is halved on each level, but never goes below 1.
+         int levelWidth = Math.Max(1, width >> level);
+         int levelHeight = Math.Max(1, height >> level);
+         int levelDepth = Math.Max(1, depth >> level);
+
+         totalSize += levelWidth * levelHeight * levelDepth * layers * 4;
+     }
+
+     return totalSize;
+ }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs b/src/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs
new file mode 100644
index 00000000..35d36bce
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs
@@ -0,0 +1,1005 @@
+using Ryujinx.Graphics.Texture.Utils;
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Threading.Tasks;
+
+namespace Ryujinx.Graphics.Texture.Encoders
+{
+ static class BC7Encoder
+ {
+ // Squared difference between the minimum and maximum colors of a tile above which
+ // the fast encoder picks a different mode for the block.
+ private const int MinColorVarianceForModeChange = 160;
+
+ /// <summary>
+ /// Encodes an image with 4 bytes per pixel into BC7 blocks of 4x4 pixels.
+ /// </summary>
+ /// <param name="outputStorage">Memory where the blocks are written, 16 bytes per block, one row of blocks after the other</param>
+ /// <param name="data">Input image data</param>
+ /// <param name="width">Width of the image in pixels</param>
+ /// <param name="height">Height of the image in pixels</param>
+ /// <param name="mode">Encoding mode and flags</param>
+ public static void Encode(Memory<byte> outputStorage, ReadOnlyMemory<byte> data, int width, int height, EncodeMode mode)
+ {
+     int widthInBlocks = (width + 3) / 4;
+     int heightInBlocks = (height + 3) / 4;
+
+     bool fastMode = (mode & EncodeMode.ModeMask) == EncodeMode.Fast;
+
+     // Encodes all the blocks of a single row of blocks.
+     // The spans are obtained here as they can't be captured by the parallel loop body.
+     void EncodeBlockRow(int blockY)
+     {
+         Span<ulong> output = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span);
+         ReadOnlySpan<byte> input = data.Span;
+
+         // Each block takes 2 64-bit words.
+         int offset = blockY * widthInBlocks * 2;
+
+         for (int blockX = 0; blockX < widthInBlocks; blockX++)
+         {
+             Block block = CompressBlock(input, blockX * 4, blockY * 4, width, height, fastMode);
+
+             output[offset++] = block.Low;
+             output[offset++] = block.High;
+         }
+     }
+
+     if (mode.HasFlag(EncodeMode.Multithreaded))
+     {
+         Parallel.For(0, heightInBlocks, (blockY) => EncodeBlockRow(blockY));
+     }
+     else
+     {
+         for (int blockY = 0; blockY < heightInBlocks; blockY++)
+         {
+             EncodeBlockRow(blockY);
+         }
+     }
+ }
+
+ // Partitions tried by the fast encoder for the modes with 2 subsets,
+ // rather than trying every possible partition.
+ private static readonly int[] _mostFrequentPartitions = new int[]
+ {
+ 0, 13, 2, 1, 15, 14, 10, 23
+ };
+
+ /// <summary>
+ /// Encodes the block located at a given position of the image.
+ /// </summary>
+ /// <param name="data">Input image data, with 4 bytes per pixel</param>
+ /// <param name="x">X position of the block, in pixels</param>
+ /// <param name="y">Y position of the block, in pixels</param>
+ /// <param name="width">Width of the image in pixels</param>
+ /// <param name="height">Height of the image in pixels</param>
+ /// <param name="fastMode">True to use the fast encoder, false to use the exhaustive encoder</param>
+ /// <returns>The encoded block</returns>
+ private static Block CompressBlock(ReadOnlySpan<byte> data, int x, int y, int width, int height, bool fastMode)
+ {
+     // Blocks on the right and bottom edges of the image might be smaller than 4x4.
+     int tileWidth = Math.Min(4, width - x);
+     int tileHeight = Math.Min(4, height - y);
+
+     ReadOnlySpan<uint> pixels = MemoryMarshal.Cast<byte, uint>(data);
+
+     // The tile is tightly packed, its stride is the tile width.
+     Span<uint> tile = stackalloc uint[tileWidth * tileHeight];
+
+     int tileOffset = 0;
+
+     for (int row = 0; row < tileHeight; row++)
+     {
+         int pixelOffset = (y + row) * width + x;
+
+         for (int column = 0; column < tileWidth; column++)
+         {
+             tile[tileOffset++] = pixels[pixelOffset++];
+         }
+     }
+
+     if (fastMode)
+     {
+         return EncodeFast(tile, tileWidth, tileHeight);
+     }
+
+     return EncodeExhaustive(tile, tileWidth, tileHeight);
+ }
+
+ /// <summary>
+ /// Encodes a tile using a single mode, picked from the alpha values and color variance of the tile.
+ /// </summary>
+ /// <param name="tile">Tile pixels</param>
+ /// <param name="w">Width of the tile</param>
+ /// <param name="h">Height of the tile</param>
+ /// <returns>The encoded block</returns>
+ private static Block EncodeFast(ReadOnlySpan<uint> tile, int w, int h)
+ {
+     (RgbaColor8 minColor, RgbaColor8 maxColor) = BC67Utils.GetMinMaxColors(tile, w, h);
+
+     bool fullyOpaque = minColor.A == 255 && maxColor.A == 255;
+     bool highVariance = BC67Utils.SquaredDifference(minColor.GetColor32(), maxColor.GetColor32()) > MinColorVarianceForModeChange;
+
+     int selectedMode;
+     int indexMode = 0;
+
+     if (fullyOpaque)
+     {
+         selectedMode = highVariance ? 1 : 6;
+     }
+     else if (minColor.A == maxColor.A)
+     {
+         // Alpha is not 255, but is the same for the whole tile.
+         selectedMode = highVariance ? 7 : 6;
+     }
+     else if (!highVariance)
+     {
+         selectedMode = 5;
+     }
+     else
+     {
+         selectedMode = 4;
+
+         // Mode 4 has an index mode selection, pick it from the amount of
+         // unique color and unique alpha values on the tile.
+         Span<uint> seenRgb = stackalloc uint[16];
+         Span<uint> seenAlpha = stackalloc uint[16];
+
+         int seenRgbCount = 0;
+         int seenAlphaCount = 0;
+
+         uint rgbMask = new RgbaColor8(255, 255, 255, 0).ToUInt32();
+         uint alphaMask = new RgbaColor8(0, 0, 0, 255).ToUInt32();
+
+         foreach (uint pixel in tile)
+         {
+             uint rgb = pixel & rgbMask;
+             uint alpha = pixel & alphaMask;
+
+             if (!seenRgb.Slice(0, seenRgbCount).Contains(rgb))
+             {
+                 seenRgb[seenRgbCount++] = rgb;
+             }
+
+             if (!seenAlpha.Slice(0, seenAlphaCount).Contains(alpha))
+             {
+                 seenAlpha[seenAlphaCount++] = alpha;
+             }
+         }
+
+         indexMode = seenRgbCount > seenAlphaCount ? 1 : 0;
+     }
+
+     int selectedPartition = 0;
+
+     if (selectedMode == 1 || selectedMode == 7)
+     {
+         // Those modes are partitioned, pick the candidate partition with the lowest estimated error.
+         int lowestError = int.MaxValue;
+
+         foreach (int candidate in _mostFrequentPartitions)
+         {
+             int error = GetEndPointSelectionErrorFast(tile, 2, candidate, w, h, lowestError);
+
+             if (error < lowestError)
+             {
+                 lowestError = error;
+                 selectedPartition = candidate;
+             }
+         }
+     }
+
+     return Encode(selectedMode, selectedPartition, 0, indexMode, fastMode: true, tile, w, h, out _);
+ }
+
+ /// <summary>
+ /// Encodes a tile by trying every mode, rotation, index mode and partition,
+ /// and keeping the block with the lowest error.
+ /// </summary>
+ /// <param name="tile">Tile pixels</param>
+ /// <param name="w">Width of the tile</param>
+ /// <param name="h">Height of the tile</param>
+ /// <returns>The encoded block with the lowest error</returns>
+ private static Block EncodeExhaustive(ReadOnlySpan<uint> tile, int w, int h)
+ {
+     Block bestBlock = default;
+     int bestError = int.MaxValue;
+     int bestSubsetCount = int.MaxValue;
+
+     for (int mode = 0; mode < 8; mode++)
+     {
+         // Only modes 4 and 5 have rotation, and only mode 4 has index mode selection.
+         int rotationCount = (mode == 4 || mode == 5) ? 4 : 1;
+         int indexModeCount = mode == 4 ? 2 : 1;
+         int partitionCount = 1 << BC67Tables.BC7ModeInfos[mode].PartitionBitCount;
+         int subsetCount = BC67Tables.BC7ModeInfos[mode].SubsetCount;
+
+         for (int rotation = 0; rotation < rotationCount; rotation++)
+         {
+             for (int indexMode = 0; indexMode < indexModeCount; indexMode++)
+             {
+                 for (int partition = 0; partition < partitionCount; partition++)
+                 {
+                     Block candidate = Encode(mode, partition, rotation, indexMode, fastMode: false, tile, w, h, out int error);
+
+                     // When the error is the same, prefer the mode with less subsets.
+                     bool better = error < bestError || (error == bestError && subsetCount < bestSubsetCount);
+
+                     if (!better)
+                     {
+                         continue;
+                     }
+
+                     bestError = error;
+                     bestSubsetCount = subsetCount;
+                     bestBlock = candidate;
+                 }
+             }
+         }
+     }
+
+     return bestBlock;
+ }
+
+ /// <summary>
+ /// Encodes a tile into a block, using the specified mode, partition, rotation and index mode.
+ /// </summary>
+ /// <param name="mode">Index of the mode on the mode information table</param>
+ /// <param name="partition">Partition to use, for modes with more than one subset</param>
+ /// <param name="rotation">Selects the channel that gets separate indices, on modes that have them (1, 2 or 3 selects one of the first 3 channels, any other value the last one)</param>
+ /// <param name="indexMode">When non-zero, swaps which channels use each of the two index sets</param>
+ /// <param name="fastMode">True to use the fast end point selection, false to use the slower one</param>
+ /// <param name="tile">Tile pixels</param>
+ /// <param name="w">Width of the tile</param>
+ /// <param name="h">Height of the tile</param>
+ /// <param name="errorSum">Sum of the errors returned by the index selection</param>
+ /// <returns>The encoded block</returns>
+ private static Block Encode(
+ int mode,
+ int partition,
+ int rotation,
+ int indexMode,
+ bool fastMode,
+ ReadOnlySpan<uint> tile,
+ int w,
+ int h,
+ out int errorSum)
+ {
+ BC7ModeInfo modeInfo = BC67Tables.BC7ModeInfos[mode];
+ int subsetCount = modeInfo.SubsetCount;
+ int partitionBitCount = modeInfo.PartitionBitCount;
+ int rotationBitCount = modeInfo.RotationBitCount;
+ int indexModeBitCount = modeInfo.IndexModeBitCount;
+ int colorDepth = modeInfo.ColorDepth;
+ int alphaDepth = modeInfo.AlphaDepth;
+ int pBits = modeInfo.PBits;
+ int colorIndexBitCount = modeInfo.ColorIndexBitCount;
+ int alphaIndexBitCount = modeInfo.AlphaIndexBitCount;
+ bool separateAlphaIndices = alphaIndexBitCount != 0;
+
+ // Mask of the channel that is handled by the second ("alpha") index set.
+ // It is zero when the mode has a single index set for all channels.
+ uint alphaMask;
+
+ if (separateAlphaIndices)
+ {
+ // NOTE(review): this assumes the RgbaColor8 constructor takes (R, G, B, A), confirm against its definition.
+ alphaMask = rotation switch
+ {
+ 1 => new RgbaColor8(255, 0, 0, 0).ToUInt32(),
+ 2 => new RgbaColor8(0, 255, 0, 0).ToUInt32(),
+ 3 => new RgbaColor8(0, 0, 255, 0).ToUInt32(),
+ _ => new RgbaColor8(0, 0, 0, 255).ToUInt32()
+ };
+ }
+ else
+ {
+ alphaMask = new RgbaColor8(0, 0, 0, 0).ToUInt32();
+ }
+
+ // With a non-zero index mode, the channels handled by each index set are exchanged.
+ if (indexMode != 0)
+ {
+ alphaMask = ~alphaMask;
+ }
+
+ //
+ // Select color palette.
+ //
+
+ Span<uint> endPoints0 = stackalloc uint[subsetCount];
+ Span<uint> endPoints1 = stackalloc uint[subsetCount];
+
+ // First pass writes the channels outside of the alpha mask...
+ SelectEndPoints(
+ tile,
+ w,
+ h,
+ endPoints0,
+ endPoints1,
+ subsetCount,
+ partition,
+ colorIndexBitCount,
+ colorDepth,
+ alphaDepth,
+ ~alphaMask,
+ fastMode);
+
+ // ...and the second pass, if needed, writes the channels inside of the alpha mask.
+ if (separateAlphaIndices)
+ {
+ SelectEndPoints(
+ tile,
+ w,
+ h,
+ endPoints0,
+ endPoints1,
+ subsetCount,
+ partition,
+ alphaIndexBitCount,
+ colorDepth,
+ alphaDepth,
+ alphaMask,
+ fastMode);
+ }
+
+ Span<int> pBitValues = stackalloc int[pBits];
+
+ for (int i = 0; i < pBits; i++)
+ {
+ int pBit;
+
+ if (pBits == subsetCount)
+ {
+ // One p-bit per subset, shared by both end points.
+ pBit = GetPBit(endPoints0[i], endPoints1[i], colorDepth, alphaDepth);
+ }
+ else
+ {
+ // One p-bit per end point, even entries are for the first end point of the subset.
+ int subset = i >> 1;
+ uint color = (i & 1) == 0 ? endPoints0[subset] : endPoints1[subset];
+ pBit = GetPBit(color, colorDepth, alphaDepth);
+ }
+
+ pBitValues[i] = pBit;
+ }
+
+ int colorIndexCount = 1 << colorIndexBitCount;
+ int alphaIndexCount = 1 << alphaIndexBitCount;
+
+ Span<byte> colorIndices = stackalloc byte[16];
+ Span<byte> alphaIndices = stackalloc byte[16];
+
+ errorSum = BC67Utils.SelectIndices(
+ tile,
+ w,
+ h,
+ endPoints0,
+ endPoints1,
+ pBitValues,
+ colorIndices,
+ subsetCount,
+ partition,
+ colorIndexBitCount,
+ colorIndexCount,
+ colorDepth,
+ alphaDepth,
+ pBits,
+ alphaMask);
+
+ if (separateAlphaIndices)
+ {
+ errorSum += BC67Utils.SelectIndices(
+ tile,
+ w,
+ h,
+ endPoints0,
+ endPoints1,
+ pBitValues,
+ alphaIndices,
+ subsetCount,
+ partition,
+ alphaIndexBitCount,
+ alphaIndexCount,
+ colorDepth,
+ alphaDepth,
+ pBits,
+ ~alphaMask);
+ }
+
+ // The index at the fix-up position of each subset is stored with one less bit (see below),
+ // so if its most significant bit is set, the end points of the subset are swapped and
+ // the indices of the subset are inverted.
+ Span<bool> colorSwapSubset = stackalloc bool[3];
+
+ for (int i = 0; i < 3; i++)
+ {
+ colorSwapSubset[i] = colorIndices[BC67Tables.FixUpIndices[subsetCount - 1][partition][i]] >= (colorIndexCount >> 1);
+ }
+
+ // Same as above for the second index set, where the shortened index is always the first one.
+ bool alphaSwapSubset = alphaIndices[0] >= (alphaIndexCount >> 1);
+
+ Block block = new Block();
+
+ int offset = 0;
+
+ // The mode is stored as a single set bit at position "mode", using "mode + 1" bits in total.
+ block.Encode(1UL << mode, ref offset, mode + 1);
+ block.Encode((ulong)partition, ref offset, partitionBitCount);
+ block.Encode((ulong)rotation, ref offset, rotationBitCount);
+ block.Encode((ulong)indexMode, ref offset, indexModeBitCount);
+
+ // End points of the first 3 channels, one channel at a time.
+ for (int comp = 0; comp < 3; comp++)
+ {
+ int rotatedComp = comp;
+
+ // The channel selected by the rotation is exchanged with the last channel.
+ if (((comp + 1) & 3) == rotation)
+ {
+ rotatedComp = 3;
+ }
+
+ for (int subset = 0; subset < subsetCount; subset++)
+ {
+ RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
+ RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);
+
+ // -1 is passed to the quantization when the mode has no p-bits.
+ int pBit0 = -1, pBit1 = -1;
+
+ if (pBits == subsetCount)
+ {
+ pBit0 = pBit1 = pBitValues[subset];
+ }
+ else if (pBits != 0)
+ {
+ pBit0 = pBitValues[subset * 2];
+ pBit1 = pBitValues[subset * 2 + 1];
+ }
+
+ // With a non-zero index mode, those channels use the second index set, so the swap follows it.
+ if (indexMode == 0 ? colorSwapSubset[subset] : alphaSwapSubset)
+ {
+ block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth);
+ block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth);
+ }
+ else
+ {
+ block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth);
+ block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth);
+ }
+ }
+ }
+
+ // End points of the last channel, only present on modes where it has a non-zero depth.
+ if (alphaDepth != 0)
+ {
+ // Component 3 when there is no rotation, otherwise the component selected by the rotation.
+ int rotatedComp = (rotation - 1) & 3;
+
+ for (int subset = 0; subset < subsetCount; subset++)
+ {
+ RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
+ RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);
+
+ int pBit0 = -1, pBit1 = -1;
+
+ if (pBits == subsetCount)
+ {
+ pBit0 = pBit1 = pBitValues[subset];
+ }
+ else if (pBits != 0)
+ {
+ pBit0 = pBitValues[subset * 2];
+ pBit1 = pBitValues[subset * 2 + 1];
+ }
+
+ // Note that the condition is "(separateAlphaIndices && indexMode == 0)",
+ // the color swap is used if there are no separate indices or if the index sets are exchanged.
+ if (separateAlphaIndices && indexMode == 0 ? alphaSwapSubset : colorSwapSubset[subset])
+ {
+ block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth);
+ block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth);
+ }
+ else
+ {
+ block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth);
+ block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth);
+ }
+ }
+ }
+
+ for (int i = 0; i < pBits; i++)
+ {
+ block.Encode((ulong)pBitValues[i], ref offset, 1);
+ }
+
+ byte[] fixUpTable = BC67Tables.FixUpIndices[subsetCount - 1][partition];
+
+ // First index set. Indices are always written for all the 16 pixels of the block.
+ for (int i = 0; i < 16; i++)
+ {
+ int subset = BC67Tables.PartitionTable[subsetCount - 1][partition][i];
+ byte index = colorIndices[i];
+
+ // The end points of this subset were swapped, so the index needs to be inverted.
+ if (colorSwapSubset[subset])
+ {
+ index = (byte)(index ^ (colorIndexCount - 1));
+ }
+
+ // The index at the fix-up position of the subset has its most significant bit omitted.
+ int finalIndexBitCount = i == fixUpTable[subset] ? colorIndexBitCount - 1 : colorIndexBitCount;
+
+ Debug.Assert(index < (1 << finalIndexBitCount));
+
+ block.Encode(index, ref offset, finalIndexBitCount);
+ }
+
+ // Second index set, where only the first index has its most significant bit omitted.
+ if (separateAlphaIndices)
+ {
+ for (int i = 0; i < 16; i++)
+ {
+ byte index = alphaIndices[i];
+
+ if (alphaSwapSubset)
+ {
+ index = (byte)(index ^ (alphaIndexCount - 1));
+ }
+
+ int finalIndexBitCount = i == 0 ? alphaIndexBitCount - 1 : alphaIndexBitCount;
+
+ Debug.Assert(index < (1 << finalIndexBitCount));
+
+ block.Encode(index, ref offset, finalIndexBitCount);
+ }
+ }
+
+ return block;
+ }
+
+ /// <summary>
+ /// Estimates the error of encoding a tile with a given partition, using the end points
+ /// from the fast end point selection and a palette with 8 colors per subset.
+ /// </summary>
+ /// <param name="tile">Tile pixels, tightly packed as <paramref name="h"/> rows of <paramref name="w"/> pixels</param>
+ /// <param name="subsetCount">Number of subsets of the partition</param>
+ /// <param name="partition">Partition to evaluate</param>
+ /// <param name="w">Width of the tile</param>
+ /// <param name="h">Height of the tile</param>
+ /// <param name="maxError">Error from which the partition is no longer interesting to the caller</param>
+ /// <returns>
+ /// Sum of the squared difference between each pixel and its closest palette color,
+ /// or <see cref="int.MaxValue"/> if the sum reached <paramref name="maxError"/>
+ /// </returns>
+ private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan<uint> tile, int subsetCount, int partition, int w, int h, int maxError)
+ {
+     byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+     Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
+     Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];
+
+     BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);
+
+     Span<uint> endPoints0 = stackalloc uint[subsetCount];
+     Span<uint> endPoints1 = stackalloc uint[subsetCount];
+
+     SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue);
+
+     Span<RgbaColor32> palette = stackalloc RgbaColor32[8];
+
+     int errorSum = 0;
+
+     for (int subset = 0; subset < subsetCount; subset++)
+     {
+         uint c0 = endPoints0[subset];
+         uint c1 = endPoints1[subset];
+
+         int pBit0 = GetPBit(c0, 6, 0);
+         int pBit1 = GetPBit(c1, 6, 0);
+
+         c0 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32();
+         c1 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32();
+
+         if (Sse41.IsSupported)
+         {
+             Vector128<byte> c0Rep = Vector128.Create(c0).AsByte();
+             Vector128<byte> c1Rep = Vector128.Create(c1).AsByte();
+
+             // Interleave the components of both end points, so that each pair
+             // can be multiplied by a pair of weights and summed with a single instruction.
+             Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
+
+             Vector128<byte> rWeights;
+             Vector128<byte> lWeights;
+
+             fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
+             {
+                 rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
+                 lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
+             }
+
+             // Replicate each pair of weights 4 times (once per component),
+             // each of the vectors below has the weights for 2 palette colors.
+             Vector128<byte> iWeights = Sse2.UnpackLow(rWeights, lWeights);
+             Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+             Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+             Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+             Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+             Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
+             Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
+
+             static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
+             {
+                 return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6);
+             }
+
+             Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
+             Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
+             Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
+             Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
+
+             int pixelIndex = 0;
+
+             for (int ty = 0; ty < h; ty++)
+             {
+                 for (int tx = 0; tx < w; tx++, pixelIndex++)
+                 {
+                     // The partition table is always for a 4x4 block, while the tile is tightly packed,
+                     // so the table must be indexed with the position and not the index of the pixel on the tile.
+                     if (partitionTable[ty * 4 + tx] != subset)
+                     {
+                         continue;
+                     }
+
+                     uint c = tile[pixelIndex];
+
+                     Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
+
+                     Vector128<short> delta0 = Sse2.Subtract(color, pal0);
+                     Vector128<short> delta1 = Sse2.Subtract(color, pal1);
+                     Vector128<short> delta2 = Sse2.Subtract(color, pal2);
+                     Vector128<short> delta3 = Sse2.Subtract(color, pal3);
+
+                     Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
+                     Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
+                     Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
+                     Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
+
+                     Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
+                     Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
+
+                     // Squared differences for the 8 palette colors, saturated to 16 bits.
+                     Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
+
+                     Vector128<ushort> min = Sse41.MinHorizontal(delta);
+
+                     errorSum += min.GetElement(0);
+                 }
+             }
+         }
+         else
+         {
+             RgbaColor32 e032 = RgbaColor8.FromUInt32(c0).GetColor32();
+             RgbaColor32 e132 = RgbaColor8.FromUInt32(c1).GetColor32();
+
+             palette[0] = e032;
+             palette[palette.Length - 1] = e132;
+
+             for (int i = 1; i < palette.Length - 1; i++)
+             {
+                 palette[i] = BC67Utils.Interpolate(e032, e132, i, 3);
+             }
+
+             int pixelIndex = 0;
+
+             for (int ty = 0; ty < h; ty++)
+             {
+                 for (int tx = 0; tx < w; tx++, pixelIndex++)
+                 {
+                     // The partition table is always for a 4x4 block, while the tile is tightly packed,
+                     // so the table must be indexed with the position and not the index of the pixel on the tile.
+                     if (partitionTable[ty * 4 + tx] != subset)
+                     {
+                         continue;
+                     }
+
+                     uint c = tile[pixelIndex];
+                     RgbaColor32 color = Unsafe.As<uint, RgbaColor8>(ref c).GetColor32();
+
+                     int bestMatchScore = int.MaxValue;
+
+                     for (int j = 0; j < palette.Length; j++)
+                     {
+                         int score = BC67Utils.SquaredDifference(color, palette[j]);
+
+                         if (score < bestMatchScore)
+                         {
+                             bestMatchScore = score;
+                         }
+                     }
+
+                     errorSum += bestMatchScore;
+                 }
+             }
+         }
+
+         // No point in continuing if we are already above maximum.
+         if (errorSum >= maxError)
+         {
+             return int.MaxValue;
+         }
+     }
+
+     return errorSum;
+ }
+
+ /// <summary>
+ /// Selects the end points of all the subsets of a tile, for the channels selected by the write mask.
+ /// </summary>
+ /// <param name="tile">Tile pixels, tightly packed as <paramref name="h"/> rows of <paramref name="w"/> pixels</param>
+ /// <param name="w">Width of the tile</param>
+ /// <param name="h">Height of the tile</param>
+ /// <param name="endPoints0">First end point of each subset, only the bits on the write mask are modified</param>
+ /// <param name="endPoints1">Second end point of each subset, only the bits on the write mask are modified</param>
+ /// <param name="subsetCount">Number of subsets</param>
+ /// <param name="partition">Partition used to assign the pixels to each subset</param>
+ /// <param name="indexBitCount">Number of bits of the indices that will be used with the end points</param>
+ /// <param name="colorDepth">Number of bits of the color components of the end points</param>
+ /// <param name="alphaDepth">Number of bits of the alpha component of the end points</param>
+ /// <param name="writeMask">Mask of the color bits that should be written to the end points</param>
+ /// <param name="fastMode">True to use the fast end point selection, false to use the slower one</param>
+ private static void SelectEndPoints(
+     ReadOnlySpan<uint> tile,
+     int w,
+     int h,
+     Span<uint> endPoints0,
+     Span<uint> endPoints1,
+     int subsetCount,
+     int partition,
+     int indexBitCount,
+     int colorDepth,
+     int alphaDepth,
+     uint writeMask,
+     bool fastMode)
+ {
+     byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+     Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
+     Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];
+
+     BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);
+
+     uint inverseMask = ~writeMask;
+
+     // Force all the bits outside of the write mask to 1, so that those channels are the same on all colors.
+     for (int subset = 0; subset < subsetCount; subset++)
+     {
+         Unsafe.As<RgbaColor8, uint>(ref minColors[subset]) |= inverseMask;
+         Unsafe.As<RgbaColor8, uint>(ref maxColors[subset]) |= inverseMask;
+     }
+
+     if (fastMode)
+     {
+         SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, writeMask);
+
+         return;
+     }
+
+     // Unique colors of each subset, there are 16 entries reserved for each one.
+     Span<RgbaColor8> uniqueColors = stackalloc RgbaColor8[subsetCount * 16];
+     Span<byte> uniqueCounts = stackalloc byte[subsetCount];
+
+     int pixelIndex = 0;
+
+     for (int ty = 0; ty < h; ty++)
+     {
+         for (int tx = 0; tx < w; tx++)
+         {
+             int subset = partitionTable[ty * 4 + tx];
+             RgbaColor8 color = RgbaColor8.FromUInt32(tile[pixelIndex++] | inverseMask);
+
+             int subsetBase = subset * 16;
+             int known = uniqueCounts[subset];
+             bool alreadyPresent = false;
+
+             for (int k = 0; k < known; k++)
+             {
+                 if (uniqueColors[subsetBase + k] == color)
+                 {
+                     alreadyPresent = true;
+                     break;
+                 }
+             }
+
+             if (!alreadyPresent)
+             {
+                 uniqueColors[subsetBase + uniqueCounts[subset]++] = color;
+             }
+         }
+     }
+
+     for (int subset = 0; subset < subsetCount; subset++)
+     {
+         ReadOnlySpan<RgbaColor8> subsetColors = uniqueColors.Slice(subset * 16, uniqueCounts[subset]);
+
+         (RgbaColor8 e0, RgbaColor8 e1) = SelectEndPoints(
+             subsetColors,
+             minColors[subset],
+             maxColors[subset],
+             indexBitCount,
+             colorDepth,
+             alphaDepth,
+             inverseMask);
+
+         // Only replace the bits that are on the write mask.
+         endPoints0[subset] = (endPoints0[subset] & inverseMask) | (e0.ToUInt32() & writeMask);
+         endPoints1[subset] = (endPoints1[subset] & inverseMask) | (e1.ToUInt32() & writeMask);
+     }
+ }
+
+ /// <summary>
+ /// Selects the end points of all the subsets of a tile, by picking for each subset the two pixels
+ /// with the lowest and highest dot product with the direction from the minimum to the maximum color.
+ /// </summary>
+ /// <param name="partitionTable">Subset of each pixel of a 4x4 block</param>
+ /// <param name="tile">Tile pixels, tightly packed as <paramref name="h"/> rows of <paramref name="w"/> pixels</param>
+ /// <param name="w">Width of the tile</param>
+ /// <param name="h">Height of the tile</param>
+ /// <param name="subsetCount">Number of subsets</param>
+ /// <param name="minColors">Minimum color of each subset</param>
+ /// <param name="maxColors">Maximum color of each subset</param>
+ /// <param name="endPoints0">First end point of each subset, only the bits on the write mask are modified</param>
+ /// <param name="endPoints1">Second end point of each subset, only the bits on the write mask are modified</param>
+ /// <param name="writeMask">Mask of the color bits that should be written to the end points</param>
+ private static unsafe void SelectEndPointsFast(
+ ReadOnlySpan<byte> partitionTable,
+ ReadOnlySpan<uint> tile,
+ int w,
+ int h,
+ int subsetCount,
+ ReadOnlySpan<RgbaColor8> minColors,
+ ReadOnlySpan<RgbaColor8> maxColors,
+ Span<uint> endPoints0,
+ Span<uint> endPoints1,
+ uint writeMask)
+ {
+ uint inverseMask = ~writeMask;
+
+ // The vectorized path loads 4 rows of 4 pixels, so it can only be used for full tiles.
+ if (Sse41.IsSupported && w == 4 && h == 4)
+ {
+ Vector128<byte> row0, row1, row2, row3;
+ Vector128<short> ones = Vector128<short>.AllBitsSet;
+
+ fixed (uint* pTile = tile)
+ {
+ row0 = Sse2.LoadVector128(pTile).AsByte();
+ row1 = Sse2.LoadVector128(pTile + 4).AsByte();
+ row2 = Sse2.LoadVector128(pTile + 8).AsByte();
+ row3 = Sse2.LoadVector128(pTile + 12).AsByte();
+ }
+
+ Vector128<byte> partitionMask;
+
+ fixed (byte* pPartitionTable = partitionTable)
+ {
+ partitionMask = Sse2.LoadVector128(pPartitionTable);
+ }
+
+ for (int subset = 0; subset < subsetCount; subset++)
+ {
+ // Direction from the minimum to the maximum color, scaled by 64 and divided by the sum of its components.
+ RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
+ int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A;
+ if (sum != 0)
+ {
+ blockDir = (blockDir << 6) / new RgbaColor32(sum);
+ }
+
+ Vector128<byte> bd = Vector128.Create(blockDir.GetColor8().ToUInt32()).AsByte();
+
+ // Multiply the components of each pixel by the direction and sum the adjacent pairs...
+ Vector128<short> delta0 = Ssse3.MultiplyAddAdjacent(row0, bd.AsSByte());
+ Vector128<short> delta1 = Ssse3.MultiplyAddAdjacent(row1, bd.AsSByte());
+ Vector128<short> delta2 = Ssse3.MultiplyAddAdjacent(row2, bd.AsSByte());
+ Vector128<short> delta3 = Ssse3.MultiplyAddAdjacent(row3, bd.AsSByte());
+
+ // ...then sum the pairs to get the dot product of the first and last 8 pixels of the tile.
+ Vector128<short> delta01 = Ssse3.HorizontalAdd(delta0, delta1);
+ Vector128<short> delta23 = Ssse3.HorizontalAdd(delta2, delta3);
+
+ // All bits set for the pixels that are NOT part of this subset.
+ Vector128<byte> subsetMask = Sse2.Xor(Sse2.CompareEqual(partitionMask, Vector128.Create((byte)subset)), ones.AsByte());
+
+ // Expand the mask from 8 to 16 bits per pixel, to match the size of the dot products.
+ Vector128<short> subsetMask01 = Sse2.UnpackLow(subsetMask, subsetMask).AsInt16();
+ Vector128<short> subsetMask23 = Sse2.UnpackHigh(subsetMask, subsetMask).AsInt16();
+
+ // Pixels outside of the subset get the value 0xFFFF on both the minimum and the (inverted) maximum search.
+ Vector128<ushort> min01 = Sse41.MinHorizontal(Sse2.Or(delta01, subsetMask01).AsUInt16());
+ Vector128<ushort> min23 = Sse41.MinHorizontal(Sse2.Or(delta23, subsetMask23).AsUInt16());
+ Vector128<ushort> max01 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask01, delta01), ones).AsUInt16());
+ Vector128<ushort> max23 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask23, delta23), ones).AsUInt16());
+
+ // The lower 16 bits have the minimum value, and the upper 16 bits the index of the element.
+ uint minPos01 = min01.AsUInt32().GetElement(0);
+ uint minPos23 = min23.AsUInt32().GetElement(0);
+ uint maxPos01 = max01.AsUInt32().GetElement(0);
+ uint maxPos23 = max23.AsUInt32().GetElement(0);
+
+ // Indices from the second half are relative to the third row, hence the 8 pixels offset.
+ uint minDistColor = (ushort)minPos23 < (ushort)minPos01
+ ? tile[(int)(minPos23 >> 16) + 8]
+ : tile[(int)(minPos01 >> 16)];
+
+ // Note that we calculate the maximum as the minimum of the inverse, so less here is actually greater.
+ uint maxDistColor = (ushort)maxPos23 < (ushort)maxPos01
+ ? tile[(int)(maxPos23 >> 16) + 8]
+ : tile[(int)(maxPos01 >> 16)];
+
+ // Only replace the bits that are on the write mask.
+ endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor & writeMask);
+ endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor & writeMask);
+ }
+ }
+ else
+ {
+ for (int subset = 0; subset < subsetCount; subset++)
+ {
+ RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
+ blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);
+
+ int minDist = int.MaxValue;
+ int maxDist = int.MinValue;
+
+ RgbaColor8 minDistColor = default;
+ RgbaColor8 maxDistColor = default;
+
+ // The tile is tightly packed, while the partition table is always for a 4x4 block,
+ // so the tile index is tracked separately from the pixel position.
+ int i = 0;
+ for (int ty = 0; ty < h; ty++)
+ {
+ for (int tx = 0; tx < w; tx++, i++)
+ {
+ if (partitionTable[ty * 4 + tx] != subset)
+ {
+ continue;
+ }
+
+ RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]);
+ int dist = RgbaColor32.Dot(color.GetColor32(), blockDir);
+
+ if (minDist > dist)
+ {
+ minDist = dist;
+ minDistColor = color;
+ }
+
+ if (maxDist < dist)
+ {
+ maxDist = dist;
+ maxDistColor = color;
+ }
+ }
+ }
+
+ // Only replace the bits that are on the write mask.
+ endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor.ToUInt32() & writeMask);
+ endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor.ToUInt32() & writeMask);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Selects the end points for a set of colors, by fitting a line to the colors for different
+ /// index ranges, and keeping the end points that give the lowest index selection error.
+ /// </summary>
+ /// <param name="values">Colors to select the end points for</param>
+ /// <param name="minValue">Minimum color of the set</param>
+ /// <param name="maxValue">Maximum color of the set</param>
+ /// <param name="indexBitCount">Number of bits of the indices that will be used with the end points</param>
+ /// <param name="colorDepth">Number of bits of the color components of the end points</param>
+ /// <param name="alphaDepth">Number of bits of the alpha component of the end points</param>
+ /// <param name="alphaMask">Mask passed to the index selection used to calculate the error of each candidate</param>
+ /// <returns>The selected end points, or default values if <paramref name="values"/> is empty</returns>
+ private static (RgbaColor8, RgbaColor8) SelectEndPoints(
+     ReadOnlySpan<RgbaColor8> values,
+     RgbaColor8 minValue,
+     RgbaColor8 maxValue,
+     int indexBitCount,
+     int colorDepth,
+     int alphaDepth,
+     uint alphaMask)
+ {
+     int n = values.Length;
+     int numInterpolatedColors = 1 << indexBitCount;
+     int numInterpolatedColorsMinus1 = numInterpolatedColors - 1;
+
+     if (n == 0)
+     {
+         return (default, default);
+     }
+
+     minValue = BC67Utils.Quantize(minValue, colorDepth, alphaDepth);
+     maxValue = BC67Utils.Quantize(maxValue, colorDepth, alphaDepth);
+
+     // Direction from the minimum to the maximum color, scaled by 64 and divided by the sum of its components.
+     RgbaColor32 blockDir = maxValue.GetColor32() - minValue.GetColor32();
+     blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);
+
+     int minDist = int.MaxValue;
+     int maxDist = 0;
+
+     // Find the range of the projection of the (quantized) colors on the direction.
+     for (int i = 0; i < values.Length; i++)
+     {
+         RgbaColor8 color = values[i];
+         int dist = RgbaColor32.Dot(BC67Utils.Quantize(color, colorDepth, alphaDepth).GetColor32(), blockDir);
+
+         if (minDist >= dist)
+         {
+             minDist = dist;
+         }
+
+         if (maxDist <= dist)
+         {
+             maxDist = dist;
+         }
+     }
+
+     int distRange = Math.Max(1, maxDist - minDist);
+
+     RgbaColor32 nV = new RgbaColor32(n);
+
+     int bestErrorSum = int.MaxValue;
+     RgbaColor8 bestE0 = default;
+     RgbaColor8 bestE1 = default;
+
+     Span<int> indices = stackalloc int[n];
+     Span<RgbaColor32> colors = stackalloc RgbaColor32[n];
+
+     // Try all the possible index ranges: "maxIndex" is the size of the range,
+     // and "start" (on the inner loop) is the first index of the range.
+     for (int maxIndex = numInterpolatedColorsMinus1; maxIndex >= 1; maxIndex--)
+     {
+         int sumX = 0;
+         int sumXX = 0;
+         int sumXXIncrement = 0;
+
+         for (int i = 0; i < values.Length; i++)
+         {
+             RgbaColor32 color = values[i].GetColor32();
+
+             int dist = RgbaColor32.Dot(color, blockDir);
+
+             // Position of the color on the range, scaled by 64, then mapped to an index with rounding.
+             int normalizedValue = ((dist - minDist) << 6) / distRange;
+             int texelIndex = (normalizedValue * maxIndex + 32) >> 6;
+
+             indices[i] = texelIndex;
+             colors[i] = color;
+
+             sumX += texelIndex;
+             sumXX += texelIndex * texelIndex;
+             sumXXIncrement += 1 + texelIndex * 2;
+         }
+
+         for (int start = 0; start < numInterpolatedColors - maxIndex; start++)
+         {
+             RgbaColor32 sumY = new RgbaColor32(0);
+             RgbaColor32 sumXY = new RgbaColor32(0);
+
+             for (int i = 0; i < indices.Length; i++)
+             {
+                 RgbaColor32 y = colors[i];
+
+                 sumY += y;
+                 sumXY += new RgbaColor32(start + indices[i]) * y;
+             }
+
+             // Least squares fit of "color = m * index + b" for each component, with "m" scaled by 64.
+             RgbaColor32 sumXV = new RgbaColor32(sumX);
+             RgbaColor32 sumXXV = new RgbaColor32(sumXX);
+             RgbaColor32 m = RgbaColor32.DivideGuarded((nV * sumXY - sumXV * sumY) << 6, nV * sumXXV - sumXV * sumXV, 0);
+             RgbaColor32 b = ((sumY << 6) - m * sumXV) / nV;
+
+             // The candidates are the values of the fitted line at the first and last indices.
+             RgbaColor8 candidateE0 = (b >> 6).GetColor8();
+             RgbaColor8 candidateE1 = ((b + m * new RgbaColor32(numInterpolatedColorsMinus1)) >> 6).GetColor8();
+
+             int pBit0 = GetPBit(candidateE0.ToUInt32(), colorDepth, alphaDepth);
+             int pBit1 = GetPBit(candidateE1.ToUInt32(), colorDepth, alphaDepth);
+
+             int errorSum = BC67Utils.SelectIndices(
+                 MemoryMarshal.Cast<RgbaColor8, uint>(values),
+                 candidateE0.ToUInt32(),
+                 candidateE1.ToUInt32(),
+                 pBit0,
+                 pBit1,
+                 indexBitCount,
+                 numInterpolatedColors,
+                 colorDepth,
+                 alphaDepth,
+                 alphaMask);
+
+             if (errorSum <= bestErrorSum)
+             {
+                 bestErrorSum = errorSum;
+                 bestE0 = candidateE0;
+                 bestE1 = candidateE1;
+             }
+
+             // Update the sums for the next iteration, where all indices are incremented by 1:
+             // sum(x + 1) = sum(x) + n, and sum((x + 1)^2) = sum(x^2) + sum(2x + 1).
+             sumX += n;
+             sumXX += sumXXIncrement;
+             sumXXIncrement += 2 * n;
+         }
+     }
+
+     return (bestE0, bestE1);
+ }
+
+ /// <summary>
+ /// Selects the p-bit value for a color, using the value that is best for most of its components.
+ /// </summary>
+ /// <param name="color">Color to select the p-bit for, with alpha on the most significant byte</param>
+ /// <param name="colorDepth">Number of bits of the color components</param>
+ /// <param name="alphaDepth">Number of bits of the alpha component, or 0 if alpha should be ignored</param>
+ /// <returns>1 if, after rounding, at least 2 of the components have the p-bit set, 0 otherwise</returns>
+ private static int GetPBit(uint color, int colorDepth, int alphaDepth)
+ {
+     bool hasAlpha = alphaDepth != 0;
+
+     // If alpha is 0, let's assume the color information is not too important and prefer
+     // to preserve alpha instead.
+     if (hasAlpha && (color >> 24) == 0)
+     {
+         return 0;
+     }
+
+     // Mask of the bit right below the bits that are stored for each component.
+     uint pBitMask = 0x808080u >> colorDepth;
+
+     if (hasAlpha)
+     {
+         pBitMask |= 0x80000000u >> alphaDepth;
+     }
+
+     // Clearing the top bit of each component ensures the rounding below can't carry into the next component.
+     uint rounded = (color & 0x7f7f7f7fu) + (pBitMask >> 1);
+
+     return BitOperations.PopCount(rounded & pBitMask) >= 2 ? 1 : 0;
+ }
+
+ /// <summary>
+ /// Selects the p-bit value shared by a pair of end points.
+ /// </summary>
+ /// <param name="c0">First end point</param>
+ /// <param name="c1">Second end point (currently not used for the selection)</param>
+ /// <param name="colorDepth">Number of bits of the color components</param>
+ /// <param name="alphaDepth">Number of bits of the alpha component, or 0 if alpha should be ignored</param>
+ /// <returns>The p-bit value selected for the first end point</returns>
+ private static int GetPBit(uint c0, uint c1, int colorDepth, int alphaDepth)
+ {
+     // Giving preference to the first endpoint yields better results,
+     // might be a side effect of the endpoint selection algorithm?
+     int firstEndPointPBit = GetPBit(c0, colorDepth, alphaDepth);
+
+     return firstEndPointPBit;
+ }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs b/src/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs
new file mode 100644
index 00000000..5734d301
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs
@@ -0,0 +1,10 @@
+namespace Ryujinx.Graphics.Texture.Encoders
+{
+    /// <summary>
+    /// Encoder mode selector. The mode itself lives on the bits covered by <see cref="ModeMask"/>,
+    /// <see cref="Multithreaded"/> is an independent bit placed right above that mask.
+    /// </summary>
+    enum EncodeMode
+    {
+        // Mode values (low byte).
+        Fast = 0,
+        Exhaustive = 1,
+
+        // Mask that isolates the mode value from the other bits.
+        ModeMask = 0xff,
+
+        // Extra bit that can be combined with a mode value.
+        Multithreaded = 0x100
+    }
+}
diff --git a/src/Ryujinx.Graphics.Texture/LayoutConverter.cs b/src/Ryujinx.Graphics.Texture/LayoutConverter.cs
new file mode 100644
index 00000000..09eaf300
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/LayoutConverter.cs
@@ -0,0 +1,591 @@
+using Ryujinx.Common;
+using System;
+using System.Runtime.Intrinsics;
+using static Ryujinx.Graphics.Texture.BlockLinearConstants;
+
+namespace Ryujinx.Graphics.Texture
+{
+ public static class LayoutConverter
+ {
+ public const int HostStrideAlignment = 4; // Byte alignment the converters in this class apply to the row stride of linear data.
+
+        /// <summary>
+        /// Converts a single 2D image from the block linear layout into a linear layout.
+        /// </summary>
+        /// <remarks>
+        /// The copy goes through raw pointers, so neither <paramref name="dst"/> nor
+        /// <paramref name="data"/> is bounds checked. Both must be large enough for the image.
+        /// </remarks>
+        /// <param name="dst">Receives the linear image, rows start <paramref name="stride"/> bytes apart</param>
+        /// <param name="width">Width of the image, in pixels of <paramref name="bytesPerPixel"/> bytes</param>
+        /// <param name="height">Number of rows of the image</param>
+        /// <param name="stride">Distance in bytes between two rows on <paramref name="dst"/></param>
+        /// <param name="bytesPerPixel">Size of a pixel in bytes, must be 1, 2, 4, 8, 12 or 16</param>
+        /// <param name="gobBlocksInY">Block height in GOBs, forwarded to <see cref="BlockLinearLayout"/></param>
+        /// <param name="data">Block linear source data</param>
+        /// <exception cref="NotSupportedException"><paramref name="bytesPerPixel"/> is not supported</exception>
+        public static void ConvertBlockLinearToLinear(
+            Span<byte> dst,
+            int width,
+            int height,
+            int stride,
+            int bytesPerPixel,
+            int gobBlocksInY,
+            ReadOnlySpan<byte> data)
+        {
+            int lineSize = width * bytesPerPixel;
+
+            // Bytes [0, strideTrunc) of each row are copied 16 at a time, the remaining ones pixel by pixel.
+            // The split point must also be a whole number of pixels, otherwise the per pixel loop starts
+            // on a pixel that does not match the output pointer. This only matters for 12 bytes per pixel,
+            // all other supported sizes divide 16.
+            int strideTrunc = BitUtils.AlignDown(lineSize, 16);
+
+            while (strideTrunc % bytesPerPixel != 0)
+            {
+                strideTrunc -= 16;
+            }
+
+            int strideTrunc64 = BitUtils.AlignDown(strideTrunc, 64);
+
+            int xStart = strideTrunc / bytesPerPixel;
+
+            int outStrideGap = stride - lineSize;
+
+            int alignment = GobStride / bytesPerPixel;
+
+            int wAligned = BitUtils.AlignUp(width, alignment);
+
+            BlockLinearLayout layoutConverter = new BlockLinearLayout(wAligned, height, gobBlocksInY, 1, bytesPerPixel);
+
+            unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
+            {
+                fixed (byte* outputPtr = output, dataPtr = data)
+                {
+                    byte* outPtr = outputPtr;
+
+                    for (int y = 0; y < height; y++)
+                    {
+                        layoutConverter.SetY(y);
+
+                        // 64 bytes per iteration: the four 16 byte pieces are read from
+                        // +0x0, +0x20, +0x100 and +0x120 relative to the layout offset.
+                        for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64)
+                        {
+                            byte* offset = dataPtr + layoutConverter.GetOffsetWithLineOffset64(x);
+                            byte* offset2 = offset + 0x20;
+                            byte* offset3 = offset + 0x100;
+                            byte* offset4 = offset + 0x120;
+
+                            Vector128<byte> value = *(Vector128<byte>*)offset;
+                            Vector128<byte> value2 = *(Vector128<byte>*)offset2;
+                            Vector128<byte> value3 = *(Vector128<byte>*)offset3;
+                            Vector128<byte> value4 = *(Vector128<byte>*)offset4;
+
+                            *(Vector128<byte>*)outPtr = value;
+                            *(Vector128<byte>*)(outPtr + 16) = value2;
+                            *(Vector128<byte>*)(outPtr + 32) = value3;
+                            *(Vector128<byte>*)(outPtr + 48) = value4;
+                        }
+
+                        // 16 bytes per iteration for what the 64 byte loop left behind.
+                        for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16)
+                        {
+                            byte* offset = dataPtr + layoutConverter.GetOffsetWithLineOffset16(x);
+
+                            *(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset;
+                        }
+
+                        // One pixel per iteration for the end of the row.
+                        for (int x = xStart; x < width; x++, outPtr += bytesPerPixel)
+                        {
+                            byte* offset = dataPtr + layoutConverter.GetOffset(x);
+
+                            *(T*)outPtr = *(T*)offset;
+                        }
+
+                        // Skip the padding between the end of this row and the start of the next one.
+                        outPtr += outStrideGap;
+                    }
+                }
+                return true;
+            }
+
+            bool _ = bytesPerPixel switch
+            {
+                1 => Convert<byte>(dst, data),
+                2 => Convert<ushort>(dst, data),
+                4 => Convert<uint>(dst, data),
+                8 => Convert<ulong>(dst, data),
+                12 => Convert<Bpp12Pixel>(dst, data),
+                16 => Convert<Vector128<byte>>(dst, data),
+                _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
+            };
+        }
+
+        /// <summary>
+        /// Converts a block linear texture, with all its levels and layers, into tightly packed linear data.
+        /// </summary>
+        /// <remarks>
+        /// The output holds the levels one after the other; inside a level, data is ordered by layer,
+        /// then depth slice, then row. Rows are padded to <see cref="HostStrideAlignment"/> bytes.
+        /// The copy goes through raw pointers, so <paramref name="data"/> is not bounds checked.
+        /// </remarks>
+        /// <param name="width">Width of the base level, in texels</param>
+        /// <param name="height">Height of the base level, in texels</param>
+        /// <param name="depth">Depth of the base level, used to derive the layout of each level</param>
+        /// <param name="sliceDepth">Number of depth slices of the base level that are converted</param>
+        /// <param name="levels">Number of mipmap levels</param>
+        /// <param name="layers">Number of layers</param>
+        /// <param name="blockWidth">Width of a compression block in texels (1 if not compressed)</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <param name="bytesPerPixel">Size of a pixel or block in bytes, must be 1, 2, 4, 8, 12 or 16</param>
+        /// <param name="gobBlocksInY">Block height in GOBs for the base level</param>
+        /// <param name="gobBlocksInZ">Block depth in GOBs for the base level</param>
+        /// <param name="gobBlocksInTileX">Number of GOBs per tile on the X direction</param>
+        /// <param name="sizeInfo">Provides the layer size and the offset of each level on <paramref name="data"/></param>
+        /// <param name="data">Block linear source data</param>
+        /// <returns>The texture data in linear layout</returns>
+        /// <exception cref="NotSupportedException"><paramref name="bytesPerPixel"/> is not supported</exception>
+        public static byte[] ConvertBlockLinearToLinear(
+            int width,
+            int height,
+            int depth,
+            int sliceDepth,
+            int levels,
+            int layers,
+            int blockWidth,
+            int blockHeight,
+            int bytesPerPixel,
+            int gobBlocksInY,
+            int gobBlocksInZ,
+            int gobBlocksInTileX,
+            SizeInfo sizeInfo,
+            ReadOnlySpan<byte> data)
+        {
+            int outSize = GetTextureSize(
+                width,
+                height,
+                sliceDepth,
+                levels,
+                layers,
+                blockWidth,
+                blockHeight,
+                bytesPerPixel);
+
+            byte[] output = new byte[outSize];
+
+            int outOffs = 0;
+
+            int mipGobBlocksInY = gobBlocksInY;
+            int mipGobBlocksInZ = gobBlocksInZ;
+
+            int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX;
+            int gobHeight = gobBlocksInY * GobHeight;
+
+            for (int level = 0; level < levels; level++)
+            {
+                int w = Math.Max(1, width >> level);
+                int h = Math.Max(1, height >> level);
+                int d = Math.Max(1, depth >> level);
+
+                w = BitUtils.DivRoundUp(w, blockWidth);
+                h = BitUtils.DivRoundUp(h, blockHeight);
+
+                // The block dimensions shrink along with the level size, and never grow back.
+                while (h <= (mipGobBlocksInY >> 1) * GobHeight && mipGobBlocksInY != 1)
+                {
+                    mipGobBlocksInY >>= 1;
+                }
+
+                while (d <= (mipGobBlocksInZ >> 1) && mipGobBlocksInZ != 1)
+                {
+                    mipGobBlocksInZ >>= 1;
+                }
+
+                int lineSize = w * bytesPerPixel;
+
+                // Bytes [0, strideTrunc) of each row are copied 16 at a time, the remaining ones pixel by pixel.
+                // The split point must also be a whole number of pixels, otherwise the per pixel loop starts
+                // on a pixel that does not match the output pointer. This only matters for 12 bytes per pixel,
+                // all other supported sizes divide 16.
+                int strideTrunc = BitUtils.AlignDown(lineSize, 16);
+
+                while (strideTrunc % bytesPerPixel != 0)
+                {
+                    strideTrunc -= 16;
+                }
+
+                int strideTrunc64 = BitUtils.AlignDown(strideTrunc, 64);
+
+                int xStart = strideTrunc / bytesPerPixel;
+
+                int stride = BitUtils.AlignUp(lineSize, HostStrideAlignment);
+
+                int outStrideGap = stride - lineSize;
+
+                int alignment = gobWidth;
+
+                if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
+                {
+                    alignment = GobStride / bytesPerPixel;
+                }
+
+                int wAligned = BitUtils.AlignUp(w, alignment);
+
+                BlockLinearLayout layoutConverter = new BlockLinearLayout(
+                    wAligned,
+                    h,
+                    mipGobBlocksInY,
+                    mipGobBlocksInZ,
+                    bytesPerPixel);
+
+                int sd = Math.Max(1, sliceDepth >> level);
+
+                unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
+                {
+                    fixed (byte* outputPtr = output, dataPtr = data)
+                    {
+                        byte* outPtr = outputPtr + outOffs;
+                        for (int layer = 0; layer < layers; layer++)
+                        {
+                            byte* inBaseOffset = dataPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));
+
+                            for (int z = 0; z < sd; z++)
+                            {
+                                layoutConverter.SetZ(z);
+                                for (int y = 0; y < h; y++)
+                                {
+                                    layoutConverter.SetY(y);
+
+                                    // 64 bytes per iteration: the four 16 byte pieces are read from
+                                    // +0x0, +0x20, +0x100 and +0x120 relative to the layout offset.
+                                    for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64)
+                                    {
+                                        byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
+                                        byte* offset2 = offset + 0x20;
+                                        byte* offset3 = offset + 0x100;
+                                        byte* offset4 = offset + 0x120;
+
+                                        Vector128<byte> value = *(Vector128<byte>*)offset;
+                                        Vector128<byte> value2 = *(Vector128<byte>*)offset2;
+                                        Vector128<byte> value3 = *(Vector128<byte>*)offset3;
+                                        Vector128<byte> value4 = *(Vector128<byte>*)offset4;
+
+                                        *(Vector128<byte>*)outPtr = value;
+                                        *(Vector128<byte>*)(outPtr + 16) = value2;
+                                        *(Vector128<byte>*)(outPtr + 32) = value3;
+                                        *(Vector128<byte>*)(outPtr + 48) = value4;
+                                    }
+
+                                    // 16 bytes per iteration for what the 64 byte loop left behind.
+                                    for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16)
+                                    {
+                                        byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);
+
+                                        *(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset;
+                                    }
+
+                                    // One pixel per iteration for the end of the row.
+                                    for (int x = xStart; x < w; x++, outPtr += bytesPerPixel)
+                                    {
+                                        byte* offset = inBaseOffset + layoutConverter.GetOffset(x);
+
+                                        *(T*)outPtr = *(T*)offset;
+                                    }
+
+                                    // Skip the row padding of the output.
+                                    outPtr += outStrideGap;
+                                }
+                            }
+                        }
+                    }
+                    return true;
+                }
+
+                bool _ = bytesPerPixel switch
+                {
+                    1 => Convert<byte>(output, data),
+                    2 => Convert<ushort>(output, data),
+                    4 => Convert<uint>(output, data),
+                    8 => Convert<ulong>(output, data),
+                    12 => Convert<Bpp12Pixel>(output, data),
+                    16 => Convert<Vector128<byte>>(output, data),
+                    _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
+                };
+
+                // Advance by what was actually written for this level. The output is sized and filled
+                // using the slice depth, so the full depth must not be used here.
+                outOffs += stride * h * sd * layers;
+            }
+            return output;
+        }
+
+        /// <summary>
+        /// Copies a linear image whose rows are <paramref name="stride"/> bytes apart into a new buffer
+        /// whose rows are aligned to <see cref="HostStrideAlignment"/> bytes.
+        /// </summary>
+        /// <param name="width">Width of the image, in texels</param>
+        /// <param name="height">Height of the image, in texels</param>
+        /// <param name="blockWidth">Width of a compression block in texels (1 if not compressed)</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <param name="lineSize">Number of bytes to copy per row, clamped to the output row size</param>
+        /// <param name="stride">Distance in bytes between two rows on <paramref name="data"/></param>
+        /// <param name="bytesPerPixel">Size of a pixel or block in bytes</param>
+        /// <param name="data">Source image</param>
+        /// <returns>The repacked image</returns>
+        public static byte[] ConvertLinearStridedToLinear(
+            int width,
+            int height,
+            int blockWidth,
+            int blockHeight,
+            int lineSize,
+            int stride,
+            int bytesPerPixel,
+            ReadOnlySpan<byte> data)
+        {
+            int blocksX = BitUtils.DivRoundUp(width, blockWidth);
+            int blocksY = BitUtils.DivRoundUp(height, blockHeight);
+
+            int outStride = BitUtils.AlignUp(blocksX * bytesPerPixel, HostStrideAlignment);
+            int copySize = Math.Min(lineSize, outStride);
+
+            byte[] output = new byte[blocksY * outStride];
+            Span<byte> outSpan = output;
+
+            for (int row = 0; row < blocksY; row++)
+            {
+                ReadOnlySpan<byte> srcLine = data.Slice(row * stride, copySize);
+                Span<byte> dstLine = outSpan.Slice(row * outStride, copySize);
+
+                srcLine.CopyTo(dstLine);
+            }
+
+            return output;
+        }
+
+        /// <summary>
+        /// Converts a single 2D image from a linear layout into the block linear layout.
+        /// </summary>
+        /// <remarks>
+        /// The copy goes through raw pointers, so neither <paramref name="dst"/> nor
+        /// <paramref name="data"/> is bounds checked. Both must be large enough for the image.
+        /// </remarks>
+        /// <param name="dst">Receives the block linear image</param>
+        /// <param name="width">Width of the image, in pixels of <paramref name="bytesPerPixel"/> bytes</param>
+        /// <param name="height">Number of rows of the image</param>
+        /// <param name="stride">Distance in bytes between two rows on <paramref name="data"/></param>
+        /// <param name="bytesPerPixel">Size of a pixel in bytes, must be 1, 2, 4, 8, 12 or 16</param>
+        /// <param name="gobBlocksInY">Block height in GOBs, forwarded to <see cref="BlockLinearLayout"/></param>
+        /// <param name="data">Linear source data</param>
+        /// <exception cref="NotSupportedException"><paramref name="bytesPerPixel"/> is not supported</exception>
+        public static void ConvertLinearToBlockLinear(
+            Span<byte> dst,
+            int width,
+            int height,
+            int stride,
+            int bytesPerPixel,
+            int gobBlocksInY,
+            ReadOnlySpan<byte> data)
+        {
+            int lineSize = width * bytesPerPixel;
+
+            // Bytes [0, strideTrunc) of each row are copied 16 at a time, the remaining ones pixel by pixel.
+            // The split point must also be a whole number of pixels, otherwise the per pixel loop starts
+            // on a pixel that does not match the input pointer. This only matters for 12 bytes per pixel,
+            // all other supported sizes divide 16.
+            int strideTrunc = BitUtils.AlignDown(lineSize, 16);
+
+            while (strideTrunc % bytesPerPixel != 0)
+            {
+                strideTrunc -= 16;
+            }
+
+            int strideTrunc64 = BitUtils.AlignDown(strideTrunc, 64);
+
+            int xStart = strideTrunc / bytesPerPixel;
+
+            int inStrideGap = stride - lineSize;
+
+            int alignment = GobStride / bytesPerPixel;
+
+            int wAligned = BitUtils.AlignUp(width, alignment);
+
+            BlockLinearLayout layoutConverter = new BlockLinearLayout(wAligned, height, gobBlocksInY, 1, bytesPerPixel);
+
+            unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
+            {
+                fixed (byte* outputPtr = output, dataPtr = data)
+                {
+                    byte* inPtr = dataPtr;
+
+                    for (int y = 0; y < height; y++)
+                    {
+                        layoutConverter.SetY(y);
+
+                        // 64 bytes per iteration: the four 16 byte pieces are written to
+                        // +0x0, +0x20, +0x100 and +0x120 relative to the layout offset.
+                        for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64)
+                        {
+                            byte* offset = outputPtr + layoutConverter.GetOffsetWithLineOffset64(x);
+                            byte* offset2 = offset + 0x20;
+                            byte* offset3 = offset + 0x100;
+                            byte* offset4 = offset + 0x120;
+
+                            Vector128<byte> value = *(Vector128<byte>*)inPtr;
+                            Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16);
+                            Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32);
+                            Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48);
+
+                            *(Vector128<byte>*)offset = value;
+                            *(Vector128<byte>*)offset2 = value2;
+                            *(Vector128<byte>*)offset3 = value3;
+                            *(Vector128<byte>*)offset4 = value4;
+                        }
+
+                        // 16 bytes per iteration for what the 64 byte loop left behind.
+                        for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16)
+                        {
+                            byte* offset = outputPtr + layoutConverter.GetOffsetWithLineOffset16(x);
+
+                            *(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr;
+                        }
+
+                        // One pixel per iteration for the end of the row.
+                        for (int x = xStart; x < width; x++, inPtr += bytesPerPixel)
+                        {
+                            byte* offset = outputPtr + layoutConverter.GetOffset(x);
+
+                            *(T*)offset = *(T*)inPtr;
+                        }
+
+                        // Skip the padding between the end of this row and the start of the next one.
+                        inPtr += inStrideGap;
+                    }
+                }
+                return true;
+            }
+
+            bool _ = bytesPerPixel switch
+            {
+                1 => Convert<byte>(dst, data),
+                2 => Convert<ushort>(dst, data),
+                4 => Convert<uint>(dst, data),
+                8 => Convert<ulong>(dst, data),
+                12 => Convert<Bpp12Pixel>(dst, data),
+                16 => Convert<Vector128<byte>>(dst, data),
+                _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
+            };
+        }
+
+        /// <summary>
+        /// Converts tightly packed linear texture data, with all its levels and layers, into block linear layout.
+        /// </summary>
+        /// <remarks>
+        /// The input is read level by level; inside a level, data is ordered by layer, then depth slice,
+        /// then row, with rows padded to <see cref="HostStrideAlignment"/> bytes.
+        /// The copy goes through raw pointers, so neither span is bounds checked.
+        /// </remarks>
+        /// <param name="output">Destination, a new buffer of <c>sizeInfo.TotalSize</c> bytes is used if empty</param>
+        /// <param name="width">Width of the base level, in texels</param>
+        /// <param name="height">Height of the base level, in texels</param>
+        /// <param name="depth">Depth of the base level, used to derive the layout of each level</param>
+        /// <param name="sliceDepth">Number of depth slices of the base level that are converted</param>
+        /// <param name="levels">Number of mipmap levels</param>
+        /// <param name="layers">Number of layers</param>
+        /// <param name="blockWidth">Width of a compression block in texels (1 if not compressed)</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <param name="bytesPerPixel">Size of a pixel or block in bytes, must be 1, 2, 4, 8, 12 or 16</param>
+        /// <param name="gobBlocksInY">Block height in GOBs for the base level</param>
+        /// <param name="gobBlocksInZ">Block depth in GOBs for the base level</param>
+        /// <param name="gobBlocksInTileX">Number of GOBs per tile on the X direction</param>
+        /// <param name="sizeInfo">Provides the layer size and the offset of each level on the output</param>
+        /// <param name="data">Linear source data</param>
+        /// <returns>The span holding the block linear data</returns>
+        /// <exception cref="NotSupportedException"><paramref name="bytesPerPixel"/> is not supported</exception>
+        public static ReadOnlySpan<byte> ConvertLinearToBlockLinear(
+            Span<byte> output,
+            int width,
+            int height,
+            int depth,
+            int sliceDepth,
+            int levels,
+            int layers,
+            int blockWidth,
+            int blockHeight,
+            int bytesPerPixel,
+            int gobBlocksInY,
+            int gobBlocksInZ,
+            int gobBlocksInTileX,
+            SizeInfo sizeInfo,
+            ReadOnlySpan<byte> data)
+        {
+            if (output.Length == 0)
+            {
+                output = new byte[sizeInfo.TotalSize];
+            }
+
+            int inOffs = 0;
+
+            int mipGobBlocksInY = gobBlocksInY;
+            int mipGobBlocksInZ = gobBlocksInZ;
+
+            int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX;
+            int gobHeight = gobBlocksInY * GobHeight;
+
+            for (int level = 0; level < levels; level++)
+            {
+                int w = Math.Max(1, width >> level);
+                int h = Math.Max(1, height >> level);
+                int d = Math.Max(1, depth >> level);
+
+                w = BitUtils.DivRoundUp(w, blockWidth);
+                h = BitUtils.DivRoundUp(h, blockHeight);
+
+                // The block dimensions shrink along with the level size, and never grow back.
+                while (h <= (mipGobBlocksInY >> 1) * GobHeight && mipGobBlocksInY != 1)
+                {
+                    mipGobBlocksInY >>= 1;
+                }
+
+                while (d <= (mipGobBlocksInZ >> 1) && mipGobBlocksInZ != 1)
+                {
+                    mipGobBlocksInZ >>= 1;
+                }
+
+                int lineSize = w * bytesPerPixel;
+
+                // Bytes [0, strideTrunc) of each row are copied 16 at a time, the remaining ones pixel by pixel.
+                // The split point must also be a whole number of pixels, otherwise the per pixel loop starts
+                // on a pixel that does not match the input pointer. This only matters for 12 bytes per pixel,
+                // all other supported sizes divide 16.
+                int strideTrunc = BitUtils.AlignDown(lineSize, 16);
+
+                while (strideTrunc % bytesPerPixel != 0)
+                {
+                    strideTrunc -= 16;
+                }
+
+                int strideTrunc64 = BitUtils.AlignDown(strideTrunc, 64);
+
+                int xStart = strideTrunc / bytesPerPixel;
+
+                int stride = BitUtils.AlignUp(lineSize, HostStrideAlignment);
+
+                int inStrideGap = stride - lineSize;
+
+                int alignment = gobWidth;
+
+                if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
+                {
+                    alignment = GobStride / bytesPerPixel;
+                }
+
+                int wAligned = BitUtils.AlignUp(w, alignment);
+
+                BlockLinearLayout layoutConverter = new BlockLinearLayout(
+                    wAligned,
+                    h,
+                    mipGobBlocksInY,
+                    mipGobBlocksInZ,
+                    bytesPerPixel);
+
+                int sd = Math.Max(1, sliceDepth >> level);
+
+                unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
+                {
+                    fixed (byte* outputPtr = output, dataPtr = data)
+                    {
+                        byte* inPtr = dataPtr + inOffs;
+                        for (int layer = 0; layer < layers; layer++)
+                        {
+                            byte* outBaseOffset = outputPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));
+
+                            for (int z = 0; z < sd; z++)
+                            {
+                                layoutConverter.SetZ(z);
+                                for (int y = 0; y < h; y++)
+                                {
+                                    layoutConverter.SetY(y);
+
+                                    // 64 bytes per iteration: the four 16 byte pieces are written to
+                                    // +0x0, +0x20, +0x100 and +0x120 relative to the layout offset.
+                                    for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64)
+                                    {
+                                        byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
+                                        byte* offset2 = offset + 0x20;
+                                        byte* offset3 = offset + 0x100;
+                                        byte* offset4 = offset + 0x120;
+
+                                        Vector128<byte> value = *(Vector128<byte>*)inPtr;
+                                        Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16);
+                                        Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32);
+                                        Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48);
+
+                                        *(Vector128<byte>*)offset = value;
+                                        *(Vector128<byte>*)offset2 = value2;
+                                        *(Vector128<byte>*)offset3 = value3;
+                                        *(Vector128<byte>*)offset4 = value4;
+                                    }
+
+                                    // 16 bytes per iteration for what the 64 byte loop left behind.
+                                    for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16)
+                                    {
+                                        byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);
+
+                                        *(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr;
+                                    }
+
+                                    // One pixel per iteration for the end of the row.
+                                    for (int x = xStart; x < w; x++, inPtr += bytesPerPixel)
+                                    {
+                                        byte* offset = outBaseOffset + layoutConverter.GetOffset(x);
+
+                                        *(T*)offset = *(T*)inPtr;
+                                    }
+
+                                    // Skip the row padding of the input.
+                                    inPtr += inStrideGap;
+                                }
+                            }
+                        }
+                    }
+                    return true;
+                }
+
+                bool _ = bytesPerPixel switch
+                {
+                    1 => Convert<byte>(output, data),
+                    2 => Convert<ushort>(output, data),
+                    4 => Convert<uint>(output, data),
+                    8 => Convert<ulong>(output, data),
+                    12 => Convert<Bpp12Pixel>(output, data),
+                    16 => Convert<Vector128<byte>>(output, data),
+                    _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
+                };
+
+                // NOTE(review): a level consumes h * sd * layers rows, but the offset advances by
+                // h * d * layers rows. Both only agree when sliceDepth equals depth; confirm which
+                // one matches the input layout the callers provide.
+                inOffs += stride * h * d * layers;
+            }
+
+            return output;
+        }
+
+        /// <summary>
+        /// Copies a linear image whose rows are aligned to <see cref="HostStrideAlignment"/> bytes
+        /// into a buffer whose rows are <paramref name="stride"/> bytes apart.
+        /// </summary>
+        /// <remarks>
+        /// When both strides match no repacking is needed: the data is copied as is into
+        /// <paramref name="output"/>, or returned directly when <paramref name="output"/> is empty.
+        /// </remarks>
+        /// <param name="output">Destination, a new buffer is allocated if empty</param>
+        /// <param name="width">Width of the image, in texels</param>
+        /// <param name="height">Height of the image, in texels</param>
+        /// <param name="blockWidth">Width of a compression block in texels (1 if not compressed)</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <param name="stride">Distance in bytes between two rows on the result</param>
+        /// <param name="bytesPerPixel">Size of a pixel or block in bytes</param>
+        /// <param name="data">Source image</param>
+        /// <returns>The span holding the strided image</returns>
+        public static ReadOnlySpan<byte> ConvertLinearToLinearStrided(
+            Span<byte> output,
+            int width,
+            int height,
+            int blockWidth,
+            int blockHeight,
+            int stride,
+            int bytesPerPixel,
+            ReadOnlySpan<byte> data)
+        {
+            int w = BitUtils.DivRoundUp(width, blockWidth);
+            int h = BitUtils.DivRoundUp(height, blockHeight);
+
+            int inStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
+
+            // A row holds w blocks of bytesPerPixel bytes. Using the width in texels here would read
+            // past the end of the row (and of the data) whenever the block width is greater than 1.
+            int lineSize = w * bytesPerPixel;
+
+            if (inStride == stride)
+            {
+                if (output.Length != 0)
+                {
+                    data.CopyTo(output);
+                    return output;
+                }
+                else
+                {
+                    return data;
+                }
+            }
+
+            if (output.Length == 0)
+            {
+                output = new byte[h * stride];
+            }
+
+            int inOffs = 0;
+            int outOffs = 0;
+
+            for (int y = 0; y < h; y++)
+            {
+                data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize));
+
+                inOffs += inStride;
+                outOffs += stride;
+            }
+
+            return output;
+        }
+
+        /// <summary>
+        /// Computes the size in bytes of a texture stored linearly, with every row aligned to
+        /// <see cref="HostStrideAlignment"/> bytes, for all the levels and layers.
+        /// </summary>
+        /// <returns>Sum of the size of all levels, multiplied by <paramref name="layers"/></returns>
+        private static int GetTextureSize(
+            int width,
+            int height,
+            int depth,
+            int levels,
+            int layers,
+            int blockWidth,
+            int blockHeight,
+            int bytesPerPixel)
+        {
+            int sizePerLayer = 0;
+
+            for (int level = 0; level < levels; level++)
+            {
+                // Level dimensions in blocks, no dimension goes below 1.
+                int blocksX = BitUtils.DivRoundUp(Math.Max(1, width >> level), blockWidth);
+                int blocksY = BitUtils.DivRoundUp(Math.Max(1, height >> level), blockHeight);
+                int slices = Math.Max(1, depth >> level);
+
+                int rowPitch = BitUtils.AlignUp(blocksX * bytesPerPixel, HostStrideAlignment);
+
+                sizePerLayer += rowPitch * blocksY * slices;
+            }
+
+            return sizePerLayer * layers;
+        }
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/OffsetCalculator.cs b/src/Ryujinx.Graphics.Texture/OffsetCalculator.cs
new file mode 100644
index 00000000..d7472e2f
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/OffsetCalculator.cs
@@ -0,0 +1,141 @@
+using Ryujinx.Common;
+using System;
+using System.Runtime.CompilerServices;
+using static Ryujinx.Graphics.Texture.BlockLinearConstants;
+
+namespace Ryujinx.Graphics.Texture
+{
+    /// <summary>
+    /// Computes byte offsets of pixels inside a 2D texture that is stored either
+    /// linearly or in the block linear layout.
+    /// </summary>
+    public class OffsetCalculator
+    {
+        private int _width;
+        private int _height;
+        private int _stride;
+        private bool _isLinear;
+        private int _bytesPerPixel;
+
+        // Only created for block linear textures.
+        private BlockLinearLayout _layoutConverter;
+
+        // Byte offset of the row selected with SetY (linear textures only).
+        private int _yPart;
+
+        /// <summary>
+        /// Creates a new offset calculator.
+        /// </summary>
+        /// <param name="width">Width of the texture, in pixels</param>
+        /// <param name="height">Height of the texture, in pixels</param>
+        /// <param name="stride">Distance in bytes between rows, used by the linear layout</param>
+        /// <param name="isLinear">True for the linear layout, false for block linear</param>
+        /// <param name="gobBlocksInY">Block height in GOBs, used by the block linear layout</param>
+        /// <param name="gobBlocksInZ">Block depth in GOBs, used by the block linear layout</param>
+        /// <param name="bytesPerPixel">Size of a pixel in bytes</param>
+        public OffsetCalculator(
+            int width,
+            int height,
+            int stride,
+            bool isLinear,
+            int gobBlocksInY,
+            int gobBlocksInZ,
+            int bytesPerPixel)
+        {
+            _bytesPerPixel = bytesPerPixel;
+            _isLinear = isLinear;
+            _stride = stride;
+            _height = height;
+            _width = width;
+
+            int alignedWidth = BitUtils.AlignUp(width, GobStride / bytesPerPixel);
+
+            if (isLinear)
+            {
+                return;
+            }
+
+            _layoutConverter = new BlockLinearLayout(
+                alignedWidth,
+                height,
+                gobBlocksInY,
+                gobBlocksInZ,
+                bytesPerPixel);
+        }
+
+        /// <summary>
+        /// Creates a new offset calculator, with a block depth of 1 GOB.
+        /// </summary>
+        public OffsetCalculator(
+            int width,
+            int height,
+            int stride,
+            bool isLinear,
+            int gobBlocksInY,
+            int bytesPerPixel) : this(width, height, stride, isLinear, gobBlocksInY, 1, bytesPerPixel)
+        {
+        }
+
+        /// <summary>
+        /// Selects the row used by the overloads that only take an X coordinate.
+        /// </summary>
+        /// <param name="y">Row to select</param>
+        public void SetY(int y)
+        {
+            if (!_isLinear)
+            {
+                _layoutConverter.SetY(y);
+                return;
+            }
+
+            _yPart = y * _stride;
+        }
+
+        /// <summary>
+        /// Gets the byte offset of the pixel at the given coordinates.
+        /// </summary>
+        public int GetOffset(int x, int y)
+        {
+            return _isLinear
+                ? x * _bytesPerPixel + y * _stride
+                : _layoutConverter.GetOffset(x, y, 0);
+        }
+
+        /// <summary>
+        /// Gets the byte offset of the pixel at the given X coordinate, on the row selected with <see cref="SetY"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int GetOffset(int x)
+        {
+            return _isLinear
+                ? x * _bytesPerPixel + _yPart
+                : _layoutConverter.GetOffset(x);
+        }
+
+        /// <summary>
+        /// Gets the offset for a position inside the row selected with <see cref="SetY"/>.
+        /// On the linear layout, <paramref name="x"/> is added as is to the row offset, so it is a byte offset.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public int GetOffsetWithLineOffset64(int x)
+        {
+            return _isLinear
+                ? x + _yPart
+                : _layoutConverter.GetOffsetWithLineOffset64(x);
+        }
+
+        /// <summary>
+        /// Gets the offset and size in bytes of the memory range that covers a rectangle of the texture.
+        /// </summary>
+        /// <remarks>
+        /// On the linear layout the size is computed with the absolute value of the stride,
+        /// while the offset uses the stride with its sign.
+        /// </remarks>
+        public (int offset, int size) GetRectangleRange(int x, int y, int width, int height)
+        {
+            if (!_isLinear)
+            {
+                return _layoutConverter.GetRectangleRange(x, y, width, height);
+            }
+
+            int absStride = Math.Abs(_stride);
+
+            int start = y * absStride + x * _bytesPerPixel;
+            int end = (y + height - 1) * absStride + (x + width) * _bytesPerPixel;
+
+            return (y * _stride + x * _bytesPerPixel, end - start);
+        }
+
+        /// <summary>
+        /// Checks if another calculator describes the same layout as this one.
+        /// </summary>
+        /// <param name="other">Calculator to compare with</param>
+        /// <returns>True if both use the same kind of layout with matching parameters</returns>
+        public bool LayoutMatches(OffsetCalculator other)
+        {
+            if (!_isLinear)
+            {
+                return !other._isLinear && _layoutConverter.LayoutMatches(other._layoutConverter);
+            }
+
+            return other._isLinear &&
+                   _width == other._width &&
+                   _height == other._height &&
+                   _stride == other._stride &&
+                   _bytesPerPixel == other._bytesPerPixel;
+        }
+    }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/PixelConverter.cs b/src/Ryujinx.Graphics.Texture/PixelConverter.cs
new file mode 100644
index 00000000..add25cd3
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/PixelConverter.cs
@@ -0,0 +1,216 @@
+using Ryujinx.Common;
+using System;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Texture
+{
+ public static class PixelConverter
+ {
+        /// <summary>
+        /// Computes how many padding pixels follow each row of the input and of the output, given that
+        /// rows are aligned to <see cref="LayoutConverter.HostStrideAlignment"/> bytes, and how many
+        /// rows fit on <paramref name="length"/> bytes of input.
+        /// </summary>
+        /// <param name="length">Size of the input data, in bytes</param>
+        /// <param name="width">Width of a row, in pixels</param>
+        /// <param name="bpp">Bytes per pixel of the input</param>
+        /// <param name="outBpp">Bytes per pixel of the output</param>
+        /// <returns>Input padding and output padding in pixels, and the number of rows</returns>
+        private static (int remainder, int outRemainder, int height) GetLineRemainders(int length, int width, int bpp, int outBpp)
+        {
+            int inPitch = BitUtils.AlignUp(width * bpp, LayoutConverter.HostStrideAlignment);
+            int outPitch = BitUtils.AlignUp(width * outBpp, LayoutConverter.HostStrideAlignment);
+
+            int inPadding = inPitch / bpp - width;
+            int outPadding = outPitch / outBpp - width;
+            int rows = length / inPitch;
+
+            return (inPadding, outPadding, rows);
+        }
+
+        /// <summary>
+        /// Widens every input byte into a 16-bit value with the upper byte set to zero.
+        /// </summary>
+        /// <param name="data">Input data, one byte per pixel</param>
+        /// <param name="width">Width of a row, in pixels</param>
+        /// <returns>Output data with two bytes per pixel, twice as large as the input</returns>
+        public unsafe static byte[] ConvertR4G4ToR4G4B4A4(ReadOnlySpan<byte> data, int width)
+        {
+            byte[] output = new byte[data.Length * 2];
+
+            (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 1, 2);
+
+            Span<ushort> outputSpan = MemoryMarshal.Cast<byte, ushort>(output);
+
+            if (remainder != 0)
+            {
+                // Rows have padding, so they must be converted one by one.
+                int src = 0;
+                int dst = 0;
+
+                for (int y = 0; y < height; y++)
+                {
+                    for (int x = 0; x < width; x++)
+                    {
+                        outputSpan[dst++] = data[src++];
+                    }
+
+                    src += remainder;
+                    dst += outRemainder;
+                }
+
+                return output;
+            }
+
+            // No padding, the whole input can be converted in one go.
+            int scalarStart = 0;
+
+            if (Sse41.IsSupported)
+            {
+                // 8 pixels per iteration, the scalar loop below handles what is left.
+                int vectorSize = data.Length & ~7;
+                scalarStart = vectorSize;
+
+                fixed (byte* inputPtr = data, outputPtr = output)
+                {
+                    for (ulong offset = 0; offset < (ulong)vectorSize; offset += 8)
+                    {
+                        Sse2.Store(outputPtr + offset * 2, Sse41.ConvertToVector128Int16(inputPtr + offset).AsByte());
+                    }
+                }
+            }
+
+            for (int i = scalarStart; i < data.Length; i++)
+            {
+                outputSpan[i] = (ushort)data[i];
+            }
+
+            return output;
+        }
+
+        /// <summary>
+        /// Expands 16-bit pixels with 5 bits of red (bits 0-4), 6 bits of green (bits 5-10) and
+        /// 5 bits of blue (bits 11-15) into 32-bit pixels with 8 bits per component and full alpha.
+        /// </summary>
+        /// <param name="data">Input data, two bytes per pixel</param>
+        /// <param name="width">Width of a row, in pixels</param>
+        /// <returns>Output data with four bytes per pixel (red on the least significant byte)</returns>
+        public unsafe static byte[] ConvertR5G6B5ToR8G8B8A8(ReadOnlySpan<byte> data, int width)
+        {
+            byte[] output = new byte[data.Length * 2];
+
+            (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 2, 4);
+
+            ReadOnlySpan<ushort> inputSpan = MemoryMarshal.Cast<byte, ushort>(data);
+            Span<uint> outputSpan = MemoryMarshal.Cast<byte, uint>(output);
+
+            int src = 0;
+            int dst = 0;
+
+            for (int y = 0; y < height; y++)
+            {
+                for (int x = 0; x < width; x++)
+                {
+                    uint packed = inputSpan[src++];
+
+                    uint r5 = packed & 0x1f;
+                    uint g6 = (packed >> 5) & 0x3f;
+                    uint b5 = (packed >> 11) & 0x1f;
+
+                    // Widen to 8 bits, the most significant bits are replicated on the new low bits.
+                    uint r8 = (r5 << 3) | (r5 >> 2);
+                    uint g8 = (g6 << 2) | (g6 >> 4);
+                    uint b8 = (b5 << 3) | (b5 >> 2);
+
+                    outputSpan[dst++] = 0xff000000 | (b8 << 16) | (g8 << 8) | r8;
+                }
+
+                src += remainder;
+                dst += outRemainder;
+            }
+
+            return output;
+        }
+
+        /// <summary>
+        /// Expands 16-bit pixels with 5 bits of red (bits 0-4), green (bits 5-9) and blue (bits 10-14),
+        /// plus 1 bit of alpha (bit 15), into 32-bit pixels with 8 bits per component.
+        /// </summary>
+        /// <param name="data">Input data, two bytes per pixel</param>
+        /// <param name="width">Width of a row, in pixels</param>
+        /// <param name="forceAlpha">True to always output full alpha, ignoring the alpha bit of the input</param>
+        /// <returns>Output data with four bytes per pixel (red on the least significant byte)</returns>
+        public unsafe static byte[] ConvertR5G5B5ToR8G8B8A8(ReadOnlySpan<byte> data, int width, bool forceAlpha)
+        {
+            byte[] output = new byte[data.Length * 2];
+
+            (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 2, 4);
+
+            ReadOnlySpan<ushort> inputSpan = MemoryMarshal.Cast<byte, ushort>(data);
+            Span<uint> outputSpan = MemoryMarshal.Cast<byte, uint>(output);
+
+            int src = 0;
+            int dst = 0;
+
+            for (int y = 0; y < height; y++)
+            {
+                for (int x = 0; x < width; x++)
+                {
+                    uint packed = inputSpan[src++];
+
+                    uint r5 = packed & 0x1f;
+                    uint g5 = (packed >> 5) & 0x1f;
+                    uint b5 = (packed >> 10) & 0x1f;
+
+                    uint alpha = (forceAlpha || (packed >> 15) != 0) ? 0xff000000u : 0u;
+
+                    // Widen to 8 bits, the most significant bits are replicated on the new low bits.
+                    uint r8 = (r5 << 3) | (r5 >> 2);
+                    uint g8 = (g5 << 3) | (g5 >> 2);
+                    uint b8 = (b5 << 3) | (b5 >> 2);
+
+                    outputSpan[dst++] = alpha | (b8 << 16) | (g8 << 8) | r8;
+                }
+
+                src += remainder;
+                dst += outRemainder;
+            }
+
+            return output;
+        }
+
+        /// <summary>
+        /// Expands 16-bit pixels with 1 bit of alpha (bit 0) and 5 bits of blue (bits 1-5),
+        /// green (bits 6-10) and red (bits 11-15) into 32-bit pixels with 8 bits per component.
+        /// </summary>
+        /// <param name="data">Input data, two bytes per pixel</param>
+        /// <param name="width">Width of a row, in pixels</param>
+        /// <returns>Output data with four bytes per pixel (red on the least significant byte)</returns>
+        public unsafe static byte[] ConvertA1B5G5R5ToR8G8B8A8(ReadOnlySpan<byte> data, int width)
+        {
+            byte[] output = new byte[data.Length * 2];
+            int offset = 0;
+            int outOffset = 0;
+
+            (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 2, 4);
+
+            ReadOnlySpan<ushort> inputSpan = MemoryMarshal.Cast<byte, ushort>(data);
+            Span<uint> outputSpan = MemoryMarshal.Cast<byte, uint>(output);
+
+            for (int y = 0; y < height; y++)
+            {
+                for (int x = 0; x < width; x++)
+                {
+                    uint packed = inputSpan[offset++];
+
+                    // Alpha is the least significant bit. Bit 15 is the top bit of the red
+                    // component (see the shifts below), so it must not be used as alpha.
+                    uint a = packed & 1;
+
+                    uint outputPacked = a * 0xff000000;
+                    outputPacked |= (packed >> 8) & 0x000000f8;
+                    outputPacked |= (packed << 5) & 0x0000f800;
+                    outputPacked |= (packed << 18) & 0x00f80000;
+
+                    // Replicate 5 bit components.
+                    outputPacked |= (outputPacked >> 5) & 0x00070707;
+
+                    outputSpan[outOffset++] = outputPacked;
+                }
+
+                offset += remainder;
+                outOffset += outRemainder;
+            }
+
+            return output;
+        }
+
+        /// <summary>
+        /// Expands 16-bit pixels with 4 bits per component (red on bits 0-3, then green, blue and alpha)
+        /// into 32-bit pixels with 8 bits per component.
+        /// </summary>
+        /// <param name="data">Input data, two bytes per pixel</param>
+        /// <param name="width">Width of a row, in pixels</param>
+        /// <returns>Output data with four bytes per pixel (red on the least significant byte)</returns>
+        public unsafe static byte[] ConvertR4G4B4A4ToR8G8B8A8(ReadOnlySpan<byte> data, int width)
+        {
+            byte[] output = new byte[data.Length * 2];
+
+            (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 2, 4);
+
+            ReadOnlySpan<ushort> inputSpan = MemoryMarshal.Cast<byte, ushort>(data);
+            Span<uint> outputSpan = MemoryMarshal.Cast<byte, uint>(output);
+
+            int src = 0;
+            int dst = 0;
+
+            for (int y = 0; y < height; y++)
+            {
+                for (int x = 0; x < width; x++)
+                {
+                    uint packed = inputSpan[src++];
+
+                    uint r4 = packed & 0xf;
+                    uint g4 = (packed >> 4) & 0xf;
+                    uint b4 = (packed >> 8) & 0xf;
+                    uint a4 = (packed >> 12) & 0xf;
+
+                    // Place every component on the low 4 bits of its own byte...
+                    uint spread = r4 | (g4 << 8) | (b4 << 16) | (a4 << 24);
+
+                    // ...then copy it to the high 4 bits of the same byte.
+                    outputSpan[dst++] = spread | (spread << 4);
+                }
+
+                src += remainder;
+                dst += outRemainder;
+            }
+
+            return output;
+        }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Region.cs b/src/Ryujinx.Graphics.Texture/Region.cs
new file mode 100644
index 00000000..e59888a0
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Region.cs
@@ -0,0 +1,14 @@
+namespace Ryujinx.Graphics.Texture
+{
+    /// <summary>
+    /// Immutable pair made of an offset and a size.
+    /// </summary>
+    public readonly struct Region
+    {
+        /// <summary>Offset where the region starts.</summary>
+        public int Offset { get; }
+
+        /// <summary>Size of the region.</summary>
+        public int Size { get; }
+
+        /// <summary>
+        /// Creates a new region.
+        /// </summary>
+        /// <param name="offset">Offset where the region starts</param>
+        /// <param name="size">Size of the region</param>
+        public Region(int offset, int size) => (Offset, Size) = (offset, size);
+    }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Ryujinx.Graphics.Texture.csproj b/src/Ryujinx.Graphics.Texture/Ryujinx.Graphics.Texture.csproj
new file mode 100644
index 00000000..70e3453c
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Ryujinx.Graphics.Texture.csproj
@@ -0,0 +1,11 @@
+<Project Sdk="Microsoft.NET.Sdk">
+ <PropertyGroup>
+ <TargetFramework>net7.0</TargetFramework>
+ <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+ </PropertyGroup>
+
+ <ItemGroup>
+ <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" />
+ </ItemGroup>
+
+</Project>
diff --git a/src/Ryujinx.Graphics.Texture/Size.cs b/src/Ryujinx.Graphics.Texture/Size.cs
new file mode 100644
index 00000000..21c45b38
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Size.cs
@@ -0,0 +1,16 @@
+namespace Ryujinx.Graphics.Texture
+{
+    /// <summary>
+    /// Immutable set of three dimensions: width, height and depth.
+    /// </summary>
+    public readonly struct Size
+    {
+        /// <summary>Width component.</summary>
+        public int Width { get; }
+
+        /// <summary>Height component.</summary>
+        public int Height { get; }
+
+        /// <summary>Depth component.</summary>
+        public int Depth { get; }
+
+        /// <summary>
+        /// Creates a new size.
+        /// </summary>
+        /// <param name="width">Width component</param>
+        /// <param name="height">Height component</param>
+        /// <param name="depth">Depth component</param>
+        public Size(int width, int height, int depth) => (Width, Height, Depth) = (width, height, depth);
+    }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/SizeCalculator.cs b/src/Ryujinx.Graphics.Texture/SizeCalculator.cs
new file mode 100644
index 00000000..5568784f
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/SizeCalculator.cs
@@ -0,0 +1,287 @@
+using Ryujinx.Common;
+using System;
+
+using static Ryujinx.Graphics.Texture.BlockLinearConstants;
+
+namespace Ryujinx.Graphics.Texture
+{
+ public static class SizeCalculator
+ {
+ private const int StrideAlignment = 32; // Alignment in bytes; divided by the bytes per pixel to get the width alignment of linear textures.
+
+        /// <summary>
+        /// Adds up the depth of every level of a 3D texture, where the depth is halved
+        /// on each level and never goes below 1.
+        /// </summary>
+        /// <param name="levels">Number of levels</param>
+        /// <param name="depth">Depth of the first level</param>
+        /// <returns>Sum of the depth of all levels</returns>
+        private static int Calculate3DOffsetCount(int levels, int depth)
+        {
+            // The first level contributes its depth as is.
+            int total = depth;
+
+            for (int remaining = levels - 1; remaining > 0; remaining--)
+            {
+                depth = Math.Max(1, depth >> 1);
+                total += depth;
+            }
+
+            return total;
+        }
+
+        /// <summary>
+        /// Computes the size and the offsets of all the levels and layers of a block linear texture.
+        /// </summary>
+        /// <param name="width">Width of the base level, in texels</param>
+        /// <param name="height">Height of the base level, in texels</param>
+        /// <param name="depth">Depth of the base level, the texture is handled as 3D if greater than 1</param>
+        /// <param name="levels">Number of mipmap levels</param>
+        /// <param name="layers">Number of layers</param>
+        /// <param name="blockWidth">Width of a compression block in texels (1 if not compressed)</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <param name="bytesPerPixel">Size of a pixel or block in bytes</param>
+        /// <param name="gobBlocksInY">Block height in GOBs for the base level</param>
+        /// <param name="gobBlocksInZ">Block depth in GOBs for the base level</param>
+        /// <param name="gobBlocksInTileX">Number of GOBs per tile on the X direction</param>
+        /// <param name="gpuLayerSize">Layer size to use instead of the computed one, when it is larger</param>
+        /// <returns>Size information of the texture</returns>
+        public static SizeInfo GetBlockLinearTextureSize(
+            int width,
+            int height,
+            int depth,
+            int levels,
+            int layers,
+            int blockWidth,
+            int blockHeight,
+            int bytesPerPixel,
+            int gobBlocksInY,
+            int gobBlocksInZ,
+            int gobBlocksInTileX,
+            int gpuLayerSize = 0)
+        {
+            bool is3D = depth > 1;
+
+            // 3D textures get one offset per slice of each level, others one offset per level of each layer.
+            int offsetCount = is3D ? Calculate3DOffsetCount(levels, depth) : levels * layers * depth;
+
+            int[] allOffsets = new int[offsetCount];
+            int[] mipOffsets = new int[levels];
+            int[] sliceSizes = new int[levels];
+            int[] levelSizes = new int[levels];
+
+            int mipGobBlocksInY = gobBlocksInY;
+            int mipGobBlocksInZ = gobBlocksInZ;
+
+            int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX;
+            int gobHeight = gobBlocksInY * GobHeight;
+
+            int layerSize = 0;
+            int depthLevelOffset = 0;
+
+            for (int level = 0; level < levels; level++)
+            {
+                int w = BitUtils.DivRoundUp(Math.Max(1, width >> level), blockWidth);
+                int h = BitUtils.DivRoundUp(Math.Max(1, height >> level), blockHeight);
+                int d = Math.Max(1, depth >> level);
+
+                // The block dimensions shrink along with the level size, and never grow back.
+                // Height has already been divided by block height, so pass it as 1.
+                (mipGobBlocksInY, mipGobBlocksInZ) = GetMipGobBlockSizes(h, d, 1, mipGobBlocksInY, mipGobBlocksInZ);
+
+                bool isSmallLevel = d < gobBlocksInZ || w <= gobWidth || h <= gobHeight;
+
+                int widthInGobs = BitUtils.AlignUp(
+                    BitUtils.DivRoundUp(w * bytesPerPixel, GobStride),
+                    isSmallLevel ? 1 : gobBlocksInTileX);
+
+                int totalBlocksOfGobsInZ = BitUtils.DivRoundUp(d, mipGobBlocksInZ);
+                int totalBlocksOfGobsInY = BitUtils.DivRoundUp(BitUtils.DivRoundUp(h, GobHeight), mipGobBlocksInY);
+
+                int robSize = widthInGobs * mipGobBlocksInY * mipGobBlocksInZ * GobSize;
+
+                if (is3D)
+                {
+                    int gobSize = mipGobBlocksInY * GobSize;
+                    int sliceStride = totalBlocksOfGobsInY * widthInGobs * gobSize;
+                    int zMask = gobBlocksInZ - 1;
+
+                    // The low bits of Z select a slice inside the block, the high bits select the block.
+                    for (int z = 0; z < d; z++)
+                    {
+                        int zInsideBlock = z & zMask;
+                        int zBlockStart = z & ~zMask;
+
+                        allOffsets[depthLevelOffset + z] = layerSize + zInsideBlock * gobSize + zBlockStart * sliceStride;
+                    }
+                }
+
+                int sliceSize = totalBlocksOfGobsInY * robSize;
+                int levelSize = totalBlocksOfGobsInZ * sliceSize;
+
+                mipOffsets[level] = layerSize;
+                sliceSizes[level] = sliceSize;
+                levelSizes[level] = levelSize;
+
+                layerSize += levelSize;
+                depthLevelOffset += d;
+            }
+
+            if (layers > 1)
+            {
+                layerSize = AlignLayerSize(
+                    layerSize,
+                    height,
+                    depth,
+                    blockHeight,
+                    gobBlocksInY,
+                    gobBlocksInZ,
+                    gobBlocksInTileX);
+            }
+
+            int totalSize;
+
+            if (layerSize >= gpuLayerSize)
+            {
+                totalSize = layerSize * layers;
+            }
+            else
+            {
+                // The last layer keeps the computed size, all the others use the provided one.
+                totalSize = (layers - 1) * gpuLayerSize + layerSize;
+                layerSize = gpuLayerSize;
+            }
+
+            if (!is3D)
+            {
+                for (int layer = 0; layer < layers; layer++)
+                {
+                    for (int level = 0; level < levels; level++)
+                    {
+                        allOffsets[layer * levels + level] = layer * layerSize + mipOffsets[level];
+                    }
+                }
+            }
+
+            return new SizeInfo(mipOffsets, allOffsets, sliceSizes, levelSizes, depth, levels, layerSize, totalSize, is3D);
+        }
+
+        /// <summary>
+        /// Computes the size of a linear texture.
+        /// </summary>
+        /// <remarks>
+        /// Non-2D or mipmapped linear textures are not supported by the Switch GPU,
+        /// so only a single case needs to be handled (2D textures without mipmaps).
+        /// </remarks>
+        /// <param name="stride">Distance in bytes between two rows of blocks</param>
+        /// <param name="height">Height of the texture, in texels</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <returns>Size information of the texture</returns>
+        public static SizeInfo GetLinearTextureSize(int stride, int height, int blockHeight)
+        {
+            int rows = BitUtils.DivRoundUp(height, blockHeight);
+
+            return new SizeInfo(stride * rows);
+        }
+
+        /// <summary>
+        /// Rounds the size of a layer up to the alignment required between layers.
+        /// </summary>
+        /// <remarks>
+        /// With less than 2 GOBs per tile on X, the size is rounded up to a multiple of the size of a
+        /// block of GOBs, using the block dimensions after they are reduced to fit the texture.
+        /// Otherwise it is aligned with the unreduced block dimensions, multiplied by the tile width.
+        /// </remarks>
+        /// <returns>The aligned layer size</returns>
+        private static int AlignLayerSize(
+            int size,
+            int height,
+            int depth,
+            int blockHeight,
+            int gobBlocksInY,
+            int gobBlocksInZ,
+            int gobBlocksInTileX)
+        {
+            if (gobBlocksInTileX >= 2)
+            {
+                return BitUtils.AlignUp(size, (gobBlocksInTileX * GobSize) * gobBlocksInY * gobBlocksInZ);
+            }
+
+            (int mipGobBlocksInY, int mipGobBlocksInZ) = GetMipGobBlockSizes(
+                height,
+                depth,
+                blockHeight,
+                gobBlocksInY,
+                gobBlocksInZ);
+
+            int blockOfGobsSize = mipGobBlocksInY * mipGobBlocksInZ * GobSize;
+
+            int excess = size % blockOfGobsSize;
+
+            return excess == 0 ? size : size + (blockOfGobsSize - excess);
+        }
+
+        /// <summary>
+        /// Computes the dimensions of a block linear texture, in blocks, after aligning them
+        /// to the dimensions used by the layout.
+        /// </summary>
+        /// <param name="width">Width of the texture, in texels</param>
+        /// <param name="height">Height of the texture, in texels</param>
+        /// <param name="depth">Depth of the texture</param>
+        /// <param name="blockWidth">Width of a compression block in texels (1 if not compressed)</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <param name="bytesPerPixel">Size of a pixel or block in bytes</param>
+        /// <param name="gobBlocksInY">Block height in GOBs</param>
+        /// <param name="gobBlocksInZ">Block depth in GOBs</param>
+        /// <param name="gobBlocksInTileX">Number of GOBs per tile on the X direction</param>
+        /// <returns>The aligned dimensions</returns>
+        public static Size GetBlockLinearAlignedSize(
+            int width,
+            int height,
+            int depth,
+            int blockWidth,
+            int blockHeight,
+            int bytesPerPixel,
+            int gobBlocksInY,
+            int gobBlocksInZ,
+            int gobBlocksInTileX)
+        {
+            int blocksX = BitUtils.DivRoundUp(width, blockWidth);
+            int blocksY = BitUtils.DivRoundUp(height, blockHeight);
+
+            int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX;
+            int gobHeight = gobBlocksInY * GobHeight;
+
+            // Textures that are small on any dimension are only aligned to the width of a single GOB.
+            bool isSmall = depth < gobBlocksInZ || blocksX <= gobWidth || blocksY <= gobHeight;
+
+            int widthAlignment = isSmall ? GobStride / bytesPerPixel : gobWidth;
+
+            // Height has already been divided by block height, so pass it as 1.
+            (int mipGobBlocksInY, int mipGobBlocksInZ) = GetMipGobBlockSizes(blocksY, depth, 1, gobBlocksInY, gobBlocksInZ);
+
+            int alignedWidth = BitUtils.AlignUp(blocksX, widthAlignment);
+            int alignedHeight = BitUtils.AlignUp(blocksY, mipGobBlocksInY * GobHeight);
+            int alignedDepth = BitUtils.AlignUp(depth, mipGobBlocksInZ);
+
+            return new Size(alignedWidth, alignedHeight, alignedDepth);
+        }
+
+        /// <summary>
+        /// Computes the dimensions of a linear texture, in blocks, with the width aligned
+        /// to <see cref="StrideAlignment"/> bytes. The depth of the result is always 1.
+        /// </summary>
+        /// <param name="width">Width of the texture, in texels</param>
+        /// <param name="height">Height of the texture, in texels</param>
+        /// <param name="blockWidth">Width of a compression block in texels (1 if not compressed)</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <param name="bytesPerPixel">Size of a pixel or block in bytes</param>
+        /// <returns>The aligned dimensions</returns>
+        public static Size GetLinearAlignedSize(
+            int width,
+            int height,
+            int blockWidth,
+            int blockHeight,
+            int bytesPerPixel)
+        {
+            int blocksX = BitUtils.DivRoundUp(width, blockWidth);
+            int blocksY = BitUtils.DivRoundUp(height, blockHeight);
+
+            int alignedWidth = BitUtils.AlignUp(blocksX, StrideAlignment / bytesPerPixel);
+
+            return new Size(alignedWidth, blocksY, 1);
+        }
+
+        /// <summary>
+        /// Halves the block height and depth (in GOBs) for as long as the texture still fits
+        /// inside a block of half the size, down to a minimum of 1.
+        /// </summary>
+        /// <param name="height">Height of the texture, in texels</param>
+        /// <param name="depth">Depth of the texture</param>
+        /// <param name="blockHeight">Height of a compression block in texels (1 if not compressed)</param>
+        /// <param name="gobBlocksInY">Initial block height in GOBs</param>
+        /// <param name="gobBlocksInZ">Initial block depth in GOBs</param>
+        /// <returns>The reduced block height and block depth, in GOBs</returns>
+        public static (int, int) GetMipGobBlockSizes(
+            int height,
+            int depth,
+            int blockHeight,
+            int gobBlocksInY,
+            int gobBlocksInZ)
+        {
+            int rows = BitUtils.DivRoundUp(height, blockHeight);
+
+            int blocksY = gobBlocksInY;
+
+            while (blocksY != 1 && rows <= (blocksY >> 1) * GobHeight)
+            {
+                blocksY >>= 1;
+            }
+
+            int blocksZ = gobBlocksInZ;
+
+            while (blocksZ != 1 && depth <= (blocksZ >> 1))
+            {
+                blocksZ >>= 1;
+            }
+
+            return (blocksY, blocksZ);
+        }
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/SizeInfo.cs b/src/Ryujinx.Graphics.Texture/SizeInfo.cs
new file mode 100644
index 00000000..eb573728
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/SizeInfo.cs
@@ -0,0 +1,119 @@
+using System;
+using System.Collections.Generic;
+
+namespace Ryujinx.Graphics.Texture
+{
+ /// <summary>
+ /// Offsets and sizes of the levels and slices that make up a texture in memory.
+ /// </summary>
+ public readonly struct SizeInfo
+ {
+     private readonly int[] _mipOffsets;
+
+     private readonly int _levels;
+     private readonly int _depth;
+     private readonly bool _is3D;
+
+     // Searched with a binary search by FindView, so the values are expected to be in ascending order.
+     public readonly int[] AllOffsets;
+     public readonly int[] SliceSizes;
+     public readonly int[] LevelSizes;
+     public int LayerSize { get; }
+     public int TotalSize { get; }
+
+     /// <summary>
+     /// Creates the size information of a texture made of a single region starting at offset 0.
+     /// </summary>
+     /// <param name="size">Size used for the only slice, the only level, the layer and the total</param>
+     public SizeInfo(int size)
+         : this(new[] { 0 }, new[] { 0 }, new[] { size }, new[] { size }, 1, 1, size, size, false)
+     {
+     }
+
+     /// <summary>
+     /// Creates the size information from already computed offsets and sizes.
+     /// </summary>
+     internal SizeInfo(
+         int[] mipOffsets,
+         int[] allOffsets,
+         int[] sliceSizes,
+         int[] levelSizes,
+         int depth,
+         int levels,
+         int layerSize,
+         int totalSize,
+         bool is3D)
+     {
+         _mipOffsets = mipOffsets;
+         AllOffsets = allOffsets;
+         SliceSizes = sliceSizes;
+         LevelSizes = levelSizes;
+         _depth = depth;
+         _levels = levels;
+         LayerSize = layerSize;
+         TotalSize = totalSize;
+         _is3D = is3D;
+     }
+
+     /// <summary>
+     /// Gets the offset of a mipmap level.
+     /// </summary>
+     /// <param name="level">Index into the mip offsets</param>
+     /// <returns>The offset stored for the level</returns>
+     /// <exception cref="ArgumentOutOfRangeException">Thrown when the level is negative or not below the number of mip offsets</exception>
+     public int GetMipOffset(int level)
+     {
+         int count = _mipOffsets.Length;
+
+         if (level < 0 || level >= count)
+         {
+             throw new ArgumentOutOfRangeException(nameof(level));
+         }
+
+         return _mipOffsets[level];
+     }
+
+     /// <summary>
+     /// Finds the layer and level whose offset is exactly <paramref name="offset"/>.
+     /// </summary>
+     /// <param name="offset">Offset to look for in <see cref="AllOffsets"/></param>
+     /// <param name="firstLayer">Layer (or slice within the level, for 3D) found, 0 when not found</param>
+     /// <param name="firstLevel">Level found, 0 when not found</param>
+     /// <returns>True if an entry with that exact offset exists, false otherwise</returns>
+     public bool FindView(int offset, out int firstLayer, out int firstLevel)
+     {
+         int index = Array.BinarySearch(AllOffsets, offset);
+
+         if (index < 0)
+         {
+             firstLayer = 0;
+             firstLevel = 0;
+
+             return false;
+         }
+
+         if (!_is3D)
+         {
+             // Entries are grouped by layer, each layer holding one entry per level.
+             firstLayer = Math.DivRem(index, _levels, out firstLevel);
+
+             return true;
+         }
+
+         // Entries are the slices of level 0, then the slices of level 1, and so on,
+         // with the slice count halving (down to a minimum of 1) on each level.
+         int slice = index;
+         int level = 0;
+
+         for (int levelDepth = _depth; slice >= levelDepth; levelDepth = Math.Max(levelDepth >> 1, 1))
+         {
+             slice -= levelDepth;
+             level++;
+         }
+
+         firstLayer = slice;
+         firstLevel = level;
+
+         return true;
+     }
+
+     /// <summary>
+     /// Enumerates the regions of the texture: one per level for 3D, one per entry of <see cref="AllOffsets"/> otherwise.
+     /// </summary>
+     /// <returns>The regions, in offset table order</returns>
+     public IEnumerable<Region> AllRegions()
+     {
+         if (_is3D)
+         {
+             for (int level = 0; level < _mipOffsets.Length; level++)
+             {
+                 int start = _mipOffsets[level];
+
+                 // Never let a level extend past the end of the texture.
+                 int size = Math.Min(TotalSize - start, LevelSizes[level]);
+
+                 yield return new Region(start, size);
+             }
+
+             yield break;
+         }
+
+         for (int entry = 0; entry < AllOffsets.Length; entry++)
+         {
+             yield return new Region(AllOffsets[entry], SliceSizes[entry % _levels]);
+         }
+     }
+ }
+} \ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs b/src/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs
new file mode 100644
index 00000000..d890652c
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs
@@ -0,0 +1,297 @@
+namespace Ryujinx.Graphics.Texture.Utils
+{
+ static class BC67Tables
+ {
+ // One entry per BC7 mode, indexed by mode number (0 to 7).
+ // NOTE(review): the arguments look like (subset count, partition bits, p-bits, rotation bits,
+ // index mode bits, color index bits, alpha index bits, color depth, alpha depth), which matches
+ // the BC7 mode table - confirm against the BC7ModeInfo constructor.
+ public static readonly BC7ModeInfo[] BC7ModeInfos = new BC7ModeInfo[]
+ {
+ new BC7ModeInfo(3, 4, 6, 0, 0, 3, 0, 4, 0),
+ new BC7ModeInfo(2, 6, 2, 0, 0, 3, 0, 6, 0),
+ new BC7ModeInfo(3, 6, 0, 0, 0, 2, 0, 5, 0),
+ new BC7ModeInfo(2, 6, 4, 0, 0, 2, 0, 7, 0),
+ new BC7ModeInfo(1, 0, 0, 2, 1, 2, 3, 5, 6),
+ new BC7ModeInfo(1, 0, 0, 2, 0, 2, 2, 7, 8),
+ new BC7ModeInfo(1, 0, 2, 0, 0, 4, 0, 7, 7),
+ new BC7ModeInfo(2, 6, 4, 0, 0, 2, 0, 5, 5)
+ };
+
+ // Interpolation weights out of 64. The three rows hold 4, 8 and 16 entries,
+ // one per possible index value for 2, 3 and 4 bit indices respectively.
+ public static readonly byte[][] Weights =
+ {
+ new byte[] { 0, 21, 43, 64 },
+ new byte[] { 0, 9, 18, 27, 37, 46, 55, 64 },
+ new byte[] { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }
+ };
+
+ // Complement of the interpolation weights: every entry is 64 minus the weight
+ // at the same position of the 4, 8 and 16 entry weight rows.
+ public static readonly byte[][] InverseWeights =
+ {
+ new byte[] { 64, 43, 21, 0 },
+ new byte[] { 64, 55, 46, 37, 27, 18, 9, 0 },
+ new byte[] { 64, 60, 55, 51, 47, 43, 38, 34, 30, 26, 21, 17, 13, 9, 4, 0 }
+ };
+
+ public static readonly byte[][][] FixUpIndices = new byte[3][][]
+ {
+ new byte[64][]
+ {
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 },
+ new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }
+ },
+ new byte[64][]
+ {
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 8, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 6, 0 }, new byte[] { 0, 8, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 2, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 6, 0 },
+ new byte[] { 0, 6, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 6, 0 }, new byte[] { 0, 8, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 },
+ new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 15, 0 }
+ },
+ new byte[64][]
+ {
+ new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 15, 8 }, new byte[] { 0, 15, 3 },
+ new byte[] { 0, 8, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 15, 8 },
+ new byte[] { 0, 8, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 6, 15 }, new byte[] { 0, 6, 15 },
+ new byte[] { 0, 6, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 },
+ new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 15, 3 },
+ new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 6, 15 }, new byte[] { 0, 10, 8 },
+ new byte[] { 0, 5, 3 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 8, 6 }, new byte[] { 0, 6, 10 },
+ new byte[] { 0, 8, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 15, 10 }, new byte[] { 0, 15, 8 },
+ new byte[] { 0, 8, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 5, 10 },
+ new byte[] { 0, 6, 10 }, new byte[] { 0, 10, 8 }, new byte[] { 0, 8, 9 }, new byte[] { 0, 15, 10 },
+ new byte[] { 0, 15, 6 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 8 }, new byte[] { 0, 5, 15 },
+ new byte[] { 0, 15, 3 }, new byte[] { 0, 15, 6 }, new byte[] { 0, 15, 6 }, new byte[] { 0, 15, 8 },
+ new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 5, 15 },
+ new byte[] { 0, 5, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 10, 15 },
+ new byte[] { 0, 5, 15 }, new byte[] { 0, 10, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 13, 15 },
+ new byte[] { 0, 15, 3 }, new byte[] { 0, 12, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }
+ }
+ };
+
+ public static readonly byte[][][] PartitionTable = new byte[3][][]
+ {
+ new byte[64][]
+ {
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 0
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 1
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 2
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 3
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 4
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 5
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 6
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 7
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 8
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 9
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 10
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 11
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 12
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 13
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 14
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 15
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 16
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 17
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 18
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 19
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 20
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 21
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 22
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 23
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 24
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 25
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 26
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 27
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 28
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 29
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 30
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 31
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 32
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 33
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 34
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 35
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 36
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 37
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 38
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 39
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 40
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 41
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 42
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 43
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 44
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 45
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 46
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 47
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 48
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 49
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 50
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 51
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 52
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 53
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 54
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 55
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 56
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 57
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 58
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 59
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 60
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 61
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 62
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } // 63
+ },
+ new byte[64][]
+ {
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // 0
+ new byte[16] { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, // 1
+ new byte[16] { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, // 2
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // 3
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, // 4
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 5
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 6
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // 7
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, // 8
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 9
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 10
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, // 11
+ new byte[16] { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 12
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, // 13
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 14
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, // 15
+ new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, // 16
+ new byte[16] { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // 17
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 }, // 18
+ new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // 19
+ new byte[16] { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // 20
+ new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, // 21
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // 22
+ new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, // 23
+ new byte[16] { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // 24
+ new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // 25
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, // 26
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, // 27
+ new byte[16] { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, // 28
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // 29
+ new byte[16] { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, // 30
+ new byte[16] { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // 31
+ new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, // 32
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, // 33
+ new byte[16] { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, // 34
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, // 35
+ new byte[16] { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, // 36
+ new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, // 37
+ new byte[16] { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, // 38
+ new byte[16] { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, // 39
+ new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, // 40
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, // 41
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, // 42
+ new byte[16] { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, // 43
+ new byte[16] { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, // 44
+ new byte[16] { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, // 45
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, // 46
+ new byte[16] { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, // 47
+ new byte[16] { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, // 48
+ new byte[16] { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, // 49
+ new byte[16] { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, // 50
+ new byte[16] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, // 51
+ new byte[16] { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, // 52
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // 53
+ new byte[16] { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // 54
+ new byte[16] { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, // 55
+ new byte[16] { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // 56
+ new byte[16] { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, // 57
+ new byte[16] { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, // 58
+ new byte[16] { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, // 59
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // 60
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // 61
+ new byte[16] { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, // 62
+ new byte[16] { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 } // 63
+ },
+ new byte[64][]
+ {
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2 }, // 0
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1 }, // 1
+ new byte[16] { 0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // 2
+ new byte[16] { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1 }, // 3
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2 }, // 4
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2 }, // 5
+ new byte[16] { 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1 }, // 6
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // 7
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 }, // 8
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2 }, // 9
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // 10
+ new byte[16] { 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2 }, // 11
+ new byte[16] { 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2 }, // 12
+ new byte[16] { 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2 }, // 13
+ new byte[16] { 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // 14
+ new byte[16] { 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0 }, // 15
+ new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2 }, // 16
+ new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0 }, // 17
+ new byte[16] { 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 }, // 18
+ new byte[16] { 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1 }, // 19
+ new byte[16] { 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2 }, // 20
+ new byte[16] { 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1 }, // 21
+ new byte[16] { 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2 }, // 22
+ new byte[16] { 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0 }, // 23
+ new byte[16] { 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0 }, // 24
+ new byte[16] { 0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2 }, // 25
+ new byte[16] { 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0 }, // 26
+ new byte[16] { 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1 }, // 27
+ new byte[16] { 0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2 }, // 28
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2 }, // 29
+ new byte[16] { 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1 }, // 30
+ new byte[16] { 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1 }, // 31
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // 32
+ new byte[16] { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1 }, // 33
+ new byte[16] { 0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2 }, // 34
+ new byte[16] { 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0 }, // 35
+ new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0 }, // 36
+ new byte[16] { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0 }, // 37
+ new byte[16] { 0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0 }, // 38
+ new byte[16] { 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1 }, // 39
+ new byte[16] { 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1 }, // 40
+ new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // 41
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1 }, // 42
+ new byte[16] { 0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2 }, // 43
+ new byte[16] { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1 }, // 44
+ new byte[16] { 0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1 }, // 45
+ new byte[16] { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1 }, // 46
+ new byte[16] { 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }, // 47
+ new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2 }, // 48
+ new byte[16] { 0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1 }, // 49
+ new byte[16] { 0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2 }, // 50
+ new byte[16] { 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2 }, // 51
+ new byte[16] { 0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2 }, // 52
+ new byte[16] { 0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2 }, // 53
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2 }, // 54
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2 }, // 55
+ new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2 }, // 56
+ new byte[16] { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2 }, // 57
+ new byte[16] { 0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2 }, // 58
+ new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2 }, // 59
+ new byte[16] { 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1 }, // 60
+ new byte[16] { 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2 }, // 61
+ new byte[16] { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, // 62
+ new byte[16] { 0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0 } // 63
+ }
+ };
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs b/src/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs
new file mode 100644
index 00000000..e6c3f6e7
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs
@@ -0,0 +1,1327 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Texture.Utils
+{
+ static class BC67Utils
+ {
+ private static byte[][] _quantizationLut;
+ private static byte[][] _quantizationLutNoPBit;
+
+ /// <summary>
+ /// Builds the quantization lookup tables for component depths 4 to 8.
+ /// </summary>
+ static BC67Utils()
+ {
+     const int MinDepth = 4;
+     const int MaxDepth = 8;
+     const int ValueCount = 256;
+
+     _quantizationLut = new byte[MaxDepth - MinDepth + 1][];
+     _quantizationLutNoPBit = new byte[MaxDepth - MinDepth + 1][];
+
+     // Kept as variables rather than literals so the helper is called with plain int arguments.
+     int pBitClear = 0;
+     int pBitSet = 1;
+
+     for (int depth = MinDepth; depth <= MaxDepth; depth++)
+     {
+         // The table with p-bit is indexed by (pBit << 8) | value, the other one by value only.
+         byte[] withPBit = new byte[ValueCount * 2];
+         byte[] withoutPBit = new byte[ValueCount];
+
+         for (int value = 0; value < ValueCount; value++)
+         {
+             withPBit[value] = QuantizeComponentForLut((byte)value, depth, pBitClear);
+             withoutPBit[value] = QuantizeComponentForLut((byte)value, depth);
+         }
+
+         for (int value = 0; value < ValueCount; value++)
+         {
+             withPBit[ValueCount + value] = QuantizeComponentForLut((byte)value, depth, pBitSet);
+         }
+
+         _quantizationLut[depth - MinDepth] = withPBit;
+         _quantizationLutNoPBit[depth - MinDepth] = withoutPBit;
+     }
+ }
+
+ /// <summary>
+ /// Computes the per-channel minimum and maximum colors of a tile.
+ /// </summary>
+ /// <param name="tile">Packed colors of the tile</param>
+ /// <param name="w">Tile width, the vector path is only taken when it is 4</param>
+ /// <param name="h">Tile height, the vector path is only taken when it is 4</param>
+ /// <returns>The (minimum, maximum) color pair</returns>
+ public static (RgbaColor8, RgbaColor8) GetMinMaxColors(ReadOnlySpan<uint> tile, int w, int h)
+ {
+     if (Sse41.IsSupported && w == 4 && h == 4)
+     {
+         GetMinMaxColorsOneSubset4x4Sse41(tile, out RgbaColor8 vectorMin, out RgbaColor8 vectorMax);
+
+         return (vectorMin, vectorMax);
+     }
+
+     RgbaColor8 lowest = new RgbaColor8(255, 255, 255, 255);
+     RgbaColor8 highest = default;
+
+     // The scalar path visits every element of the span, whatever w and h are.
+     foreach (uint packed in tile)
+     {
+         RgbaColor8 pixel = RgbaColor8.FromUInt32(packed);
+
+         lowest.R = Math.Min(lowest.R, pixel.R);
+         lowest.G = Math.Min(lowest.G, pixel.G);
+         lowest.B = Math.Min(lowest.B, pixel.B);
+         lowest.A = Math.Min(lowest.A, pixel.A);
+
+         highest.R = Math.Max(highest.R, pixel.R);
+         highest.G = Math.Max(highest.G, pixel.G);
+         highest.B = Math.Max(highest.B, pixel.B);
+         highest.A = Math.Max(highest.A, pixel.A);
+     }
+
+     return (lowest, highest);
+ }
+
+ /// <summary>
+ /// Computes the per-channel minimum and maximum colors of every subset of a tile.
+ /// </summary>
+ /// <param name="partitionTable">Subset index of each pixel, read at index ty * w + tx</param>
+ /// <param name="tile">Packed colors of the tile, stored row after row</param>
+ /// <param name="w">Tile width</param>
+ /// <param name="h">Tile height</param>
+ /// <param name="minColors">Receives the minimum color of each subset</param>
+ /// <param name="maxColors">Receives the maximum color of each subset</param>
+ /// <param name="subsetCount">Number of subsets, only used to pick a vector path</param>
+ public static void GetMinMaxColors(
+     ReadOnlySpan<byte> partitionTable,
+     ReadOnlySpan<uint> tile,
+     int w,
+     int h,
+     Span<RgbaColor8> minColors,
+     Span<RgbaColor8> maxColors,
+     int subsetCount)
+ {
+     if (Sse41.IsSupported && w == 4 && h == 4)
+     {
+         if (subsetCount == 1)
+         {
+             GetMinMaxColorsOneSubset4x4Sse41(tile, out minColors[0], out maxColors[0]);
+             return;
+         }
+         else if (subsetCount == 2)
+         {
+             GetMinMaxColorsTwoSubsets4x4Sse41(partitionTable, tile, minColors, maxColors);
+             return;
+         }
+     }
+
+     // Seed both accumulators. The vector paths overwrite both outputs, so the scalar path
+     // must not depend on whatever the caller left in maxColors either.
+     minColors.Fill(new RgbaColor8(255, 255, 255, 255));
+     maxColors.Clear();
+
+     int i = 0;
+     for (int ty = 0; ty < h; ty++)
+     {
+         for (int tx = 0; tx < w; tx++)
+         {
+             int subset = partitionTable[ty * w + tx];
+             RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++]);
+
+             ref RgbaColor8 lowest = ref minColors[subset];
+             ref RgbaColor8 highest = ref maxColors[subset];
+
+             lowest.R = Math.Min(lowest.R, color.R);
+             lowest.G = Math.Min(lowest.G, color.G);
+             lowest.B = Math.Min(lowest.B, color.B);
+             lowest.A = Math.Min(lowest.A, color.A);
+
+             highest.R = Math.Max(highest.R, color.R);
+             highest.G = Math.Max(highest.G, color.G);
+             highest.B = Math.Max(highest.B, color.B);
+             highest.A = Math.Max(highest.A, color.A);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Computes the per-channel minimum and maximum colors of a 4x4 tile using vector instructions.
+ /// </summary>
+ /// <param name="tile">Packed colors of the tile, 16 elements are read</param>
+ /// <param name="minColor">Receives the minimum color</param>
+ /// <param name="maxColor">Receives the maximum color</param>
+ private static unsafe void GetMinMaxColorsOneSubset4x4Sse41(ReadOnlySpan<uint> tile, out RgbaColor8 minColor, out RgbaColor8 maxColor)
+ {
+     Vector128<byte> first, second, third, fourth;
+
+     // Each vector holds one row of 4 packed colors.
+     fixed (uint* pixels = tile)
+     {
+         first = Sse2.LoadVector128(pixels).AsByte();
+         second = Sse2.LoadVector128(pixels + 4).AsByte();
+         third = Sse2.LoadVector128(pixels + 8).AsByte();
+         fourth = Sse2.LoadVector128(pixels + 12).AsByte();
+     }
+
+     // Reduce the 4 rows into one vector of column-wise minimums and one of maximums.
+     Vector128<byte> lowest = Sse2.Min(Sse2.Min(first, second), Sse2.Min(third, fourth));
+     Vector128<byte> highest = Sse2.Max(Sse2.Max(first, second), Sse2.Max(third, fourth));
+
+     // Then reduce the 4 colors left on each vector into a single color.
+     minColor = HorizontalMin(lowest);
+     maxColor = HorizontalMax(highest);
+ }
+
+ /// <summary>
+ /// Computes the per-channel minimum and maximum colors of the two subsets of a 4x4 tile using vector instructions.
+ /// </summary>
+ /// <param name="partitionTable">Subset index of each of the 16 pixels, any value other than 0 counts as the second subset</param>
+ /// <param name="tile">Packed colors of the tile, 16 elements are read</param>
+ /// <param name="minColors">Receives the minimum colors at indices 0 and 1</param>
+ /// <param name="maxColors">Receives the maximum colors at indices 0 and 1</param>
+ private static unsafe void GetMinMaxColorsTwoSubsets4x4Sse41(
+ ReadOnlySpan<byte> partitionTable,
+ ReadOnlySpan<uint> tile,
+ Span<RgbaColor8> minColors,
+ Span<RgbaColor8> maxColors)
+ {
+ Vector128<byte> partitionMask;
+
+ // Loads the 16 subset indices, one byte per pixel.
+ fixed (byte* pPartitionTable = partitionTable)
+ {
+ partitionMask = Sse2.LoadVector128(pPartitionTable);
+ }
+
+ // 0xFF on the bytes of pixels that belong to subset 0, 0 on all others.
+ Vector128<byte> subset0Mask = Sse2.CompareEqual(partitionMask, Vector128<byte>.Zero);
+
+ // Duplicating every byte twice widens each per-pixel mask byte to the 4 bytes of a packed color.
+ Vector128<byte> subset0MaskRep16Low = Sse2.UnpackLow(subset0Mask, subset0Mask);
+ Vector128<byte> subset0MaskRep16High = Sse2.UnpackHigh(subset0Mask, subset0Mask);
+
+ // Masks for pixels 0-3, 4-7, 8-11 and 12-15, matching row0 to row3 below.
+ Vector128<byte> subset0Mask0 = Sse2.UnpackLow(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte();
+ Vector128<byte> subset0Mask1 = Sse2.UnpackHigh(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte();
+ Vector128<byte> subset0Mask2 = Sse2.UnpackLow(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte();
+ Vector128<byte> subset0Mask3 = Sse2.UnpackHigh(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte();
+
+ Vector128<byte> min0 = Vector128<byte>.AllBitsSet;
+ Vector128<byte> min1 = Vector128<byte>.AllBitsSet;
+ Vector128<byte> max0 = Vector128<byte>.Zero;
+ Vector128<byte> max1 = Vector128<byte>.Zero;
+
+ Vector128<byte> row0, row1, row2, row3;
+
+ fixed (uint* pTile = tile)
+ {
+ row0 = Sse2.LoadVector128(pTile).AsByte();
+ row1 = Sse2.LoadVector128(pTile + 4).AsByte();
+ row2 = Sse2.LoadVector128(pTile + 8).AsByte();
+ row3 = Sse2.LoadVector128(pTile + 12).AsByte();
+ }
+
+ // Subset 0 minimum: the blend takes the row value only where the mask is set,
+ // other pixels keep the current minimum and so do not affect it.
+ min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row0, subset0Mask0));
+ min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row1, subset0Mask1));
+ min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row2, subset0Mask2));
+ min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row3, subset0Mask3));
+
+ // Subset 1 minimum: pixels of subset 0 are forced to 0xFF so they can never be the minimum.
+ min1 = Sse2.Min(min1, Sse2.Or(row0, subset0Mask0));
+ min1 = Sse2.Min(min1, Sse2.Or(row1, subset0Mask1));
+ min1 = Sse2.Min(min1, Sse2.Or(row2, subset0Mask2));
+ min1 = Sse2.Min(min1, Sse2.Or(row3, subset0Mask3));
+
+ // Subset 0 maximum: pixels outside of subset 0 are forced to 0 so they can never be the maximum.
+ max0 = Sse2.Max(max0, Sse2.And(row0, subset0Mask0));
+ max0 = Sse2.Max(max0, Sse2.And(row1, subset0Mask1));
+ max0 = Sse2.Max(max0, Sse2.And(row2, subset0Mask2));
+ max0 = Sse2.Max(max0, Sse2.And(row3, subset0Mask3));
+
+ // Subset 1 maximum: same as above with the inverted mask.
+ max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask0, row0));
+ max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask1, row1));
+ max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask2, row2));
+ max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask3, row3));
+
+ // Each accumulator still holds 4 colors (one per column), reduce them to a single color.
+ minColors[0] = HorizontalMin(min0);
+ minColors[1] = HorizontalMin(min1);
+ maxColors[0] = HorizontalMax(max0);
+ maxColors[1] = HorizontalMax(max1);
+ }
+
+ /// <summary>
+ /// Reduces the 4 packed colors of a vector into their per-channel minimum.
+ /// </summary>
+ /// <param name="x">Vector holding 4 packed colors</param>
+ /// <returns>The per-channel minimum of the 4 colors</returns>
+ private static RgbaColor8 HorizontalMin(Vector128<byte> x)
+ {
+     // 0x31 places color 1 on lane 0 and color 3 on lane 2, so lanes 0 and 2 become min(0, 1) and min(2, 3).
+     Vector128<byte> neighbours = Sse2.Shuffle(x.AsInt32(), 0x31).AsByte();
+     Vector128<byte> pairwise = Sse2.Min(x, neighbours);
+
+     // Brings lane 2 down to lane 0 to combine the two partial results.
+     Vector128<byte> upperPair = Sse2.Shuffle(pairwise.AsInt32(), 2).AsByte();
+     Vector128<byte> reduced = Sse2.Min(pairwise, upperPair);
+
+     return RgbaColor8.FromUInt32(reduced.AsUInt32().GetElement(0));
+ }
+
+ /// <summary>
+ /// Reduces the 4 packed colors of a vector into their per-channel maximum.
+ /// </summary>
+ /// <param name="x">Vector holding 4 packed colors</param>
+ /// <returns>The per-channel maximum of the 4 colors</returns>
+ private static RgbaColor8 HorizontalMax(Vector128<byte> x)
+ {
+     // 0x31 places color 1 on lane 0 and color 3 on lane 2, so lanes 0 and 2 become max(0, 1) and max(2, 3).
+     Vector128<byte> neighbours = Sse2.Shuffle(x.AsInt32(), 0x31).AsByte();
+     Vector128<byte> pairwise = Sse2.Max(x, neighbours);
+
+     // Brings lane 2 down to lane 0 to combine the two partial results.
+     Vector128<byte> upperPair = Sse2.Shuffle(pairwise.AsInt32(), 2).AsByte();
+     Vector128<byte> reduced = Sse2.Max(pairwise, upperPair);
+
+     return RgbaColor8.FromUInt32(reduced.AsUInt32().GetElement(0));
+ }
+
+ /// <summary>
+ /// Dispatches to the index selection routine that matches the index size and the CPU capabilities.
+ /// </summary>
+ /// <remarks>
+ /// The vector routines are used for 2, 3 and 4 bit indices when SSE4.1 is available;
+ /// everything else goes through the fallback. All arguments are forwarded unchanged.
+ /// </remarks>
+ /// <returns>The value returned by the selected routine</returns>
+ public static int SelectIndices(
+     ReadOnlySpan<uint> values,
+     uint endPoint0,
+     uint endPoint1,
+     int pBit0,
+     int pBit1,
+     int indexBitCount,
+     int indexCount,
+     int colorDepth,
+     int alphaDepth,
+     uint alphaMask)
+ {
+     if (Sse41.IsSupported)
+     {
+         switch (indexBitCount)
+         {
+             case 2:
+                 return Select2BitIndicesSse41(
+                     values,
+                     endPoint0,
+                     endPoint1,
+                     pBit0,
+                     pBit1,
+                     indexBitCount,
+                     indexCount,
+                     colorDepth,
+                     alphaDepth,
+                     alphaMask);
+             case 3:
+                 return Select3BitIndicesSse41(
+                     values,
+                     endPoint0,
+                     endPoint1,
+                     pBit0,
+                     pBit1,
+                     indexBitCount,
+                     indexCount,
+                     colorDepth,
+                     alphaDepth,
+                     alphaMask);
+             case 4:
+                 return Select4BitIndicesOneSubsetSse41(
+                     values,
+                     endPoint0,
+                     endPoint1,
+                     pBit0,
+                     pBit1,
+                     indexBitCount,
+                     indexCount,
+                     colorDepth,
+                     alphaDepth,
+                     alphaMask);
+         }
+     }
+
+     return SelectIndicesFallback(
+         values,
+         endPoint0,
+         endPoint1,
+         pBit0,
+         pBit1,
+         indexBitCount,
+         indexCount,
+         colorDepth,
+         alphaDepth,
+         alphaMask);
+ }
+
+ /// <summary>
+ /// Computes the total error of encoding the given colors with a 4 entry (2 bit index) palette
+ /// interpolated between two quantized end points, using vector instructions.
+ /// </summary>
+ /// <remarks>
+ /// For each color, the squared distance to the closest palette entry is added to the result.
+ /// The per-color error is saturated to 65535 before being accumulated.
+ /// The selected indices themselves are not stored anywhere, only the error is returned.
+ /// </remarks>
+ /// <param name="values">Packed colors to be matched against the palette</param>
+ /// <param name="endPoint0">First end point, packed, before quantization</param>
+ /// <param name="endPoint1">Second end point, packed, before quantization</param>
+ /// <param name="pBit0">P-bit forwarded to the quantization of the first end point</param>
+ /// <param name="pBit1">P-bit forwarded to the quantization of the second end point</param>
+ /// <param name="indexBitCount">Not used by this method</param>
+ /// <param name="indexCount">Not used by this method</param>
+ /// <param name="colorDepth">Color depth forwarded to the end point quantization</param>
+ /// <param name="alphaDepth">Alpha depth forwarded to the end point quantization</param>
+ /// <param name="alphaMask">Bits that are forced to 1 on every color and on both end points before comparing</param>
+ /// <returns>Sum of the per-color minimum squared errors</returns>
+ private static unsafe int Select2BitIndicesSse41(
+ ReadOnlySpan<uint> values,
+ uint endPoint0,
+ uint endPoint1,
+ int pBit0,
+ int pBit1,
+ int indexBitCount,
+ int indexCount,
+ int colorDepth,
+ int alphaDepth,
+ uint alphaMask)
+ {
+ uint alphaMaskForPalette = alphaMask;
+
+ // With no alpha bits, the last component (presumably alpha) of the palette is forced to 255.
+ // Note that this extra mask is only applied to the palette, not to the colors being matched.
+ if (alphaDepth == 0)
+ {
+ alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+ }
+
+ int errorSum = 0;
+
+ RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
+ RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
+
+ Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
+ Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
+
+ // Interleaves the end points byte by byte, giving (c0, c1) pairs for the 4 components, twice.
+ Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
+
+ Vector128<byte> rWeights;
+ Vector128<byte> lWeights;
+
+ // Loads the four 2 bit weights and their complements as a single 32-bit scalar each.
+ fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
+ {
+ rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
+ lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
+ }
+
+ // Builds (inverse weight, weight) byte pairs, then replicates each pair 4 times (once per component).
+ // iWeights0 has the pairs of palette entries 0 and 1, iWeights1 the pairs of entries 2 and 3.
+ Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
+ Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+ Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+ Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+
+ // Each 16-bit lane becomes c0 * inverseWeight + c1 * weight for one component of one palette entry.
+ // NOTE(review): ShiftRoundToNearest is defined elsewhere, presumably it divides by 64 with rounding - confirm.
+ Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
+ Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
+
+ for (int i = 0; i < values.Length; i++)
+ {
+ uint c = values[i] | alphaMask;
+
+ // Zero extends the 4 components to 16 bits, twice, to line up with the 2 entries on each palette vector.
+ Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
+
+ Vector128<short> delta0 = Sse2.Subtract(color, pal0);
+ Vector128<short> delta1 = Sse2.Subtract(color, pal1);
+
+ // Squares the deltas and adds them in pairs, leaving 2 partial sums per palette entry.
+ Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
+ Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
+
+ // Adds the partial sums, giving the squared error of palette entries 0, 1, 2 and 3.
+ Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
+
+ // Narrows the errors to 16 bits with saturation, as required by the horizontal minimum below.
+ Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01);
+
+ Vector128<ushort> min = Sse41.MinHorizontal(delta);
+
+ // Element 0 holds the minimum value.
+ ushort error = min.GetElement(0);
+
+ errorSum += error;
+ }
+
+ return errorSum;
+ }
+
+        /// <summary>
+        /// SSE4.1 error estimate for 3-bit indices: for every entry of <paramref name="values"/>, finds the
+        /// smallest squared distance to the 8 palette colors interpolated between the two quantized end points,
+        /// and returns the sum of those distances.
+        /// </summary>
+        /// <remarks>
+        /// Each distance is packed to 16 bits with unsigned saturation before the minimum is taken,
+        /// so a single color contributes at most 65535 to the result.
+        /// </remarks>
+        /// <param name="values">Packed colors to measure against the palette</param>
+        /// <param name="endPoint0">First packed end point; it is passed through <see cref="Quantize"/> before use</param>
+        /// <param name="endPoint1">Second packed end point; it is passed through <see cref="Quantize"/> before use</param>
+        /// <param name="pBit0">P-bit forwarded to <see cref="Quantize"/> for the first end point</param>
+        /// <param name="pBit1">P-bit forwarded to <see cref="Quantize"/> for the second end point</param>
+        /// <param name="indexBitCount">Not read by this path</param>
+        /// <param name="indexCount">Not read by this path</param>
+        /// <param name="colorDepth">Color bit depth forwarded to <see cref="Quantize"/></param>
+        /// <param name="alphaDepth">Alpha bit depth forwarded to <see cref="Quantize"/>; when 0, the palette alpha is forced to 255</param>
+        /// <param name="alphaMask">Bits ORed into every value and into both quantized end points</param>
+        /// <returns>Sum of the per-color minimum squared distances</returns>
+        private static unsafe int Select3BitIndicesSse41(
+            ReadOnlySpan<uint> values,
+            uint endPoint0,
+            uint endPoint1,
+            int pBit0,
+            int pBit1,
+            int indexBitCount,
+            int indexCount,
+            int colorDepth,
+            int alphaDepth,
+            uint alphaMask)
+        {
+            // Without alpha bits the palette alpha is forced to 255 on top of the caller supplied mask.
+            uint paletteMask = alphaDepth == 0
+                ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
+                : alphaMask;
+
+            uint quantized0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0).ToUInt32();
+            uint quantized1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1).ToUInt32();
+
+            // Byte-interleave both end points so every channel of end point 0 sits next to the same channel of end point 1.
+            Vector128<byte> endPointPairs = Sse2.UnpackLow(
+                Vector128.Create(quantized0 | paletteMask).AsByte(),
+                Vector128.Create(quantized1 | paletteMask).AsByte());
+
+            Vector128<byte> weights;
+            Vector128<byte> invWeights;
+
+            fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
+            {
+                // 8 bytes are read from each table, one weight per palette entry.
+                weights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
+                invWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
+            }
+
+            // Form (inverse weight, weight) byte pairs, then repeat each pair 4 times (once per channel).
+            Vector128<short> weightPairs = Sse2.UnpackLow(invWeights, weights).AsInt16();
+            Vector128<short> doubled0To3 = Sse2.UnpackLow(weightPairs, weightPairs);
+            Vector128<short> doubled4To7 = Sse2.UnpackHigh(weightPairs, weightPairs);
+
+            // Each palette vector holds two interpolated entries, 4 16-bit channels each.
+            Vector128<short> palette01 = Blend(endPointPairs, Sse2.UnpackLow(doubled0To3, doubled0To3));
+            Vector128<short> palette23 = Blend(endPointPairs, Sse2.UnpackHigh(doubled0To3, doubled0To3));
+            Vector128<short> palette45 = Blend(endPointPairs, Sse2.UnpackLow(doubled4To7, doubled4To7));
+            Vector128<short> palette67 = Blend(endPointPairs, Sse2.UnpackHigh(doubled4To7, doubled4To7));
+
+            int totalError = 0;
+
+            foreach (uint value in values)
+            {
+                // The masked color is widened to 16 bits per channel and appears twice in the vector.
+                Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(value | alphaMask).AsByte());
+
+                Vector128<int> distances0To3 = SquaredDistances(pixel, palette01, palette23);
+                Vector128<int> distances4To7 = SquaredDistances(pixel, palette45, palette67);
+
+                Vector128<ushort> packed = Sse41.PackUnsignedSaturate(distances0To3, distances4To7);
+
+                // Element 0 of the horizontal minimum is the smallest distance.
+                totalError += Sse41.MinHorizontal(packed).GetElement(0);
+            }
+
+            return totalError;
+
+            // Interpolates two palette entries from the interleaved end points and one vector of weight pairs.
+            Vector128<short> Blend(Vector128<byte> pairs, Vector128<short> mix)
+            {
+                return ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(pairs, mix.AsSByte()));
+            }
+
+            // Squared distances from the pixel to the 4 palette entries held by the two vectors.
+            Vector128<int> SquaredDistances(Vector128<short> pixel, Vector128<short> entriesLow, Vector128<short> entriesHigh)
+            {
+                Vector128<short> diffLow = Sse2.Subtract(pixel, entriesLow);
+                Vector128<short> diffHigh = Sse2.Subtract(pixel, entriesHigh);
+
+                return Ssse3.HorizontalAdd(
+                    Sse2.MultiplyAddAdjacent(diffLow, diffLow),
+                    Sse2.MultiplyAddAdjacent(diffHigh, diffHigh));
+            }
+        }
+
+        /// <summary>
+        /// SSE4.1 error estimate for 4-bit indices: for every entry of <paramref name="values"/>, finds the
+        /// smallest squared distance to the 16 palette colors interpolated between the two quantized end points,
+        /// and returns the sum of those distances.
+        /// </summary>
+        /// <remarks>
+        /// Each distance is packed to 16 bits with unsigned saturation before the minimum is taken,
+        /// so a single color contributes at most 65535 to the result.
+        /// </remarks>
+        /// <param name="values">Packed colors to measure against the palette</param>
+        /// <param name="endPoint0">First packed end point; it is passed through <see cref="Quantize"/> before use</param>
+        /// <param name="endPoint1">Second packed end point; it is passed through <see cref="Quantize"/> before use</param>
+        /// <param name="pBit0">P-bit forwarded to <see cref="Quantize"/> for the first end point</param>
+        /// <param name="pBit1">P-bit forwarded to <see cref="Quantize"/> for the second end point</param>
+        /// <param name="indexBitCount">Not read by this path</param>
+        /// <param name="indexCount">Not read by this path</param>
+        /// <param name="colorDepth">Color bit depth forwarded to <see cref="Quantize"/></param>
+        /// <param name="alphaDepth">Alpha bit depth forwarded to <see cref="Quantize"/>; when 0, the palette alpha is forced to 255</param>
+        /// <param name="alphaMask">Bits ORed into every value and into both quantized end points</param>
+        /// <returns>Sum of the per-color minimum squared distances</returns>
+        private static unsafe int Select4BitIndicesOneSubsetSse41(
+            ReadOnlySpan<uint> values,
+            uint endPoint0,
+            uint endPoint1,
+            int pBit0,
+            int pBit1,
+            int indexBitCount,
+            int indexCount,
+            int colorDepth,
+            int alphaDepth,
+            uint alphaMask)
+        {
+            // Without alpha bits the palette alpha is forced to 255 on top of the caller supplied mask.
+            uint paletteMask = alphaDepth == 0
+                ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
+                : alphaMask;
+
+            uint quantized0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0).ToUInt32();
+            uint quantized1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1).ToUInt32();
+
+            // Byte-interleave both end points so every channel of end point 0 sits next to the same channel of end point 1.
+            Vector128<byte> endPointPairs = Sse2.UnpackLow(
+                Vector128.Create(quantized0 | paletteMask).AsByte(),
+                Vector128.Create(quantized1 | paletteMask).AsByte());
+
+            Vector128<byte> weights;
+            Vector128<byte> invWeights;
+
+            fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
+            {
+                // A full 16 bytes are read from each table, one weight per palette entry.
+                weights = Sse2.LoadVector128(pWeights);
+                invWeights = Sse2.LoadVector128(pInvWeights);
+            }
+
+            // Form (inverse weight, weight) byte pairs, then repeat each pair 4 times (once per channel).
+            Vector128<short> pairs0To7 = Sse2.UnpackLow(invWeights, weights).AsInt16();
+            Vector128<short> pairs8To15 = Sse2.UnpackHigh(invWeights, weights).AsInt16();
+
+            Vector128<short> doubled0To3 = Sse2.UnpackLow(pairs0To7, pairs0To7);
+            Vector128<short> doubled4To7 = Sse2.UnpackHigh(pairs0To7, pairs0To7);
+            Vector128<short> doubled8To11 = Sse2.UnpackLow(pairs8To15, pairs8To15);
+            Vector128<short> doubled12To15 = Sse2.UnpackHigh(pairs8To15, pairs8To15);
+
+            // Each palette vector holds two interpolated entries, 4 16-bit channels each.
+            Vector128<short> palette0 = Blend(endPointPairs, Sse2.UnpackLow(doubled0To3, doubled0To3));
+            Vector128<short> palette1 = Blend(endPointPairs, Sse2.UnpackHigh(doubled0To3, doubled0To3));
+            Vector128<short> palette2 = Blend(endPointPairs, Sse2.UnpackLow(doubled4To7, doubled4To7));
+            Vector128<short> palette3 = Blend(endPointPairs, Sse2.UnpackHigh(doubled4To7, doubled4To7));
+            Vector128<short> palette4 = Blend(endPointPairs, Sse2.UnpackLow(doubled8To11, doubled8To11));
+            Vector128<short> palette5 = Blend(endPointPairs, Sse2.UnpackHigh(doubled8To11, doubled8To11));
+            Vector128<short> palette6 = Blend(endPointPairs, Sse2.UnpackLow(doubled12To15, doubled12To15));
+            Vector128<short> palette7 = Blend(endPointPairs, Sse2.UnpackHigh(doubled12To15, doubled12To15));
+
+            int totalError = 0;
+
+            foreach (uint value in values)
+            {
+                // The masked color is widened to 16 bits per channel and appears twice in the vector.
+                Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(value | alphaMask).AsByte());
+
+                Vector128<ushort> packedLow = Sse41.PackUnsignedSaturate(
+                    SquaredDistances(pixel, palette0, palette1),
+                    SquaredDistances(pixel, palette2, palette3));
+                Vector128<ushort> packedHigh = Sse41.PackUnsignedSaturate(
+                    SquaredDistances(pixel, palette4, palette5),
+                    SquaredDistances(pixel, palette6, palette7));
+
+                // Element 0 of each horizontal minimum is the smallest distance of that half of the palette.
+                ushort bestLow = Sse41.MinHorizontal(packedLow).GetElement(0);
+                ushort bestHigh = Sse41.MinHorizontal(packedHigh).GetElement(0);
+
+                totalError += Math.Min(bestLow, bestHigh);
+            }
+
+            return totalError;
+
+            // Interpolates two palette entries from the interleaved end points and one vector of weight pairs.
+            Vector128<short> Blend(Vector128<byte> pairs, Vector128<short> mix)
+            {
+                return ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(pairs, mix.AsSByte()));
+            }
+
+            // Squared distances from the pixel to the 4 palette entries held by the two vectors.
+            Vector128<int> SquaredDistances(Vector128<short> pixel, Vector128<short> entriesLow, Vector128<short> entriesHigh)
+            {
+                Vector128<short> diffLow = Sse2.Subtract(pixel, entriesLow);
+                Vector128<short> diffHigh = Sse2.Subtract(pixel, entriesHigh);
+
+                return Ssse3.HorizontalAdd(
+                    Sse2.MultiplyAddAdjacent(diffLow, diffLow),
+                    Sse2.MultiplyAddAdjacent(diffHigh, diffHigh));
+            }
+        }
+
+        /// <summary>
+        /// Scalar error estimate: builds the palette interpolated between the two quantized end points and
+        /// sums, over every entry of <paramref name="values"/>, the squared difference to the closest palette color.
+        /// </summary>
+        /// <param name="values">Packed colors to measure against the palette</param>
+        /// <param name="endPoint0">First packed end point; it is passed through <see cref="Quantize"/> before use</param>
+        /// <param name="endPoint1">Second packed end point; it is passed through <see cref="Quantize"/> before use</param>
+        /// <param name="pBit0">P-bit forwarded to <see cref="Quantize"/> for the first end point</param>
+        /// <param name="pBit1">P-bit forwarded to <see cref="Quantize"/> for the second end point</param>
+        /// <param name="indexBitCount">Index bit count used to pick the interpolation weights</param>
+        /// <param name="indexCount">Number of palette entries to build</param>
+        /// <param name="colorDepth">Color bit depth forwarded to <see cref="Quantize"/></param>
+        /// <param name="alphaDepth">Alpha bit depth forwarded to <see cref="Quantize"/>; when 0, the palette alpha is forced to 255</param>
+        /// <param name="alphaMask">Bits ORed into every value and into both quantized end points</param>
+        /// <returns>Sum of the per-color minimum squared differences</returns>
+        private static int SelectIndicesFallback(
+            ReadOnlySpan<uint> values,
+            uint endPoint0,
+            uint endPoint1,
+            int pBit0,
+            int pBit1,
+            int indexBitCount,
+            int indexCount,
+            int colorDepth,
+            int alphaDepth,
+            uint alphaMask)
+        {
+            uint alphaMaskForPalette = alphaMask;
+
+            if (alphaDepth == 0)
+            {
+                // No alpha bits are stored, so the palette alpha is forced to 255.
+                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+            }
+
+            Span<uint> palette = stackalloc uint[indexCount];
+
+            RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
+            RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
+
+            // Apply the mask directly on the packed representation of both end points.
+            Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
+            Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;
+
+            // First and last entries are the end points themselves, the rest is interpolated.
+            palette[0] = c0.ToUInt32();
+            palette[indexCount - 1] = c1.ToUInt32();
+
+            for (int j = 1; j < indexCount - 1; j++)
+            {
+                palette[j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
+            }
+
+            int errorSum = 0;
+
+            for (int i = 0; i < values.Length; i++)
+            {
+                // Unpack the pixel once instead of once per palette entry.
+                RgbaColor32 color = RgbaColor8.FromUInt32(values[i] | alphaMask).GetColor32();
+
+                // Only the error is needed here; the index of the best entry is not tracked.
+                int bestMatchScore = int.MaxValue;
+
+                for (int j = 0; j < indexCount; j++)
+                {
+                    int score = SquaredDifference(color, RgbaColor8.FromUInt32(palette[j]).GetColor32());
+
+                    if (score < bestMatchScore)
+                    {
+                        bestMatchScore = score;
+                    }
+                }
+
+                errorSum += bestMatchScore;
+            }
+
+            return errorSum;
+        }
+
+        /// <summary>
+        /// Selects the palette index of every pixel of a tile and returns the accumulated error.
+        /// Dispatches to an SSE4.1 implementation for 2, 3 and 4 bit indices when SSE4.1 is supported,
+        /// and to the scalar fallback otherwise.
+        /// </summary>
+        /// <param name="tile">Packed tile pixels, <paramref name="w"/> per row</param>
+        /// <param name="w">Tile width in pixels</param>
+        /// <param name="h">Tile height in pixels</param>
+        /// <param name="endPoints0">First end point of each subset</param>
+        /// <param name="endPoints1">Second end point of each subset</param>
+        /// <param name="pBitValues">P-bit values used when quantizing the end points</param>
+        /// <param name="indices">Receives the selected index of each pixel, addressed as y * 4 + x</param>
+        /// <param name="subsetCount">Number of subsets; must be 1 on the 4-bit SSE4.1 path</param>
+        /// <param name="partition">Partition number</param>
+        /// <param name="indexBitCount">Number of bits per index</param>
+        /// <param name="indexCount">Number of palette entries, only forwarded to the fallback</param>
+        /// <param name="colorDepth">Color bit depth of the end points</param>
+        /// <param name="alphaDepth">Alpha bit depth of the end points</param>
+        /// <param name="pBits">Number of p-bits</param>
+        /// <param name="alphaMask">Bits ORed into every pixel and palette color</param>
+        /// <returns>Accumulated error reported by the selected implementation</returns>
+        public static int SelectIndices(
+            ReadOnlySpan<uint> tile,
+            int w,
+            int h,
+            ReadOnlySpan<uint> endPoints0,
+            ReadOnlySpan<uint> endPoints1,
+            ReadOnlySpan<int> pBitValues,
+            Span<byte> indices,
+            int subsetCount,
+            int partition,
+            int indexBitCount,
+            int indexCount,
+            int colorDepth,
+            int alphaDepth,
+            int pBits,
+            uint alphaMask)
+        {
+            if (Sse41.IsSupported)
+            {
+                switch (indexBitCount)
+                {
+                    case 2:
+                        return Select2BitIndicesSse41(
+                            tile,
+                            w,
+                            h,
+                            endPoints0,
+                            endPoints1,
+                            pBitValues,
+                            indices,
+                            subsetCount,
+                            partition,
+                            colorDepth,
+                            alphaDepth,
+                            pBits,
+                            alphaMask);
+
+                    case 3:
+                        return Select3BitIndicesSse41(
+                            tile,
+                            w,
+                            h,
+                            endPoints0,
+                            endPoints1,
+                            pBitValues,
+                            indices,
+                            subsetCount,
+                            partition,
+                            colorDepth,
+                            alphaDepth,
+                            pBits,
+                            alphaMask);
+
+                    case 4:
+                        // The 4-bit implementation only takes the end points of the first subset.
+                        Debug.Assert(subsetCount == 1);
+
+                        return Select4BitIndicesOneSubsetSse41(
+                            tile,
+                            w,
+                            h,
+                            endPoints0[0],
+                            endPoints1[0],
+                            pBitValues,
+                            indices,
+                            partition,
+                            colorDepth,
+                            alphaDepth,
+                            pBits,
+                            alphaMask);
+                }
+            }
+
+            // No SSE4.1, or an index size without a vectorized implementation.
+            return SelectIndicesFallback(
+                tile,
+                w,
+                h,
+                endPoints0,
+                endPoints1,
+                pBitValues,
+                indices,
+                subsetCount,
+                partition,
+                indexBitCount,
+                indexCount,
+                colorDepth,
+                alphaDepth,
+                pBits,
+                alphaMask);
+        }
+
+        /// <summary>
+        /// SSE4.1 index selection for 2-bit indices: for every pixel of the tile, picks the closest of the
+        /// 4 palette colors of the subset the pixel belongs to, stores its index and accumulates the error.
+        /// </summary>
+        /// <remarks>
+        /// Each distance is packed to 16 bits with unsigned saturation before the minimum is taken,
+        /// so a single pixel contributes at most 65535 to the result.
+        /// </remarks>
+        /// <param name="tile">Packed tile pixels, <paramref name="w"/> per row</param>
+        /// <param name="w">Tile width in pixels</param>
+        /// <param name="h">Tile height in pixels</param>
+        /// <param name="endPoints0">First end point of each subset; quantized with <see cref="Quantize"/> before use</param>
+        /// <param name="endPoints1">Second end point of each subset; quantized with <see cref="Quantize"/> before use</param>
+        /// <param name="pBitValues">P-bit values: one per subset when <paramref name="pBits"/> equals <paramref name="subsetCount"/>, otherwise two per subset</param>
+        /// <param name="indices">Receives the selected index of each pixel, addressed as y * 4 + x</param>
+        /// <param name="subsetCount">Number of subsets</param>
+        /// <param name="partition">Partition number used to look up the subset of each pixel</param>
+        /// <param name="colorDepth">Color bit depth forwarded to <see cref="Quantize"/></param>
+        /// <param name="alphaDepth">Alpha bit depth forwarded to <see cref="Quantize"/>; when 0, the palette alpha is forced to 255</param>
+        /// <param name="pBits">Number of p-bits; 0 means the end points are quantized without p-bit</param>
+        /// <param name="alphaMask">Bits ORed into every pixel and into both quantized end points</param>
+        /// <returns>Sum of the per-pixel minimum squared distances</returns>
+        private static unsafe int Select2BitIndicesSse41(
+            ReadOnlySpan<uint> tile,
+            int w,
+            int h,
+            ReadOnlySpan<uint> endPoints0,
+            ReadOnlySpan<uint> endPoints1,
+            ReadOnlySpan<int> pBitValues,
+            Span<byte> indices,
+            int subsetCount,
+            int partition,
+            int colorDepth,
+            int alphaDepth,
+            int pBits,
+            uint alphaMask)
+        {
+            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+            uint alphaMaskForPalette = alphaMask;
+
+            if (alphaDepth == 0)
+            {
+                // No alpha bits are stored, so the palette alpha is forced to 255.
+                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+            }
+
+            // The interpolation weights are the same for every subset, so they are expanded once up front.
+            Vector128<byte> rWeights;
+            Vector128<byte> lWeights;
+
+            fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
+            {
+                // 4 bytes are read from each table, one weight per palette entry.
+                rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
+                lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
+            }
+
+            // Form (inverse weight, weight) byte pairs, then repeat each pair 4 times (once per channel).
+            Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
+            Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+            Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+            Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+
+            int errorSum = 0;
+
+            for (int subset = 0; subset < subsetCount; subset++)
+            {
+                int pBit0 = -1, pBit1 = -1;
+
+                if (pBits == subsetCount)
+                {
+                    // One p-bit shared by both end points of the subset.
+                    pBit0 = pBit1 = pBitValues[subset];
+                }
+                else if (pBits != 0)
+                {
+                    // One p-bit per end point.
+                    pBit0 = pBitValues[subset * 2];
+                    pBit1 = pBitValues[subset * 2 + 1];
+                }
+
+                RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
+                RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
+
+                Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
+                Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
+
+                // Byte-interleave both end points so matching channels sit next to each other.
+                Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
+
+                // Each vector holds two interpolated palette entries, 4 16-bit channels each.
+                Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
+                Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
+
+                int i = 0;
+                for (int ty = 0; ty < h; ty++)
+                {
+                    for (int tx = 0; tx < w; tx++, i++)
+                    {
+                        int tileOffset = ty * 4 + tx;
+                        if (partitionTable[tileOffset] != subset)
+                        {
+                            // The pixel belongs to another subset, it is handled on that subset's pass.
+                            continue;
+                        }
+
+                        uint c = tile[i] | alphaMask;
+
+                        Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
+
+                        Vector128<short> delta0 = Sse2.Subtract(color, pal0);
+                        Vector128<short> delta1 = Sse2.Subtract(color, pal1);
+
+                        Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
+                        Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
+
+                        // One 32-bit squared distance per palette entry.
+                        Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
+
+                        Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01);
+
+                        Vector128<ushort> min = Sse41.MinHorizontal(delta);
+
+                        // The low 16 bits hold the minimum distance, the next 16 bits its position.
+                        uint minPos = min.AsUInt32().GetElement(0);
+                        ushort error = (ushort)minPos;
+                        uint index = minPos >> 16;
+
+                        indices[tileOffset] = (byte)index;
+                        errorSum += error;
+                    }
+                }
+            }
+
+            return errorSum;
+        }
+
+        /// <summary>
+        /// SSE4.1 index selection for 3-bit indices: for every pixel of the tile, picks the closest of the
+        /// 8 palette colors of the subset the pixel belongs to, stores its index and accumulates the error.
+        /// </summary>
+        /// <remarks>
+        /// Each distance is packed to 16 bits with unsigned saturation before the minimum is taken,
+        /// so a single pixel contributes at most 65535 to the result.
+        /// </remarks>
+        /// <param name="tile">Packed tile pixels, <paramref name="w"/> per row</param>
+        /// <param name="w">Tile width in pixels</param>
+        /// <param name="h">Tile height in pixels</param>
+        /// <param name="endPoints0">First end point of each subset; quantized with <see cref="Quantize"/> before use</param>
+        /// <param name="endPoints1">Second end point of each subset; quantized with <see cref="Quantize"/> before use</param>
+        /// <param name="pBitValues">P-bit values: one per subset when <paramref name="pBits"/> equals <paramref name="subsetCount"/>, otherwise two per subset</param>
+        /// <param name="indices">Receives the selected index of each pixel, addressed as y * 4 + x</param>
+        /// <param name="subsetCount">Number of subsets</param>
+        /// <param name="partition">Partition number used to look up the subset of each pixel</param>
+        /// <param name="colorDepth">Color bit depth forwarded to <see cref="Quantize"/></param>
+        /// <param name="alphaDepth">Alpha bit depth forwarded to <see cref="Quantize"/>; when 0, the palette alpha is forced to 255</param>
+        /// <param name="pBits">Number of p-bits; 0 means the end points are quantized without p-bit</param>
+        /// <param name="alphaMask">Bits ORed into every pixel and into both quantized end points</param>
+        /// <returns>Sum of the per-pixel minimum squared distances</returns>
+        private static unsafe int Select3BitIndicesSse41(
+            ReadOnlySpan<uint> tile,
+            int w,
+            int h,
+            ReadOnlySpan<uint> endPoints0,
+            ReadOnlySpan<uint> endPoints1,
+            ReadOnlySpan<int> pBitValues,
+            Span<byte> indices,
+            int subsetCount,
+            int partition,
+            int colorDepth,
+            int alphaDepth,
+            int pBits,
+            uint alphaMask)
+        {
+            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+            uint alphaMaskForPalette = alphaMask;
+
+            if (alphaDepth == 0)
+            {
+                // No alpha bits are stored, so the palette alpha is forced to 255.
+                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+            }
+
+            // The interpolation weights are the same for every subset, so they are expanded once up front.
+            Vector128<byte> rWeights;
+            Vector128<byte> lWeights;
+
+            fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
+            {
+                // 8 bytes are read from each table, one weight per palette entry.
+                rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
+                lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
+            }
+
+            // Form (inverse weight, weight) byte pairs, then repeat each pair 4 times (once per channel).
+            Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
+            Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+            Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
+            Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+            Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
+            Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
+            Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
+
+            int errorSum = 0;
+
+            for (int subset = 0; subset < subsetCount; subset++)
+            {
+                int pBit0 = -1, pBit1 = -1;
+
+                if (pBits == subsetCount)
+                {
+                    // One p-bit shared by both end points of the subset.
+                    pBit0 = pBit1 = pBitValues[subset];
+                }
+                else if (pBits != 0)
+                {
+                    // One p-bit per end point.
+                    pBit0 = pBitValues[subset * 2];
+                    pBit1 = pBitValues[subset * 2 + 1];
+                }
+
+                RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
+                RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
+
+                Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
+                Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
+
+                // Byte-interleave both end points so matching channels sit next to each other.
+                Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
+
+                // Each vector holds two interpolated palette entries, 4 16-bit channels each.
+                Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
+                Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
+                Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
+                Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
+
+                int i = 0;
+                for (int ty = 0; ty < h; ty++)
+                {
+                    for (int tx = 0; tx < w; tx++, i++)
+                    {
+                        int tileOffset = ty * 4 + tx;
+                        if (partitionTable[tileOffset] != subset)
+                        {
+                            // The pixel belongs to another subset, it is handled on that subset's pass.
+                            continue;
+                        }
+
+                        uint c = tile[i] | alphaMask;
+
+                        Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
+
+                        Vector128<short> delta0 = Sse2.Subtract(color, pal0);
+                        Vector128<short> delta1 = Sse2.Subtract(color, pal1);
+                        Vector128<short> delta2 = Sse2.Subtract(color, pal2);
+                        Vector128<short> delta3 = Sse2.Subtract(color, pal3);
+
+                        Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
+                        Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
+                        Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
+                        Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
+
+                        // One 32-bit squared distance per palette entry.
+                        Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
+                        Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
+
+                        Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
+
+                        Vector128<ushort> min = Sse41.MinHorizontal(delta);
+
+                        // The low 16 bits hold the minimum distance, the next 16 bits its position.
+                        uint minPos = min.AsUInt32().GetElement(0);
+                        ushort error = (ushort)minPos;
+                        uint index = minPos >> 16;
+
+                        indices[tileOffset] = (byte)index;
+                        errorSum += error;
+                    }
+                }
+            }
+
+            return errorSum;
+        }
+
+        /// <summary>
+        /// SSE4.1 index selection for 4-bit indices with a single subset: for every pixel of the tile, picks
+        /// the closest of the 16 palette colors, stores its index and accumulates the error.
+        /// </summary>
+        /// <remarks>
+        /// Each distance is packed to 16 bits with unsigned saturation before the minimum is taken,
+        /// so a single pixel contributes at most 65535 to the result.
+        /// </remarks>
+        /// <param name="tile">Packed tile pixels, <paramref name="w"/> per row</param>
+        /// <param name="w">Tile width in pixels</param>
+        /// <param name="h">Tile height in pixels</param>
+        /// <param name="endPoint0">First packed end point; it is passed through <see cref="Quantize"/> before use</param>
+        /// <param name="endPoint1">Second packed end point; it is passed through <see cref="Quantize"/> before use</param>
+        /// <param name="pBitValues">P-bit values; elements 0 and 1 are read when <paramref name="pBits"/> is not 0</param>
+        /// <param name="indices">Receives the selected index of each pixel, addressed as y * 4 + x</param>
+        /// <param name="partition">Not read by this path</param>
+        /// <param name="colorDepth">Color bit depth forwarded to <see cref="Quantize"/></param>
+        /// <param name="alphaDepth">Alpha bit depth forwarded to <see cref="Quantize"/>; when 0, the palette alpha is forced to 255</param>
+        /// <param name="pBits">Number of p-bits; 0 means the end points are quantized without p-bit</param>
+        /// <param name="alphaMask">Bits ORed into every pixel and into both quantized end points</param>
+        /// <returns>Sum of the per-pixel minimum squared distances</returns>
+        private static unsafe int Select4BitIndicesOneSubsetSse41(
+            ReadOnlySpan<uint> tile,
+            int w,
+            int h,
+            uint endPoint0,
+            uint endPoint1,
+            ReadOnlySpan<int> pBitValues,
+            Span<byte> indices,
+            int partition,
+            int colorDepth,
+            int alphaDepth,
+            int pBits,
+            uint alphaMask)
+        {
+            // Without alpha bits the palette alpha is forced to 255 on top of the caller supplied mask.
+            uint paletteMask = alphaDepth == 0
+                ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
+                : alphaMask;
+
+            // -1 makes Quantize take its path without p-bit.
+            int pBit0 = pBits != 0 ? pBitValues[0] : -1;
+            int pBit1 = pBits != 0 ? pBitValues[1] : -1;
+
+            uint quantized0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0).ToUInt32();
+            uint quantized1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1).ToUInt32();
+
+            // Byte-interleave both end points so every channel of end point 0 sits next to the same channel of end point 1.
+            Vector128<byte> endPointPairs = Sse2.UnpackLow(
+                Vector128.Create(quantized0 | paletteMask).AsByte(),
+                Vector128.Create(quantized1 | paletteMask).AsByte());
+
+            Vector128<byte> weights;
+            Vector128<byte> invWeights;
+
+            fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
+            {
+                // A full 16 bytes are read from each table, one weight per palette entry.
+                weights = Sse2.LoadVector128(pWeights);
+                invWeights = Sse2.LoadVector128(pInvWeights);
+            }
+
+            // Form (inverse weight, weight) byte pairs, then repeat each pair 4 times (once per channel).
+            Vector128<short> pairs0To7 = Sse2.UnpackLow(invWeights, weights).AsInt16();
+            Vector128<short> pairs8To15 = Sse2.UnpackHigh(invWeights, weights).AsInt16();
+
+            Vector128<short> doubled0To3 = Sse2.UnpackLow(pairs0To7, pairs0To7);
+            Vector128<short> doubled4To7 = Sse2.UnpackHigh(pairs0To7, pairs0To7);
+            Vector128<short> doubled8To11 = Sse2.UnpackLow(pairs8To15, pairs8To15);
+            Vector128<short> doubled12To15 = Sse2.UnpackHigh(pairs8To15, pairs8To15);
+
+            // Each palette vector holds two interpolated entries, 4 16-bit channels each.
+            Vector128<short> palette0 = Blend(endPointPairs, Sse2.UnpackLow(doubled0To3, doubled0To3));
+            Vector128<short> palette1 = Blend(endPointPairs, Sse2.UnpackHigh(doubled0To3, doubled0To3));
+            Vector128<short> palette2 = Blend(endPointPairs, Sse2.UnpackLow(doubled4To7, doubled4To7));
+            Vector128<short> palette3 = Blend(endPointPairs, Sse2.UnpackHigh(doubled4To7, doubled4To7));
+            Vector128<short> palette4 = Blend(endPointPairs, Sse2.UnpackLow(doubled8To11, doubled8To11));
+            Vector128<short> palette5 = Blend(endPointPairs, Sse2.UnpackHigh(doubled8To11, doubled8To11));
+            Vector128<short> palette6 = Blend(endPointPairs, Sse2.UnpackLow(doubled12To15, doubled12To15));
+            Vector128<short> palette7 = Blend(endPointPairs, Sse2.UnpackHigh(doubled12To15, doubled12To15));
+
+            int totalError = 0;
+
+            for (int ty = 0; ty < h; ty++)
+            {
+                for (int tx = 0; tx < w; tx++)
+                {
+                    // Pixels are stored w per row, while indices use a fixed stride of 4.
+                    uint masked = tile[ty * w + tx] | alphaMask;
+
+                    Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(masked).AsByte());
+
+                    Vector128<ushort> packedLow = Sse41.PackUnsignedSaturate(
+                        SquaredDistances(pixel, palette0, palette1),
+                        SquaredDistances(pixel, palette2, palette3));
+                    Vector128<ushort> packedHigh = Sse41.PackUnsignedSaturate(
+                        SquaredDistances(pixel, palette4, palette5),
+                        SquaredDistances(pixel, palette6, palette7));
+
+                    // Element 0 is the minimum distance of each half, element 1 its position within the half.
+                    Vector128<ushort> bestLow = Sse41.MinHorizontal(packedLow);
+                    Vector128<ushort> bestHigh = Sse41.MinHorizontal(packedHigh);
+
+                    ushort errorLow = bestLow.GetElement(0);
+                    ushort errorHigh = bestHigh.GetElement(0);
+
+                    // The upper half only wins when strictly better, so ties keep the lower index.
+                    if (errorHigh < errorLow)
+                    {
+                        totalError += errorHigh;
+                        indices[ty * 4 + tx] = (byte)(8 + bestHigh.GetElement(1));
+                    }
+                    else
+                    {
+                        totalError += errorLow;
+                        indices[ty * 4 + tx] = (byte)bestLow.GetElement(1);
+                    }
+                }
+            }
+
+            return totalError;
+
+            // Interpolates two palette entries from the interleaved end points and one vector of weight pairs.
+            Vector128<short> Blend(Vector128<byte> pairs, Vector128<short> mix)
+            {
+                return ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(pairs, mix.AsSByte()));
+            }
+
+            // Squared distances from the pixel to the 4 palette entries held by the two vectors.
+            Vector128<int> SquaredDistances(Vector128<short> pixel, Vector128<short> entriesLow, Vector128<short> entriesHigh)
+            {
+                Vector128<short> diffLow = Sse2.Subtract(pixel, entriesLow);
+                Vector128<short> diffHigh = Sse2.Subtract(pixel, entriesHigh);
+
+                return Ssse3.HorizontalAdd(
+                    Sse2.MultiplyAddAdjacent(diffLow, diffLow),
+                    Sse2.MultiplyAddAdjacent(diffHigh, diffHigh));
+            }
+        }
+
+        /// <summary>
+        /// Adds 32 to every 16-bit lane and then shifts it right by 6 bits (logical shift),
+        /// which divides non-negative lanes by 64 rounding to the nearest integer.
+        /// </summary>
+        /// <param name="x">Vector of 16-bit lanes to scale down</param>
+        /// <returns>Vector with every lane equal to (lane + 32) >> 6</returns>
+        private static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
+        {
+            Vector128<short> half = Vector128.Create((short)32);
+            Vector128<short> biased = Sse2.Add(x, half);
+
+            return Sse2.ShiftRightLogical(biased, 6);
+        }
+
+        /// <summary>
+        /// Scalar index selection: builds one palette per subset, then for every pixel of the tile picks the
+        /// closest palette color of the subset the pixel belongs to, stores its index and accumulates the error.
+        /// </summary>
+        /// <param name="tile">Packed tile pixels, <paramref name="w"/> per row</param>
+        /// <param name="w">Tile width in pixels</param>
+        /// <param name="h">Tile height in pixels</param>
+        /// <param name="endPoints0">First end point of each subset; quantized with <see cref="Quantize"/> before use</param>
+        /// <param name="endPoints1">Second end point of each subset; quantized with <see cref="Quantize"/> before use</param>
+        /// <param name="pBitValues">P-bit values: one per subset when <paramref name="pBits"/> equals <paramref name="subsetCount"/>, otherwise two per subset</param>
+        /// <param name="indices">Receives the selected index of each pixel, addressed as y * 4 + x</param>
+        /// <param name="subsetCount">Number of subsets</param>
+        /// <param name="partition">Partition number used to look up the subset of each pixel</param>
+        /// <param name="indexBitCount">Index bit count used to pick the interpolation weights</param>
+        /// <param name="indexCount">Number of palette entries per subset</param>
+        /// <param name="colorDepth">Color bit depth forwarded to <see cref="Quantize"/></param>
+        /// <param name="alphaDepth">Alpha bit depth forwarded to <see cref="Quantize"/>; when 0, the palette alpha is forced to 255</param>
+        /// <param name="pBits">Number of p-bits; 0 means the end points are quantized without p-bit</param>
+        /// <param name="alphaMask">Bits ORed into every pixel and into both quantized end points</param>
+        /// <returns>Sum of the per-pixel minimum squared differences</returns>
+        private static int SelectIndicesFallback(
+            ReadOnlySpan<uint> tile,
+            int w,
+            int h,
+            ReadOnlySpan<uint> endPoints0,
+            ReadOnlySpan<uint> endPoints1,
+            ReadOnlySpan<int> pBitValues,
+            Span<byte> indices,
+            int subsetCount,
+            int partition,
+            int indexBitCount,
+            int indexCount,
+            int colorDepth,
+            int alphaDepth,
+            int pBits,
+            uint alphaMask)
+        {
+            // Looked up once, like the SSE4.1 implementations do, rather than once per pixel.
+            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
+
+            int errorSum = 0;
+
+            uint alphaMaskForPalette = alphaMask;
+
+            if (alphaDepth == 0)
+            {
+                // No alpha bits are stored, so the palette alpha is forced to 255.
+                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
+            }
+
+            // Palettes of all subsets are stored back to back, indexCount entries each.
+            Span<uint> palette = stackalloc uint[subsetCount * indexCount];
+
+            for (int subset = 0; subset < subsetCount; subset++)
+            {
+                int palBase = subset * indexCount;
+
+                int pBit0 = -1, pBit1 = -1;
+
+                if (pBits == subsetCount)
+                {
+                    // One p-bit shared by both end points of the subset.
+                    pBit0 = pBit1 = pBitValues[subset];
+                }
+                else if (pBits != 0)
+                {
+                    // One p-bit per end point.
+                    pBit0 = pBitValues[subset * 2];
+                    pBit1 = pBitValues[subset * 2 + 1];
+                }
+
+                RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
+                RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
+
+                // Apply the mask directly on the packed representation of both end points.
+                Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
+                Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;
+
+                // First and last entries are the end points themselves, the rest is interpolated.
+                palette[palBase + 0] = c0.ToUInt32();
+                palette[palBase + indexCount - 1] = c1.ToUInt32();
+
+                for (int j = 1; j < indexCount - 1; j++)
+                {
+                    palette[palBase + j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
+                }
+            }
+
+            int i = 0;
+            for (int ty = 0; ty < h; ty++)
+            {
+                for (int tx = 0; tx < w; tx++)
+                {
+                    int tileOffset = ty * 4 + tx;
+                    int palBase = partitionTable[tileOffset] * indexCount;
+
+                    // Unpack the pixel once instead of once per palette entry.
+                    RgbaColor32 color = RgbaColor8.FromUInt32(tile[i++] | alphaMask).GetColor32();
+
+                    int bestMatchScore = int.MaxValue;
+                    int bestMatchIndex = 0;
+
+                    for (int j = 0; j < indexCount; j++)
+                    {
+                        int score = SquaredDifference(color, RgbaColor8.FromUInt32(palette[palBase + j]).GetColor32());
+
+                        // Strict comparison: ties keep the lowest index.
+                        if (score < bestMatchScore)
+                        {
+                            bestMatchScore = score;
+                            bestMatchIndex = j;
+                        }
+                    }
+
+                    indices[tileOffset] = (byte)bestMatchIndex;
+                    errorSum += bestMatchScore;
+                }
+            }
+
+            return errorSum;
+        }
+
+        /// <summary>
+        /// Computes the dot product of the difference between two colors with itself.
+        /// </summary>
+        /// <param name="color1">First color</param>
+        /// <param name="color2">Second color, subtracted from the first</param>
+        /// <returns>Result of <c>RgbaColor32.Dot</c> applied to the difference and itself</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int SquaredDifference(RgbaColor32 color1, RgbaColor32 color2)
+        {
+            RgbaColor32 difference = color1 - color2;
+            int squaredLength = RgbaColor32.Dot(difference, difference);
+
+            return squaredLength;
+        }
+
+        /// <summary>
+        /// Interpolates between two 8-bit colors by widening them, calling the 32-bit overload,
+        /// and narrowing the result back.
+        /// </summary>
+        /// <param name="color1">Color used at weight index 0</param>
+        /// <param name="color2">Color used at the highest weight index</param>
+        /// <param name="weightIndex">Index of the interpolation weight</param>
+        /// <param name="indexBitCount">Number of bits per index, from 2 to 4</param>
+        /// <returns>Interpolated color</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor8 Interpolate(RgbaColor8 color1, RgbaColor8 color2, int weightIndex, int indexBitCount)
+        {
+            RgbaColor32 wide1 = color1.GetColor32();
+            RgbaColor32 wide2 = color2.GetColor32();
+            RgbaColor32 mixed = Interpolate(wide1, wide2, weightIndex, indexBitCount);
+
+            return mixed.GetColor8();
+        }
+
+        /// <summary>
+        /// Interpolates between two colors with a weight in the 0 to 64 range computed from the index:
+        /// the result is (color1 * (64 - weight) + color2 * weight + 32) >> 6.
+        /// </summary>
+        /// <param name="color1">Color used at weight 0</param>
+        /// <param name="color2">Color used at weight 64</param>
+        /// <param name="weightIndex">Index of the interpolation weight</param>
+        /// <param name="indexBitCount">Number of bits per index, from 2 to 4</param>
+        /// <returns>Interpolated color</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 Interpolate(RgbaColor32 color1, RgbaColor32 color2, int weightIndex, int indexBitCount)
+        {
+            Debug.Assert(indexBitCount >= 2 && indexBitCount <= 4);
+
+            // weightIndex * 64 / maxIndex, computed with one extra fractional bit and rounded to nearest.
+            int maxIndex = (1 << indexBitCount) - 1;
+            int doubledWeight = (weightIndex << 7) / maxIndex;
+            int weight = (doubledWeight + 1) >> 1;
+
+            RgbaColor32 weight2 = new RgbaColor32(weight);
+            RgbaColor32 weight1 = new RgbaColor32(64 - weight);
+            RgbaColor32 rounding = new RgbaColor32(32);
+
+            return (color1 * weight1 + color2 * weight2 + rounding) >> 6;
+        }
+
+        /// <summary>
+        /// Interpolates between two colors using one table weight for the color channels and another one
+        /// for the alpha channel: the result is (color1 * (64 - weight) + color2 * weight + 32) >> 6.
+        /// </summary>
+        /// <param name="color1">Color used at weight 0</param>
+        /// <param name="color2">Color used at weight 64</param>
+        /// <param name="colorWeightIndex">Index of the weight applied to the color channels</param>
+        /// <param name="alphaWeightIndex">Index of the weight applied to the alpha channel</param>
+        /// <param name="colorIndexBitCount">Number of bits per color index, from 2 to 4</param>
+        /// <param name="alphaIndexBitCount">Number of bits per alpha index, from 2 to 4</param>
+        /// <returns>Interpolated color</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 Interpolate(
+            RgbaColor32 color1,
+            RgbaColor32 color2,
+            int colorWeightIndex,
+            int alphaWeightIndex,
+            int colorIndexBitCount,
+            int alphaIndexBitCount)
+        {
+            Debug.Assert(colorIndexBitCount >= 2 && colorIndexBitCount <= 4);
+            Debug.Assert(alphaIndexBitCount >= 2 && alphaIndexBitCount <= 4);
+
+            // The weight tables are indexed by bit count minus 2.
+            byte[] colorWeights = BC67Tables.Weights[colorIndexBitCount - 2];
+            int colorWeight = colorWeights[colorWeightIndex];
+
+            byte[] alphaWeights = BC67Tables.Weights[alphaIndexBitCount - 2];
+            int alphaWeight = alphaWeights[alphaWeightIndex];
+
+            // Same weight on every channel, then alpha is overridden with its own weight.
+            RgbaColor32 weight2 = new RgbaColor32(colorWeight) { A = alphaWeight };
+            RgbaColor32 weight1 = new RgbaColor32(64) - weight2;
+            RgbaColor32 rounding = new RgbaColor32(32);
+
+            return (color1 * weight1 + color2 * weight2 + rounding) >> 6;
+        }
+
+        /// <summary>
+        /// Quantizes a color to the given bit depths and expands it back to 8 bits per channel.
+        /// When there are no alpha bits, the R, G and B channels are processed together on a packed value
+        /// and alpha is passed through unchanged; otherwise <see cref="QuantizeFallback"/> is used.
+        /// </summary>
+        /// <param name="color">Color to quantize</param>
+        /// <param name="colorBits">Number of bits kept for each color channel</param>
+        /// <param name="alphaBits">Number of bits kept for alpha, 0 to leave alpha untouched</param>
+        /// <param name="pBit">P-bit value (0 or 1), or a negative value when no p-bit is used</param>
+        /// <returns>Color after quantization and expansion back to 8 bits</returns>
+        public static RgbaColor8 Quantize(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1)
+        {
+            if (alphaBits != 0)
+            {
+                return QuantizeFallback(color, colorBits, alphaBits, pBit);
+            }
+
+            int shift = 8 - colorBits;
+
+            uint packed;
+
+            if (pBit < 0)
+            {
+                byte[] lut = _quantizationLutNoPBit[colorBits - 4];
+
+                // Per channel mask covering the bits left free below the quantized value.
+                uint fillMask = (0xffu >> colorBits) * 0x10101;
+
+                packed = lut[color.R];
+                packed |= (uint)lut[color.G] << 8;
+                packed |= (uint)lut[color.B] << 16;
+
+                // Move the quantized values to the top of each byte and replicate their high bits below.
+                packed <<= shift;
+                packed |= (packed >> colorBits) & fillMask;
+            }
+            else
+            {
+                byte[] lut = _quantizationLut[colorBits - 4];
+
+                Debug.Assert(pBit <= 1);
+
+                // The table has one 256 entry half for each p-bit value.
+                int lutHalf = pBit << 8;
+
+                // Per channel mask covering the bits left free below the p-bit.
+                uint fillMask = (0xffu >> (colorBits + 1)) * 0x10101;
+
+                packed = lut[color.R | lutHalf];
+                packed |= (uint)lut[color.G | lutHalf] << 8;
+                packed |= (uint)lut[color.B | lutHalf] << 16;
+
+                // Quantized value on top, then the p-bit, then replicated high bits.
+                packed <<= shift;
+                packed |= (packed >> (colorBits + 1)) & fillMask;
+                packed |= ((uint)pBit * 0x10101) << (shift - 1);
+            }
+
+            // Alpha is not quantized on this path.
+            packed |= (uint)color.A << 24;
+
+            return RgbaColor8.FromUInt32(packed);
+        }
+
+        /// <summary>
+        /// Quantizes and expands back every channel of a color individually.
+        /// Alpha is left unchanged when <paramref name="alphaBits"/> is 0.
+        /// </summary>
+        /// <param name="color">Color to quantize</param>
+        /// <param name="colorBits">Number of bits kept for each color channel</param>
+        /// <param name="alphaBits">Number of bits kept for alpha, 0 to leave alpha untouched</param>
+        /// <param name="pBit">P-bit value (0 or 1), or a negative value when no p-bit is used</param>
+        /// <returns>Color after quantization and expansion back to 8 bits</returns>
+        private static RgbaColor8 QuantizeFallback(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1)
+        {
+            byte red = RoundTrip(color.R, colorBits);
+            byte green = RoundTrip(color.G, colorBits);
+            byte blue = RoundTrip(color.B, colorBits);
+
+            byte alpha;
+
+            if (alphaBits == 0)
+            {
+                alpha = color.A;
+            }
+            else
+            {
+                alpha = RoundTrip(color.A, alphaBits);
+            }
+
+            return new RgbaColor8(red, green, blue, alpha);
+
+            // Quantizes one channel and immediately expands it back to 8 bits.
+            byte RoundTrip(byte channel, int bits)
+            {
+                return UnquantizeComponent(QuantizeComponent(channel, bits, pBit), bits, pBit);
+            }
+        }
+
+        /// <summary>
+        /// Quantizes a single 8-bit component using the precomputed lookup tables.
+        /// </summary>
+        /// <param name="component">8-bit component value</param>
+        /// <param name="bits">Number of bits of the quantized value; the tables are indexed by bits - 4</param>
+        /// <param name="pBit">P-bit value (0 or 1), or a negative value when no p-bit is used</param>
+        /// <returns>Quantized component</returns>
+        public static byte QuantizeComponent(byte component, int bits, int pBit = -1)
+        {
+            if (pBit >= 0)
+            {
+                // The p-bit selects one of the two 256 entry halves of the table.
+                return _quantizationLut[bits - 4][component | (pBit << 8)];
+            }
+
+            return _quantizationLutNoPBit[bits - 4][component];
+        }
+
+        /// <summary>
+        /// Picks the quantized value for a component by comparing three candidates: the truncated value
+        /// and its two neighbours, each clamped to the valid range.
+        /// </summary>
+        /// <remarks>
+        /// Only the error of the truncated candidate goes through <see cref="FastAbs"/>;
+        /// the errors of the two neighbours are plain signed differences.
+        /// </remarks>
+        /// <param name="component">8-bit component value</param>
+        /// <param name="bits">Number of bits of the quantized value</param>
+        /// <param name="pBit">P-bit value (0 or 1), or a negative value when no p-bit is used</param>
+        /// <returns>Selected quantized value</returns>
+        private static byte QuantizeComponentForLut(byte component, int bits, int pBit = -1)
+        {
+            int shift = 8 - bits;
+
+            // Low bits ORed below every candidate when it is expanded back to 8 bits.
+            int lowBits = component >> bits;
+
+            if (pBit >= 0)
+            {
+                Debug.Assert(pBit <= 1);
+
+                // The p-bit takes the bit right below the quantized value.
+                lowBits = (lowBits >> 1) | (pBit << (shift - 1));
+            }
+
+            int truncated = component >> shift;
+            int below = Math.Max(truncated - 1, 0);
+            int above = Math.Min(truncated + 1, (1 << bits) - 1);
+
+            int errorTruncated = FastAbs(((truncated << shift) | lowBits) - component);
+            int errorBelow = component - ((below << shift) | lowBits);
+            int errorAbove = ((above << shift) | lowBits) - component;
+
+            bool truncatedWins = errorTruncated < errorBelow && errorTruncated < errorAbove;
+
+            if (truncatedWins)
+            {
+                return (byte)truncated;
+            }
+
+            return (byte)(errorBelow < errorAbove ? below : above);
+        }
+
+        /// <summary>
+        /// Branchless absolute value. As with two's complement negation, <see cref="int.MinValue"/> maps to itself.
+        /// </summary>
+        /// <param name="x">Value to take the absolute value of</param>
+        /// <returns>Absolute value of <paramref name="x"/></returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int FastAbs(int x)
+        {
+            // All ones for negative values, zero otherwise.
+            int mask = x >> 31;
+
+            // Negative: ~x + 1 (negation). Non-negative: x unchanged.
+            return (x ^ mask) - mask;
+        }
+
+        /// <summary>
+        /// Expands a quantized component back to 8 bits: the value is moved to the top of the byte and the
+        /// free low bits are filled with the p-bit (when present) followed by a copy of the high bits.
+        /// </summary>
+        /// <param name="component">Quantized component</param>
+        /// <param name="bits">Number of bits of the quantized value</param>
+        /// <param name="pBit">P-bit value (0 or 1), or a negative value when no p-bit is used</param>
+        /// <returns>Component expanded to 8 bits</returns>
+        private static byte UnquantizeComponent(byte component, int bits, int pBit)
+        {
+            int shift = 8 - bits;
+            int expanded = component << shift;
+
+            if (pBit < 0)
+            {
+                expanded |= expanded >> bits;
+            }
+            else
+            {
+                Debug.Assert(pBit <= 1);
+
+                // One extra shift leaves room for the p-bit right below the quantized value.
+                expanded |= expanded >> (bits + 1);
+                expanded |= pBit << (shift - 1);
+            }
+
+            return (byte)expanded;
+        }
+ }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs b/src/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs
new file mode 100644
index 00000000..687df22c
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs
@@ -0,0 +1,37 @@
+namespace Ryujinx.Graphics.Texture.Utils
+{
+    /// <summary>
+    /// Immutable set of integer parameters describing one BC7 mode.
+    /// Every field simply holds the constructor argument of the matching name.
+    /// </summary>
+    readonly struct BC7ModeInfo
+    {
+        /// <summary>Value passed as <c>subsetCount</c>.</summary>
+        public readonly int SubsetCount;
+
+        /// <summary>Value passed as <c>partitionBitsCount</c>.</summary>
+        public readonly int PartitionBitCount;
+
+        /// <summary>Value passed as <c>pBits</c>.</summary>
+        public readonly int PBits;
+
+        /// <summary>Value passed as <c>rotationBitCount</c>.</summary>
+        public readonly int RotationBitCount;
+
+        /// <summary>Value passed as <c>indexModeBitCount</c>.</summary>
+        public readonly int IndexModeBitCount;
+
+        /// <summary>Value passed as <c>colorIndexBitCount</c>.</summary>
+        public readonly int ColorIndexBitCount;
+
+        /// <summary>Value passed as <c>alphaIndexBitCount</c>.</summary>
+        public readonly int AlphaIndexBitCount;
+
+        /// <summary>Value passed as <c>colorDepth</c>.</summary>
+        public readonly int ColorDepth;
+
+        /// <summary>Value passed as <c>alphaDepth</c>.</summary>
+        public readonly int AlphaDepth;
+
+        /// <summary>
+        /// Creates a mode description, storing each argument in the field of the matching name.
+        /// </summary>
+        public BC7ModeInfo(
+            int subsetCount,
+            int partitionBitsCount,
+            int pBits,
+            int rotationBitCount,
+            int indexModeBitCount,
+            int colorIndexBitCount,
+            int alphaIndexBitCount,
+            int colorDepth,
+            int alphaDepth)
+        {
+            (SubsetCount, PartitionBitCount, PBits) = (subsetCount, partitionBitsCount, pBits);
+            (RotationBitCount, IndexModeBitCount) = (rotationBitCount, indexModeBitCount);
+            (ColorIndexBitCount, AlphaIndexBitCount) = (colorIndexBitCount, alphaIndexBitCount);
+            (ColorDepth, AlphaDepth) = (colorDepth, alphaDepth);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/Utils/Block.cs b/src/Ryujinx.Graphics.Texture/Utils/Block.cs
new file mode 100644
index 00000000..a8bae077
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Utils/Block.cs
@@ -0,0 +1,55 @@
+namespace Ryujinx.Graphics.Texture.Utils
+{
+    /// <summary>
+    /// 128-bit buffer, split into two 64-bit halves, that bit fields are written to and read
+    /// from at an arbitrary bit offset. Offsets 0 to 63 address <see cref="Low"/>, offsets
+    /// 64 and above address <see cref="High"/>.
+    /// </summary>
+    struct Block
+    {
+        public ulong Low;
+        public ulong High;
+
+        /// <summary>
+        /// ORs <paramref name="value"/> into the block at the given bit offset and advances the offset.
+        /// </summary>
+        /// <param name="value">Bits to write; the value is not masked to <paramref name="bits"/> bits here</param>
+        /// <param name="offset">Bit position to write at; incremented by <paramref name="bits"/> on return</param>
+        /// <param name="bits">Width of the field in bits</param>
+        public void Encode(ulong value, ref int offset, int bits)
+        {
+            int start = offset;
+            offset = start + bits;
+
+            if (start >= 64)
+            {
+                High |= value << (start - 64);
+                return;
+            }
+
+            Low |= value << start;
+
+            // The field straddles both halves: the bits that did not fit in Low go to the bottom of High.
+            if (start + bits > 64)
+            {
+                High |= value >> (64 - start);
+            }
+        }
+
+        /// <summary>
+        /// Reads a bit field from the block at the given bit offset and advances the offset.
+        /// </summary>
+        /// <param name="offset">Bit position to read from; incremented by <paramref name="bits"/> on return</param>
+        /// <param name="bits">Width of the field in bits</param>
+        /// <returns>The field, masked to its low <paramref name="bits"/> bits</returns>
+        public ulong Decode(ref int offset, int bits)
+        {
+            int start = offset;
+            offset = start + bits;
+
+            // A plain 1UL << 64 would wrap to 1, so the full-width mask is special-cased.
+            ulong mask = bits == 64 ? ulong.MaxValue : (1UL << bits) - 1;
+            ulong raw;
+
+            if (start >= 64)
+            {
+                raw = High >> (start - 64);
+            }
+            else
+            {
+                raw = Low >> start;
+
+                // The field straddles both halves: bring in the missing upper bits from High.
+                if (start + bits > 64)
+                {
+                    raw |= High << (64 - start);
+                }
+            }
+
+            return raw & mask;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs b/src/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs
new file mode 100644
index 00000000..582044d9
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs
@@ -0,0 +1,229 @@
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Texture.Utils
+{
+    /// <summary>
+    /// RGBA color with one 32-bit signed integer per component, stored in a single
+    /// <see cref="Vector128{T}"/> (element 0 = R, 1 = G, 2 = B, 3 = A).
+    /// Operators work component-wise and use SSE instructions when available,
+    /// with a scalar fallback that produces the same results.
+    /// </summary>
+    struct RgbaColor32 : IEquatable<RgbaColor32>
+    {
+        private Vector128<int> _color;
+
+        /// <summary>Red component (vector element 0).</summary>
+        public int R
+        {
+            get => _color.GetElement(0);
+            set => _color = _color.WithElement(0, value);
+        }
+
+        /// <summary>Green component (vector element 1).</summary>
+        public int G
+        {
+            get => _color.GetElement(1);
+            set => _color = _color.WithElement(1, value);
+        }
+
+        /// <summary>Blue component (vector element 2).</summary>
+        public int B
+        {
+            get => _color.GetElement(2);
+            set => _color = _color.WithElement(2, value);
+        }
+
+        /// <summary>Alpha component (vector element 3).</summary>
+        public int A
+        {
+            get => _color.GetElement(3);
+            set => _color = _color.WithElement(3, value);
+        }
+
+        /// <summary>
+        /// Wraps an existing vector whose elements are R, G, B and A, in that order.
+        /// </summary>
+        public RgbaColor32(Vector128<int> color)
+        {
+            _color = color;
+        }
+
+        /// <summary>
+        /// Creates a color from its four components.
+        /// </summary>
+        public RgbaColor32(int r, int g, int b, int a)
+        {
+            _color = Vector128.Create(r, g, b, a);
+        }
+
+        /// <summary>
+        /// Creates a color with all four components set to <paramref name="scalar"/>.
+        /// </summary>
+        public RgbaColor32(int scalar)
+        {
+            _color = Vector128.Create(scalar);
+        }
+
+        /// <summary>Component-wise addition.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator +(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse2.IsSupported)
+            {
+                return new RgbaColor32(Sse2.Add(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(x.R + y.R, x.G + y.G, x.B + y.B, x.A + y.A);
+            }
+        }
+
+        /// <summary>Component-wise subtraction.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator -(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse2.IsSupported)
+            {
+                return new RgbaColor32(Sse2.Subtract(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(x.R - y.R, x.G - y.G, x.B - y.B, x.A - y.A);
+            }
+        }
+
+        /// <summary>Component-wise multiplication, keeping the low 32 bits of each product.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator *(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse41.IsSupported)
+            {
+                return new RgbaColor32(Sse41.MultiplyLow(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(x.R * y.R, x.G * y.G, x.B * y.B, x.A * y.A);
+            }
+        }
+
+        /// <summary>
+        /// Component-wise integer division. Throws <see cref="DivideByZeroException"/> if any
+        /// component of <paramref name="y"/> is zero; see <see cref="DivideGuarded(RgbaColor32, RgbaColor32, int)"/>.
+        /// </summary>
+        public static RgbaColor32 operator /(RgbaColor32 x, RgbaColor32 y)
+        {
+            return new RgbaColor32(x.R / y.R, x.G / y.G, x.B / y.B, x.A / y.A);
+        }
+
+        /// <summary>
+        /// Component-wise integer division where a zero divisor yields <paramref name="resultIfZero"/>
+        /// for that component instead of throwing.
+        /// </summary>
+        public static RgbaColor32 DivideGuarded(RgbaColor32 x, RgbaColor32 y, int resultIfZero)
+        {
+            return new RgbaColor32(
+                DivideGuarded(x.R, y.R, resultIfZero),
+                DivideGuarded(x.G, y.G, resultIfZero),
+                DivideGuarded(x.B, y.B, resultIfZero),
+                DivideGuarded(x.A, y.A, resultIfZero));
+        }
+
+        /// <summary>
+        /// Shifts every component left by <paramref name="shift"/> bits.
+        /// The shift count is expected to be in the range [0, 31].
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator <<(RgbaColor32 x, int shift)
+        {
+            if (Sse2.IsSupported)
+            {
+                return new RgbaColor32(Sse2.ShiftLeftLogical(x._color, (byte)shift));
+            }
+            else
+            {
+                return new RgbaColor32(x.R << shift, x.G << shift, x.B << shift, x.A << shift);
+            }
+        }
+
+        /// <summary>
+        /// Shifts every component right by <paramref name="shift"/> bits, filling with zeros
+        /// (logical shift). The shift count is expected to be in the range [0, 31].
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 operator >>(RgbaColor32 x, int shift)
+        {
+            if (Sse2.IsSupported)
+            {
+                return new RgbaColor32(Sse2.ShiftRightLogical(x._color, (byte)shift));
+            }
+            else
+            {
+                // The vector path is a logical shift; shift as unsigned here too so that
+                // negative components give the same result on every platform.
+                return new RgbaColor32(
+                    (int)((uint)x.R >> shift),
+                    (int)((uint)x.G >> shift),
+                    (int)((uint)x.B >> shift),
+                    (int)((uint)x.A >> shift));
+            }
+        }
+
+        public static bool operator ==(RgbaColor32 x, RgbaColor32 y)
+        {
+            return x.Equals(y);
+        }
+
+        public static bool operator !=(RgbaColor32 x, RgbaColor32 y)
+        {
+            return !x.Equals(y);
+        }
+
+        /// <summary>
+        /// Four-component dot product: the sum of the component-wise products.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int Dot(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse41.IsSupported)
+            {
+                Vector128<int> product = Sse41.MultiplyLow(x._color, y._color);
+
+                // Two horizontal adds fold the four products into element 0.
+                Vector128<int> sum = Ssse3.HorizontalAdd(product, product);
+                sum = Ssse3.HorizontalAdd(sum, sum);
+                return sum.GetElement(0);
+            }
+            else
+            {
+                return x.R * y.R + x.G * y.G + x.B * y.B + x.A * y.A;
+            }
+        }
+
+        /// <summary>Component-wise signed maximum.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 Max(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse41.IsSupported)
+            {
+                return new RgbaColor32(Sse41.Max(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(Math.Max(x.R, y.R), Math.Max(x.G, y.G), Math.Max(x.B, y.B), Math.Max(x.A, y.A));
+            }
+        }
+
+        /// <summary>Component-wise signed minimum.</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static RgbaColor32 Min(RgbaColor32 x, RgbaColor32 y)
+        {
+            if (Sse41.IsSupported)
+            {
+                return new RgbaColor32(Sse41.Min(x._color, y._color));
+            }
+            else
+            {
+                return new RgbaColor32(Math.Min(x.R, y.R), Math.Min(x.G, y.G), Math.Min(x.B, y.B), Math.Min(x.A, y.A));
+            }
+        }
+
+        /// <summary>
+        /// Converts to an 8-bit-per-component color, clamping every component to [0, 255].
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public RgbaColor8 GetColor8()
+        {
+            if (Sse2.IsSupported)
+            {
+                // The first pack must saturate as *signed* 16-bit: the second pack reads its
+                // input as signed, so an unsigned 16-bit intermediate of 32768 or more would
+                // be seen as negative and collapse to 0 instead of 255.
+                Vector128<short> color16 = Sse2.PackSignedSaturate(_color, _color);
+                Vector128<byte> color8 = Sse2.PackUnsignedSaturate(color16, color16);
+                uint color = color8.AsUInt32().GetElement(0);
+                return Unsafe.As<uint, RgbaColor8>(ref color);
+            }
+            else
+            {
+                return new RgbaColor8(ClampByte(R), ClampByte(G), ClampByte(B), ClampByte(A));
+            }
+        }
+
+        private static int DivideGuarded(int dividend, int divisor, int resultIfZero)
+        {
+            if (divisor == 0)
+            {
+                return resultIfZero;
+            }
+
+            return dividend / divisor;
+        }
+
+        private static byte ClampByte(int value)
+        {
+            return (byte)Math.Clamp(value, 0, 255);
+        }
+
+        public override int GetHashCode()
+        {
+            return HashCode.Combine(R, G, B, A);
+        }
+
+        public override bool Equals(object obj)
+        {
+            return obj is RgbaColor32 other && Equals(other);
+        }
+
+        public bool Equals(RgbaColor32 other)
+        {
+            return _color.Equals(other._color);
+        }
+    }
+}
diff --git a/src/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs b/src/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs
new file mode 100644
index 00000000..0edf1cce
--- /dev/null
+++ b/src/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs
@@ -0,0 +1,84 @@
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Texture.Utils
+{
+    /// <summary>
+    /// RGBA color with one byte per component, declared in R, G, B, A order so the
+    /// whole struct can be reinterpreted as a single 32-bit value.
+    /// </summary>
+    struct RgbaColor8 : IEquatable<RgbaColor8>
+    {
+        public byte R;
+        public byte G;
+        public byte B;
+        public byte A;
+
+        /// <summary>
+        /// Creates a color from its four components.
+        /// </summary>
+        public RgbaColor8(byte r, byte g, byte b, byte a)
+        {
+            (R, G, B, A) = (r, g, b, a);
+        }
+
+        /// <summary>
+        /// Reinterprets the four bytes of <paramref name="color"/> as a color.
+        /// </summary>
+        public static RgbaColor8 FromUInt32(uint color) => Unsafe.As<uint, RgbaColor8>(ref color);
+
+        public static bool operator ==(RgbaColor8 x, RgbaColor8 y) => x.Equals(y);
+
+        public static bool operator !=(RgbaColor8 x, RgbaColor8 y) => !x.Equals(y);
+
+        /// <summary>
+        /// Widens every component to a 32-bit integer.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public RgbaColor32 GetColor32()
+        {
+            if (!Sse41.IsSupported)
+            {
+                return new RgbaColor32(R, G, B, A);
+            }
+
+            // Load the four bytes into the bottom of a vector, then zero-extend each one to 32 bits.
+            uint packed = Unsafe.As<RgbaColor8, uint>(ref this);
+            Vector128<byte> bytes = Vector128.CreateScalarUnsafe(packed).AsByte();
+
+            return new RgbaColor32(Sse41.ConvertToVector128Int32(bytes));
+        }
+
+        /// <summary>
+        /// Reinterprets the four component bytes as a single 32-bit value.
+        /// </summary>
+        public uint ToUInt32() => Unsafe.As<RgbaColor8, uint>(ref this);
+
+        public override int GetHashCode() => HashCode.Combine(R, G, B, A);
+
+        public override bool Equals(object obj) => obj is RgbaColor8 other && Equals(other);
+
+        public bool Equals(RgbaColor8 other)
+        {
+            if (R != other.R || G != other.G || B != other.B)
+            {
+                return false;
+            }
+
+            return A == other.A;
+        }
+
+        /// <summary>
+        /// Gets a component by position: 0 = R, 1 = G, 2 = B, 3 = A.
+        /// </summary>
+        /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is not in the range [0, 3]</exception>
+        public byte GetComponent(int index)
+        {
+            switch (index)
+            {
+                case 0:
+                    return R;
+                case 1:
+                    return G;
+                case 2:
+                    return B;
+                case 3:
+                    return A;
+                default:
+                    throw new ArgumentOutOfRangeException(nameof(index));
+            }
+        }
+    }
+}