diff options
Diffstat (limited to 'src/Ryujinx.Graphics.Texture')
32 files changed, 9565 insertions, 0 deletions
diff --git a/src/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs b/src/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs new file mode 100644 index 00000000..08738583 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs @@ -0,0 +1,1621 @@ +using Ryujinx.Common.Utilities; +using System; +using System.Diagnostics; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture.Astc +{ + // https://github.com/GammaUNC/FasTC/blob/master/ASTCEncoder/src/Decompressor.cpp + public class AstcDecoder + { + private ReadOnlyMemory<byte> InputBuffer { get; } + private Memory<byte> OutputBuffer { get; } + + private int BlockSizeX { get; } + private int BlockSizeY { get; } + + private AstcLevel[] Levels { get; } + + private bool Success { get; set; } + + public int TotalBlockCount { get; } + + public AstcDecoder( + ReadOnlyMemory<byte> inputBuffer, + Memory<byte> outputBuffer, + int blockWidth, + int blockHeight, + int width, + int height, + int depth, + int levels, + int layers) + { + if ((uint)blockWidth > 12) + { + throw new ArgumentOutOfRangeException(nameof(blockWidth)); + } + + if ((uint)blockHeight > 12) + { + throw new ArgumentOutOfRangeException(nameof(blockHeight)); + } + + InputBuffer = inputBuffer; + OutputBuffer = outputBuffer; + + BlockSizeX = blockWidth; + BlockSizeY = blockHeight; + + Levels = new AstcLevel[levels * layers]; + + Success = true; + + TotalBlockCount = 0; + + int currentInputBlock = 0; + int currentOutputOffset = 0; + + for (int i = 0; i < levels; i++) + { + for (int j = 0; j < layers; j++) + { + ref AstcLevel level = ref Levels[i * layers + j]; + + level.ImageSizeX = Math.Max(1, width >> i); + level.ImageSizeY = Math.Max(1, height >> i); + level.ImageSizeZ = Math.Max(1, depth >> i); + + level.BlockCountX = (level.ImageSizeX + blockWidth - 1) / blockWidth; + level.BlockCountY = (level.ImageSizeY + blockHeight - 1) / blockHeight; + + level.StartBlock = currentInputBlock; + level.OutputByteOffset = currentOutputOffset; + + currentInputBlock += level.TotalBlockCount; + currentOutputOffset += level.PixelCount * 4; + } + } + + TotalBlockCount = currentInputBlock; + } + + private struct AstcLevel + { + public int ImageSizeX { get; set; } + public int ImageSizeY { get; set; } + public int ImageSizeZ { get; set; } + + public int BlockCountX { get; set; } + public int BlockCountY { get; set; } + + public int StartBlock { get; set; } + public int OutputByteOffset { get; set; } + + public int TotalBlockCount => BlockCountX * BlockCountY * ImageSizeZ; + public int PixelCount => ImageSizeX * ImageSizeY * ImageSizeZ; + } + + public static int QueryDecompressedSize(int sizeX, int sizeY, int sizeZ, int levelCount, int layerCount) + { + int size = 0; + + for (int i = 0; i < levelCount; i++) + { + int levelSizeX = Math.Max(1, sizeX >> i); + int levelSizeY = Math.Max(1, sizeY >> i); + int levelSizeZ = Math.Max(1, sizeZ >> i); + + size += levelSizeX * levelSizeY * levelSizeZ * layerCount; + } + + return size * 4; + } + + public void ProcessBlock(int index) + { + Buffer16 inputBlock = MemoryMarshal.Cast<byte, Buffer16>(InputBuffer.Span)[index]; + + Span<int> decompressedData = stackalloc int[144]; + + try + { + DecompressBlock(inputBlock, decompressedData, BlockSizeX, BlockSizeY); + } + catch (Exception) + { + Success = false; + } + + Span<byte> decompressedBytes = MemoryMarshal.Cast<int, byte>(decompressedData); + + AstcLevel levelInfo = GetLevelInfo(index); + + WriteDecompressedBlock(decompressedBytes, OutputBuffer.Span.Slice(levelInfo.OutputByteOffset), + index - levelInfo.StartBlock, levelInfo); + } + + private AstcLevel GetLevelInfo(int blockIndex) + { + foreach (AstcLevel levelInfo in Levels) + { + if (blockIndex < levelInfo.StartBlock + levelInfo.TotalBlockCount) + { + return levelInfo; + } + } + + throw new AstcDecoderException("Invalid block index."); + } + + private void WriteDecompressedBlock(ReadOnlySpan<byte> block, Span<byte> outputBuffer, int blockIndex, AstcLevel level) + { + int stride = level.ImageSizeX * 4; + + int blockCordX = blockIndex % level.BlockCountX; + int blockCordY = blockIndex / level.BlockCountX; + + int pixelCordX = blockCordX * BlockSizeX; + int pixelCordY = blockCordY * BlockSizeY; + + int outputPixelsX = Math.Min(pixelCordX + BlockSizeX, level.ImageSizeX) - pixelCordX; + int outputPixelsY = Math.Min(pixelCordY + BlockSizeY, level.ImageSizeY * level.ImageSizeZ) - pixelCordY; + + int outputStart = pixelCordX * 4 + pixelCordY * stride; + int outputOffset = outputStart; + + int inputOffset = 0; + + for (int i = 0; i < outputPixelsY; i++) + { + ReadOnlySpan<byte> blockRow = block.Slice(inputOffset, outputPixelsX * 4); + Span<byte> outputRow = outputBuffer.Slice(outputOffset); + blockRow.CopyTo(outputRow); + + inputOffset += BlockSizeX * 4; + outputOffset += stride; + } + } + + struct TexelWeightParams + { + public int Width; + public int Height; + public int MaxWeight; + public bool DualPlane; + public bool Error; + public bool VoidExtentLdr; + public bool VoidExtentHdr; + + public int GetPackedBitSize() + { + // How many indices do we have? + int indices = Height * Width; + + if (DualPlane) + { + indices *= 2; + } + + IntegerEncoded intEncoded = IntegerEncoded.CreateEncoding(MaxWeight); + + return intEncoded.GetBitLength(indices); + } + + public int GetNumWeightValues() + { + int ret = Width * Height; + + if (DualPlane) + { + ret *= 2; + } + + return ret; + } + } + + public static bool TryDecodeToRgba8( + ReadOnlyMemory<byte> data, + int blockWidth, + int blockHeight, + int width, + int height, + int depth, + int levels, + int layers, + out Span<byte> decoded) + { + byte[] output = new byte[QueryDecompressedSize(width, height, depth, levels, layers)]; + + AstcDecoder decoder = new AstcDecoder(data, output, blockWidth, blockHeight, width, height, depth, levels, layers); + + for (int i = 0; i < decoder.TotalBlockCount; i++) + { + decoder.ProcessBlock(i); + } + + decoded = output; + + return decoder.Success; + } + + public static bool TryDecodeToRgba8( + ReadOnlyMemory<byte> data, + Memory<byte> outputBuffer, + int blockWidth, + int blockHeight, + int width, + int height, + int depth, + int levels, + int layers) + { + AstcDecoder decoder = new AstcDecoder(data, outputBuffer, blockWidth, blockHeight, width, height, depth, levels, layers); + + for (int i = 0; i < decoder.TotalBlockCount; i++) + { + decoder.ProcessBlock(i); + } + + return decoder.Success; + } + + public static bool TryDecodeToRgba8P( + ReadOnlyMemory<byte> data, + Memory<byte> outputBuffer, + int blockWidth, + int blockHeight, + int width, + int height, + int depth, + int levels, + int layers) + { + AstcDecoder decoder = new AstcDecoder(data, outputBuffer, blockWidth, blockHeight, width, height, depth, levels, layers); + + // Lazy parallelism + Enumerable.Range(0, decoder.TotalBlockCount).AsParallel().ForAll(x => decoder.ProcessBlock(x)); + + return decoder.Success; + } + + public static bool TryDecodeToRgba8P( + ReadOnlyMemory<byte> data, + int blockWidth, + int blockHeight, + int width, + int height, + int depth, + int levels, + int layers, + out byte[] decoded) + { + byte[] output = new byte[QueryDecompressedSize(width, height, depth, levels, layers)]; + + AstcDecoder decoder = new AstcDecoder(data, output, blockWidth, blockHeight, width, height, depth, levels, layers); + + Enumerable.Range(0, decoder.TotalBlockCount).AsParallel().ForAll(x => decoder.ProcessBlock(x)); + + decoded = output; + + return decoder.Success; + } + + public static bool DecompressBlock( + Buffer16 inputBlock, + Span<int> outputBuffer, + int blockWidth, + int blockHeight) + { + BitStream128 bitStream = new BitStream128(inputBlock); + + DecodeBlockInfo(ref bitStream, out TexelWeightParams texelParams); + + if (texelParams.Error) + { + throw new AstcDecoderException("Invalid block mode"); + } + + if (texelParams.VoidExtentLdr) + { + FillVoidExtentLdr(ref bitStream, outputBuffer, blockWidth, blockHeight); + + return true; + } + + if (texelParams.VoidExtentHdr) + { + throw new AstcDecoderException("HDR void extent blocks are not supported."); + } + + if (texelParams.Width > blockWidth) + { + throw new AstcDecoderException("Texel weight grid width should be smaller than block width."); + } + + if (texelParams.Height > blockHeight) + { + throw new AstcDecoderException("Texel weight grid height should be smaller than block height."); + } + + // Read num partitions + int numberPartitions = bitStream.ReadBits(2) + 1; + Debug.Assert(numberPartitions <= 4); + + if (numberPartitions == 4 && texelParams.DualPlane) + { + throw new AstcDecoderException("Dual plane mode is incompatible with four partition blocks."); + } + + // Based on the number of partitions, read the color endpoint mode for + // each partition. + + // Determine partitions, partition index, and color endpoint modes + int planeIndices; + int partitionIndex; + + Span<uint> colorEndpointMode = stackalloc uint[4]; + + BitStream128 colorEndpointStream = new BitStream128(); + + // Read extra config data... + uint baseColorEndpointMode = 0; + + if (numberPartitions == 1) + { + colorEndpointMode[0] = (uint)bitStream.ReadBits(4); + partitionIndex = 0; + } + else + { + partitionIndex = bitStream.ReadBits(10); + baseColorEndpointMode = (uint)bitStream.ReadBits(6); + } + + uint baseMode = (baseColorEndpointMode & 3); + + // Remaining bits are color endpoint data... + int numberWeightBits = texelParams.GetPackedBitSize(); + int remainingBits = bitStream.BitsLeft - numberWeightBits; + + // Consider extra bits prior to texel data... + uint extraColorEndpointModeBits = 0; + + if (baseMode != 0) + { + switch (numberPartitions) + { + case 2: extraColorEndpointModeBits += 2; break; + case 3: extraColorEndpointModeBits += 5; break; + case 4: extraColorEndpointModeBits += 8; break; + default: Debug.Assert(false); break; + } + } + + remainingBits -= (int)extraColorEndpointModeBits; + + // Do we have a dual plane situation? + int planeSelectorBits = 0; + + if (texelParams.DualPlane) + { + planeSelectorBits = 2; + } + + remainingBits -= planeSelectorBits; + + // Read color data... + int colorDataBits = remainingBits; + + while (remainingBits > 0) + { + int numberBits = Math.Min(remainingBits, 8); + int bits = bitStream.ReadBits(numberBits); + colorEndpointStream.WriteBits(bits, numberBits); + remainingBits -= 8; + } + + // Read the plane selection bits + planeIndices = bitStream.ReadBits(planeSelectorBits); + + // Read the rest of the CEM + if (baseMode != 0) + { + uint extraColorEndpointMode = (uint)bitStream.ReadBits((int)extraColorEndpointModeBits); + uint tempColorEndpointMode = (extraColorEndpointMode << 6) | baseColorEndpointMode; + tempColorEndpointMode >>= 2; + + Span<bool> c = stackalloc bool[4]; + + for (int i = 0; i < numberPartitions; i++) + { + c[i] = (tempColorEndpointMode & 1) != 0; + tempColorEndpointMode >>= 1; + } + + Span<byte> m = stackalloc byte[4]; + + for (int i = 0; i < numberPartitions; i++) + { + m[i] = (byte)(tempColorEndpointMode & 3); + tempColorEndpointMode >>= 2; + Debug.Assert(m[i] <= 3); + } + + for (int i = 0; i < numberPartitions; i++) + { + colorEndpointMode[i] = baseMode; + if (!(c[i])) colorEndpointMode[i] -= 1; + colorEndpointMode[i] <<= 2; + colorEndpointMode[i] |= m[i]; + } + } + else if (numberPartitions > 1) + { + uint tempColorEndpointMode = baseColorEndpointMode >> 2; + + for (int i = 0; i < numberPartitions; i++) + { + colorEndpointMode[i] = tempColorEndpointMode; + } + } + + // Make sure everything up till here is sane. + for (int i = 0; i < numberPartitions; i++) + { + Debug.Assert(colorEndpointMode[i] < 16); + } + Debug.Assert(bitStream.BitsLeft == texelParams.GetPackedBitSize()); + + // Decode both color data and texel weight data + Span<int> colorValues = stackalloc int[32]; // Four values * two endpoints * four maximum partitions + DecodeColorValues(colorValues, ref colorEndpointStream, colorEndpointMode, numberPartitions, colorDataBits); + + EndPointSet endPoints; + unsafe { _ = &endPoints; } // Skip struct initialization + + int colorValuesPosition = 0; + + for (int i = 0; i < numberPartitions; i++) + { + ComputeEndpoints(endPoints.Get(i), colorValues, colorEndpointMode[i], ref colorValuesPosition); + } + + // Read the texel weight data. + Buffer16 texelWeightData = inputBlock; + + // Reverse everything + for (int i = 0; i < 8; i++) + { + byte a = ReverseByte(texelWeightData[i]); + byte b = ReverseByte(texelWeightData[15 - i]); + + texelWeightData[i] = b; + texelWeightData[15 - i] = a; + } + + // Make sure that higher non-texel bits are set to zero + int clearByteStart = (texelParams.GetPackedBitSize() >> 3) + 1; + texelWeightData[clearByteStart - 1] &= (byte)((1 << (texelParams.GetPackedBitSize() % 8)) - 1); + + int cLen = 16 - clearByteStart; + for (int i = clearByteStart; i < clearByteStart + cLen; i++) texelWeightData[i] = 0; + + IntegerSequence texelWeightValues; + unsafe { _ = &texelWeightValues; } // Skip struct initialization + texelWeightValues.Reset(); + + BitStream128 weightBitStream = new BitStream128(texelWeightData); + + IntegerEncoded.DecodeIntegerSequence(ref texelWeightValues, ref weightBitStream, texelParams.MaxWeight, texelParams.GetNumWeightValues()); + + // Blocks can be at most 12x12, so we can have as many as 144 weights + Weights weights; + unsafe { _ = &weights; } // Skip struct initialization + + UnquantizeTexelWeights(ref weights, ref texelWeightValues, ref texelParams, blockWidth, blockHeight); + + ushort[] table = Bits.Replicate8_16Table; + + // Now that we have endpoints and weights, we can interpolate and generate + // the proper decoding... + for (int j = 0; j < blockHeight; j++) + { + for (int i = 0; i < blockWidth; i++) + { + int partition = Select2dPartition(partitionIndex, i, j, numberPartitions, ((blockHeight * blockWidth) < 32)); + Debug.Assert(partition < numberPartitions); + + AstcPixel pixel = new AstcPixel(); + for (int component = 0; component < 4; component++) + { + int component0 = endPoints.Get(partition)[0].GetComponent(component); + component0 = table[component0]; + int component1 = endPoints.Get(partition)[1].GetComponent(component); + component1 = table[component1]; + + int plane = 0; + + if (texelParams.DualPlane && (((planeIndices + 1) & 3) == component)) + { + plane = 1; + } + + int weight = weights.Get(plane)[j * blockWidth + i]; + int finalComponent = (component0 * (64 - weight) + component1 * weight + 32) / 64; + + if (finalComponent == 65535) + { + pixel.SetComponent(component, 255); + } + else + { + double finalComponentFloat = finalComponent; + pixel.SetComponent(component, (int)(255.0 * (finalComponentFloat / 65536.0) + 0.5)); + } + } + + outputBuffer[j * blockWidth + i] = pixel.Pack(); + } + } + + return true; + } + + // Blocks can be at most 12x12, so we can have as many as 144 weights + [StructLayout(LayoutKind.Sequential, Size = 144 * sizeof(int) * Count)] + private struct Weights + { + private int _start; + + public const int Count = 2; + + public Span<int> this[int index] + { + get + { + if ((uint)index >= Count) + { + throw new ArgumentOutOfRangeException(); + } + + ref int start = ref Unsafe.Add(ref _start, index * 144); + + return MemoryMarshal.CreateSpan(ref start, 144); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Span<int> Get(int index) + { + ref int start = ref Unsafe.Add(ref _start, index * 144); + + return MemoryMarshal.CreateSpan(ref start, 144); + } + } + + private static int Select2dPartition(int seed, int x, int y, int partitionCount, bool isSmallBlock) + { + return SelectPartition(seed, x, y, 0, partitionCount, isSmallBlock); + } + + private static int SelectPartition(int seed, int x, int y, int z, int partitionCount, bool isSmallBlock) + { + if (partitionCount == 1) + { + return 0; + } + + if (isSmallBlock) + { + x <<= 1; + y <<= 1; + z <<= 1; + } + + seed += (partitionCount - 1) * 1024; + + int rightNum = Hash52((uint)seed); + byte seed01 = (byte)(rightNum & 0xF); + byte seed02 = (byte)((rightNum >> 4) & 0xF); + byte seed03 = (byte)((rightNum >> 8) & 0xF); + byte seed04 = (byte)((rightNum >> 12) & 0xF); + byte seed05 = (byte)((rightNum >> 16) & 0xF); + byte seed06 = (byte)((rightNum >> 20) & 0xF); + byte seed07 = (byte)((rightNum >> 24) & 0xF); + byte seed08 = (byte)((rightNum >> 28) & 0xF); + byte seed09 = (byte)((rightNum >> 18) & 0xF); + byte seed10 = (byte)((rightNum >> 22) & 0xF); + byte seed11 = (byte)((rightNum >> 26) & 0xF); + byte seed12 = (byte)(((rightNum >> 30) | (rightNum << 2)) & 0xF); + + seed01 *= seed01; seed02 *= seed02; + seed03 *= seed03; seed04 *= seed04; + seed05 *= seed05; seed06 *= seed06; + seed07 *= seed07; seed08 *= seed08; + seed09 *= seed09; seed10 *= seed10; + seed11 *= seed11; seed12 *= seed12; + + int seedHash1, seedHash2, seedHash3; + + if ((seed & 1) != 0) + { + seedHash1 = (seed & 2) != 0 ? 4 : 5; + seedHash2 = (partitionCount == 3) ? 6 : 5; + } + else + { + seedHash1 = (partitionCount == 3) ? 6 : 5; + seedHash2 = (seed & 2) != 0 ? 4 : 5; + } + + seedHash3 = (seed & 0x10) != 0 ? seedHash1 : seedHash2; + + seed01 >>= seedHash1; seed02 >>= seedHash2; seed03 >>= seedHash1; seed04 >>= seedHash2; + seed05 >>= seedHash1; seed06 >>= seedHash2; seed07 >>= seedHash1; seed08 >>= seedHash2; + seed09 >>= seedHash3; seed10 >>= seedHash3; seed11 >>= seedHash3; seed12 >>= seedHash3; + + int a = seed01 * x + seed02 * y + seed11 * z + (rightNum >> 14); + int b = seed03 * x + seed04 * y + seed12 * z + (rightNum >> 10); + int c = seed05 * x + seed06 * y + seed09 * z + (rightNum >> 6); + int d = seed07 * x + seed08 * y + seed10 * z + (rightNum >> 2); + + a &= 0x3F; b &= 0x3F; c &= 0x3F; d &= 0x3F; + + if (partitionCount < 4) d = 0; + if (partitionCount < 3) c = 0; + + if (a >= b && a >= c && a >= d) return 0; + else if (b >= c && b >= d) return 1; + else if (c >= d) return 2; + return 3; + } + + static int Hash52(uint val) + { + val ^= val >> 15; val -= val << 17; val += val << 7; val += val << 4; + val ^= val >> 5; val += val << 16; val ^= val >> 7; val ^= val >> 3; + val ^= val << 6; val ^= val >> 17; + + return (int)val; + } + + static void UnquantizeTexelWeights( + ref Weights outputBuffer, + ref IntegerSequence weights, + ref TexelWeightParams texelParams, + int blockWidth, + int blockHeight) + { + int weightIndices = 0; + Weights unquantized; + unsafe { _ = &unquantized; } // Skip struct initialization + + Span<IntegerEncoded> weightsList = weights.List; + Span<int> unquantized0 = unquantized[0]; + Span<int> unquantized1 = unquantized[1]; + + for (int i = 0; i < weightsList.Length; i++) + { + unquantized0[weightIndices] = UnquantizeTexelWeight(weightsList[i]); + + if (texelParams.DualPlane) + { + i++; + unquantized1[weightIndices] = UnquantizeTexelWeight(weightsList[i]); + + if (i == weightsList.Length) + { + break; + } + } + + if (++weightIndices >= texelParams.Width * texelParams.Height) break; + } + + // Do infill if necessary (Section C.2.18) ... + int ds = (1024 + blockWidth / 2) / (blockWidth - 1); + int dt = (1024 + blockHeight / 2) / (blockHeight - 1); + + int planeScale = texelParams.DualPlane ? 2 : 1; + + for (int plane = 0; plane < planeScale; plane++) + { + Span<int> unquantizedSpan = unquantized.Get(plane); + Span<int> outputSpan = outputBuffer.Get(plane); + + for (int t = 0; t < blockHeight; t++) + { + for (int s = 0; s < blockWidth; s++) + { + int cs = ds * s; + int ct = dt * t; + + int gs = (cs * (texelParams.Width - 1) + 32) >> 6; + int gt = (ct * (texelParams.Height - 1) + 32) >> 6; + + int js = gs >> 4; + int fs = gs & 0xF; + + int jt = gt >> 4; + int ft = gt & 0x0F; + + int w11 = (fs * ft + 8) >> 4; + + int v0 = js + jt * texelParams.Width; + + int weight = 8; + + int wxh = texelParams.Width * texelParams.Height; + + if (v0 < wxh) + { + weight += unquantizedSpan[v0] * (16 - fs - ft + w11); + + if (v0 + 1 < wxh) + { + weight += unquantizedSpan[v0 + 1] * (fs - w11); + } + } + + if (v0 + texelParams.Width < wxh) + { + weight += unquantizedSpan[v0 + texelParams.Width] * (ft - w11); + + if (v0 + texelParams.Width + 1 < wxh) + { + weight += unquantizedSpan[v0 + texelParams.Width + 1] * w11; + } + } + + outputSpan[t * blockWidth + s] = weight >> 4; + } + } + } + } + + static int UnquantizeTexelWeight(IntegerEncoded intEncoded) + { + int bitValue = intEncoded.BitValue; + int bitLength = intEncoded.NumberBits; + + int a = Bits.Replicate1_7(bitValue & 1); + int b = 0, c = 0, d = 0; + + int result = 0; + + switch (intEncoded.GetEncoding()) + { + case IntegerEncoded.EIntegerEncoding.JustBits: + result = Bits.Replicate(bitValue, bitLength, 6); + break; + + case IntegerEncoded.EIntegerEncoding.Trit: + { + d = intEncoded.TritValue; + Debug.Assert(d < 3); + + switch (bitLength) + { + case 0: + { + result = d switch + { + 0 => 0, + 1 => 32, + 2 => 63, + _ => 0 + }; + + break; + } + + case 1: + { + c = 50; + break; + } + + case 2: + { + c = 23; + int b2 = (bitValue >> 1) & 1; + b = (b2 << 6) | (b2 << 2) | b2; + + break; + } + + case 3: + { + c = 11; + int cb = (bitValue >> 1) & 3; + b = (cb << 5) | cb; + + break; + } + + default: + throw new AstcDecoderException("Invalid trit encoding for texel weight."); + } + + break; + } + + case IntegerEncoded.EIntegerEncoding.Quint: + { + d = intEncoded.QuintValue; + Debug.Assert(d < 5); + + switch (bitLength) + { + case 0: + { + result = d switch + { + 0 => 0, + 1 => 16, + 2 => 32, + 3 => 47, + 4 => 63, + _ => 0 + }; + + break; + } + + case 1: + { + c = 28; + + break; + } + + case 2: + { + c = 13; + int b2 = (bitValue >> 1) & 1; + b = (b2 << 6) | (b2 << 1); + + break; + } + + default: + throw new AstcDecoderException("Invalid quint encoding for texel weight."); + } + + break; + } + } + + if (intEncoded.GetEncoding() != IntegerEncoded.EIntegerEncoding.JustBits && bitLength > 0) + { + // Decode the value... + result = d * c + b; + result ^= a; + result = (a & 0x20) | (result >> 2); + } + + Debug.Assert(result < 64); + + // Change from [0,63] to [0,64] + if (result > 32) + { + result += 1; + } + + return result; + } + + static byte ReverseByte(byte b) + { + // Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits + return (byte)((((b) * 0x80200802L) & 0x0884422110L) * 0x0101010101L >> 32); + } + + static Span<uint> ReadUintColorValues(int number, Span<int> colorValues, ref int colorValuesPosition) + { + Span<int> ret = colorValues.Slice(colorValuesPosition, number); + + colorValuesPosition += number; + + return MemoryMarshal.Cast<int, uint>(ret); + } + + static Span<int> ReadIntColorValues(int number, Span<int> colorValues, ref int colorValuesPosition) + { + Span<int> ret = colorValues.Slice(colorValuesPosition, number); + + colorValuesPosition += number; + + return ret; + } + + static void ComputeEndpoints( + Span<AstcPixel> endPoints, + Span<int> colorValues, + uint colorEndpointMode, + ref int colorValuesPosition) + { + switch (colorEndpointMode) + { + case 0: + { + Span<uint> val = ReadUintColorValues(2, colorValues, ref colorValuesPosition); + + endPoints[0] = new AstcPixel(0xFF, (short)val[0], (short)val[0], (short)val[0]); + endPoints[1] = new AstcPixel(0xFF, (short)val[1], (short)val[1], (short)val[1]); + + break; + } + + + case 1: + { + Span<uint> val = ReadUintColorValues(2, colorValues, ref colorValuesPosition); + int l0 = (int)((val[0] >> 2) | (val[1] & 0xC0)); + int l1 = (int)Math.Max(l0 + (val[1] & 0x3F), 0xFFU); + + endPoints[0] = new AstcPixel(0xFF, (short)l0, (short)l0, (short)l0); + endPoints[1] = new AstcPixel(0xFF, (short)l1, (short)l1, (short)l1); + + break; + } + + case 4: + { + Span<uint> val = ReadUintColorValues(4, colorValues, ref colorValuesPosition); + + endPoints[0] = new AstcPixel((short)val[2], (short)val[0], (short)val[0], (short)val[0]); + endPoints[1] = new AstcPixel((short)val[3], (short)val[1], (short)val[1], (short)val[1]); + + break; + } + + case 5: + { + Span<int> val = ReadIntColorValues(4, colorValues, ref colorValuesPosition); + + Bits.BitTransferSigned(ref val[1], ref val[0]); + Bits.BitTransferSigned(ref val[3], ref val[2]); + + endPoints[0] = new AstcPixel((short)val[2], (short)val[0], (short)val[0], (short)val[0]); + endPoints[1] = new AstcPixel((short)(val[2] + val[3]), (short)(val[0] + val[1]), (short)(val[0] + val[1]), (short)(val[0] + val[1])); + + endPoints[0].ClampByte(); + endPoints[1].ClampByte(); + + break; + } + + case 6: + { + Span<uint> val = ReadUintColorValues(4, colorValues, ref colorValuesPosition); + + endPoints[0] = new AstcPixel(0xFF, (short)(val[0] * val[3] >> 8), (short)(val[1] * val[3] >> 8), (short)(val[2] * val[3] >> 8)); + endPoints[1] = new AstcPixel(0xFF, (short)val[0], (short)val[1], (short)val[2]); + + break; + } + + case 8: + { + Span<uint> val = ReadUintColorValues(6, colorValues, ref colorValuesPosition); + + if (val[1] + val[3] + val[5] >= val[0] + val[2] + val[4]) + { + endPoints[0] = new AstcPixel(0xFF, (short)val[0], (short)val[2], (short)val[4]); + endPoints[1] = new AstcPixel(0xFF, (short)val[1], (short)val[3], (short)val[5]); + } + else + { + endPoints[0] = AstcPixel.BlueContract(0xFF, (short)val[1], (short)val[3], (short)val[5]); + endPoints[1] = AstcPixel.BlueContract(0xFF, (short)val[0], (short)val[2], (short)val[4]); + } + + break; + } + + case 9: + { + Span<int> val = ReadIntColorValues(6, colorValues, ref colorValuesPosition); + + Bits.BitTransferSigned(ref val[1], ref val[0]); + Bits.BitTransferSigned(ref val[3], ref val[2]); + Bits.BitTransferSigned(ref val[5], ref val[4]); + + if (val[1] + val[3] + val[5] >= 0) + { + endPoints[0] = new AstcPixel(0xFF, (short)val[0], (short)val[2], (short)val[4]); + endPoints[1] = new AstcPixel(0xFF, (short)(val[0] + val[1]), (short)(val[2] + val[3]), (short)(val[4] + val[5])); + } + else + { + endPoints[0] = AstcPixel.BlueContract(0xFF, val[0] + val[1], val[2] + val[3], val[4] + val[5]); + endPoints[1] = AstcPixel.BlueContract(0xFF, val[0], val[2], val[4]); + } + + endPoints[0].ClampByte(); + endPoints[1].ClampByte(); + + break; + } + + case 10: + { + Span<uint> val = ReadUintColorValues(6, colorValues, ref colorValuesPosition); + + endPoints[0] = new AstcPixel((short)val[4], (short)(val[0] * val[3] >> 8), (short)(val[1] * val[3] >> 8), (short)(val[2] * val[3] >> 8)); + endPoints[1] = new AstcPixel((short)val[5], (short)val[0], (short)val[1], (short)val[2]); + + break; + } + + case 12: + { + Span<uint> val = ReadUintColorValues(8, colorValues, ref colorValuesPosition); + + if (val[1] + val[3] + val[5] >= val[0] + val[2] + val[4]) + { + endPoints[0] = new AstcPixel((short)val[6], (short)val[0], (short)val[2], (short)val[4]); + endPoints[1] = new AstcPixel((short)val[7], (short)val[1], (short)val[3], (short)val[5]); + } + else + { + endPoints[0] = AstcPixel.BlueContract((short)val[7], (short)val[1], (short)val[3], (short)val[5]); + endPoints[1] = AstcPixel.BlueContract((short)val[6], (short)val[0], (short)val[2], (short)val[4]); + } + + break; + } + + case 13: + { + Span<int> val = ReadIntColorValues(8, colorValues, ref colorValuesPosition); + + Bits.BitTransferSigned(ref val[1], ref val[0]); + Bits.BitTransferSigned(ref val[3], ref val[2]); + Bits.BitTransferSigned(ref val[5], ref val[4]); + Bits.BitTransferSigned(ref val[7], ref val[6]); + + if (val[1] + val[3] + val[5] >= 0) + { + endPoints[0] = new AstcPixel((short)val[6], (short)val[0], (short)val[2], (short)val[4]); + endPoints[1] = new AstcPixel((short)(val[7] + val[6]), (short)(val[0] + val[1]), (short)(val[2] + val[3]), (short)(val[4] + val[5])); + } + else + { + endPoints[0] = AstcPixel.BlueContract(val[6] + val[7], val[0] + val[1], val[2] + val[3], val[4] + val[5]); + endPoints[1] = AstcPixel.BlueContract(val[6], val[0], val[2], val[4]); + } + + endPoints[0].ClampByte(); + endPoints[1].ClampByte(); + + break; + } + + default: + throw new AstcDecoderException("Unsupported color endpoint mode (is it HDR?)"); + } + } + + static void DecodeColorValues( + Span<int> outputValues, + ref BitStream128 colorBitStream, + Span<uint> modes, + int numberPartitions, + int numberBitsForColorData) + { + // First figure out how many color values we have + int numberValues = 0; + + for (int i = 0; i < numberPartitions; i++) + { + numberValues += (int)((modes[i] >> 2) + 1) << 1; + } + + // Then based on the number of values and the remaining number of bits, + // figure out the max value for each of them... + int range = 256; + + while (--range > 0) + { + IntegerEncoded intEncoded = IntegerEncoded.CreateEncoding(range); + int bitLength = intEncoded.GetBitLength(numberValues); + + if (bitLength <= numberBitsForColorData) + { + // Find the smallest possible range that matches the given encoding + while (--range > 0) + { + IntegerEncoded newIntEncoded = IntegerEncoded.CreateEncoding(range); + if (!newIntEncoded.MatchesEncoding(intEncoded)) + { + break; + } + } + + // Return to last matching range. + range++; + break; + } + } + + // We now have enough to decode our integer sequence. + IntegerSequence integerEncodedSequence; + unsafe { _ = &integerEncodedSequence; } // Skip struct initialization + integerEncodedSequence.Reset(); + + IntegerEncoded.DecodeIntegerSequence(ref integerEncodedSequence, ref colorBitStream, range, numberValues); + + // Once we have the decoded values, we need to dequantize them to the 0-255 range + // This procedure is outlined in ASTC spec C.2.13 + int outputIndices = 0; + + foreach (ref IntegerEncoded intEncoded in integerEncodedSequence.List) + { + int bitLength = intEncoded.NumberBits; + int bitValue = intEncoded.BitValue; + + Debug.Assert(bitLength >= 1); + + int a = 0, b = 0, c = 0, d = 0; + // A is just the lsb replicated 9 times. + a = Bits.Replicate(bitValue & 1, 1, 9); + + switch (intEncoded.GetEncoding()) + { + case IntegerEncoded.EIntegerEncoding.JustBits: + { + outputValues[outputIndices++] = Bits.Replicate(bitValue, bitLength, 8); + + break; + } + + case IntegerEncoded.EIntegerEncoding.Trit: + { + d = intEncoded.TritValue; + + switch (bitLength) + { + case 1: + { + c = 204; + + break; + } + + case 2: + { + c = 93; + // B = b000b0bb0 + int b2 = (bitValue >> 1) & 1; + b = (b2 << 8) | (b2 << 4) | (b2 << 2) | (b2 << 1); + + break; + } + + case 3: + { + c = 44; + // B = cb000cbcb + int cb = (bitValue >> 1) & 3; + b = (cb << 7) | (cb << 2) | cb; + + break; + } + + + case 4: + { + c = 22; + // B = dcb000dcb + int dcb = (bitValue >> 1) & 7; + b = (dcb << 6) | dcb; + + break; + } + + case 5: + { + c = 11; + // B = edcb000ed + int edcb = (bitValue >> 1) & 0xF; + b = (edcb << 5) | (edcb >> 2); + + break; + } + + case 6: + { + c = 5; + // B = fedcb000f + int fedcb = (bitValue >> 1) & 0x1F; + b = (fedcb << 4) | (fedcb >> 4); + + break; + } + + default: + throw new AstcDecoderException("Unsupported trit encoding for color values."); + } + + break; + } + + case IntegerEncoded.EIntegerEncoding.Quint: + { + d = intEncoded.QuintValue; + + switch (bitLength) + { + case 1: + { + c = 113; + + break; + } + + case 2: + { + c = 54; + // B = b0000bb00 + int b2 = (bitValue >> 1) & 1; + b = (b2 << 8) | (b2 << 3) | (b2 << 2); + + break; + } + + case 3: + { + c = 26; + // B = cb0000cbc + int cb = (bitValue >> 1) & 3; + b = (cb << 7) | (cb << 1) | (cb >> 1); + + break; + } + + case 4: + { + c = 13; + // B = dcb0000dc + int dcb = (bitValue >> 1) & 7; + b = (dcb << 6) | (dcb >> 1); + + break; + } + + case 5: + { + c = 6; + // B = edcb0000e + int edcb = (bitValue >> 1) & 0xF; + b = (edcb << 5) | (edcb >> 3); + + break; + } + + default: + throw new AstcDecoderException("Unsupported quint encoding for color values."); + } + break; + } + } + + if (intEncoded.GetEncoding() != IntegerEncoded.EIntegerEncoding.JustBits) + { + int T = d * c + b; + T ^= a; + T = (a & 0x80) | (T >> 2); + + outputValues[outputIndices++] = T; + } + } + + // Make sure that each of our values is in the proper range... + for (int i = 0; i < numberValues; i++) + { + Debug.Assert(outputValues[i] <= 255); + } + } + + static void FillVoidExtentLdr(ref BitStream128 bitStream, Span<int> outputBuffer, int blockWidth, int blockHeight) + { + // Don't actually care about the void extent, just read the bits... + for (int i = 0; i < 4; ++i) + { + bitStream.ReadBits(13); + } + + // Decode the RGBA components and renormalize them to the range [0, 255] + ushort r = (ushort)bitStream.ReadBits(16); + ushort g = (ushort)bitStream.ReadBits(16); + ushort b = (ushort)bitStream.ReadBits(16); + ushort a = (ushort)bitStream.ReadBits(16); + + int rgba = (r >> 8) | (g & 0xFF00) | ((b) & 0xFF00) << 8 | ((a) & 0xFF00) << 16; + + for (int j = 0; j < blockHeight; j++) + { + for (int i = 0; i < blockWidth; i++) + { + outputBuffer[j * blockWidth + i] = rgba; + } + } + } + + static void DecodeBlockInfo(ref BitStream128 bitStream, out TexelWeightParams texelParams) + { + texelParams = new TexelWeightParams(); + + // Read the entire block mode all at once + ushort modeBits = (ushort)bitStream.ReadBits(11); + + // Does this match the void extent block mode? + if ((modeBits & 0x01FF) == 0x1FC) + { + if ((modeBits & 0x200) != 0) + { + texelParams.VoidExtentHdr = true; + } + else + { + texelParams.VoidExtentLdr = true; + } + + // Next two bits must be one. + if ((modeBits & 0x400) == 0 || bitStream.ReadBits(1) == 0) + { + texelParams.Error = true; + } + + return; + } + + // First check if the last four bits are zero + if ((modeBits & 0xF) == 0) + { + texelParams.Error = true; + + return; + } + + // If the last two bits are zero, then if bits + // [6-8] are all ones, this is also reserved. + if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) + { + texelParams.Error = true; + + return; + } + + // Otherwise, there is no error... Figure out the layout + // of the block mode. Layout is determined by a number + // between 0 and 9 corresponding to table C.2.8 of the + // ASTC spec. + int layout; + + if ((modeBits & 0x1) != 0 || (modeBits & 0x2) != 0) + { + // layout is in [0-4] + if ((modeBits & 0x8) != 0) + { + // layout is in [2-4] + if ((modeBits & 0x4) != 0) + { + // layout is in [3-4] + if ((modeBits & 0x100) != 0) + { + layout = 4; + } + else + { + layout = 3; + } + } + else + { + layout = 2; + } + } + else + { + // layout is in [0-1] + if ((modeBits & 0x4) != 0) + { + layout = 1; + } + else + { + layout = 0; + } + } + } + else + { + // layout is in [5-9] + if ((modeBits & 0x100) != 0) + { + // layout is in [7-9] + if ((modeBits & 0x80) != 0) + { + // layout is in [7-8] + Debug.Assert((modeBits & 0x40) == 0); + + if ((modeBits & 0x20) != 0) + { + layout = 8; + } + else + { + layout = 7; + } + } + else + { + layout = 9; + } + } + else + { + // layout is in [5-6] + if ((modeBits & 0x80) != 0) + { + layout = 6; + } + else + { + layout = 5; + } + } + } + + Debug.Assert(layout < 10); + + // Determine R + int r = (modeBits >> 4) & 1; + if (layout < 5) + { + r |= (modeBits & 0x3) << 1; + } + else + { + r |= (modeBits & 0xC) >> 1; + } + + Debug.Assert(2 <= r && r <= 7); + + // Determine width & height + switch (layout) + { + case 0: + { + int a = (modeBits >> 5) & 0x3; + int b = (modeBits >> 7) & 0x3; + + texelParams.Width = b + 4; + texelParams.Height = a + 2; + + break; + } + + case 1: + { + int a = (modeBits >> 5) & 0x3; + int b = (modeBits >> 7) & 0x3; + + texelParams.Width = b + 8; + texelParams.Height = a + 2; + + break; + } + + case 2: + { + int a = (modeBits >> 5) & 0x3; + int b = (modeBits >> 7) & 0x3; + + texelParams.Width = a + 2; + texelParams.Height = b + 8; + + break; + } + + case 3: + { + int a = (modeBits >> 5) & 0x3; + int b = (modeBits >> 7) & 0x1; + + texelParams.Width = a + 2; + texelParams.Height = b + 6; + + break; + } + + case 4: + { + int a = (modeBits >> 5) & 0x3; + int b = (modeBits >> 7) & 0x1; + + texelParams.Width = b + 2; + texelParams.Height = a + 2; + + break; + } + + case 5: + { + int a = (modeBits >> 5) & 0x3; + + texelParams.Width = 12; + texelParams.Height = a + 2; + + break; + } + + case 6: + { + int a = (modeBits >> 5) & 0x3; + + texelParams.Width = a + 2; + texelParams.Height = 12; + + break; + } + + case 7: + { + texelParams.Width = 6; + texelParams.Height = 10; + + break; + } + + case 8: + { + texelParams.Width = 10; + texelParams.Height = 6; + break; + } + + case 9: + { + int a = (modeBits >> 5) & 0x3; + int b = (modeBits >> 9) & 0x3; + + texelParams.Width = a + 6; + texelParams.Height = b + 6; + + break; + } + + default: + // Don't know this layout... + texelParams.Error = true; + break; + } + + // Determine whether or not we're using dual planes + // and/or high precision layouts. + bool d = ((layout != 9) && ((modeBits & 0x400) != 0)); + bool h = (layout != 9) && ((modeBits & 0x200) != 0); + + if (h) + { + ReadOnlySpan<byte> maxWeights = new byte[] { 9, 11, 15, 19, 23, 31 }; + texelParams.MaxWeight = maxWeights[r - 2]; + } + else + { + ReadOnlySpan<byte> maxWeights = new byte[] { 1, 2, 3, 4, 5, 7 }; + texelParams.MaxWeight = maxWeights[r - 2]; + } + + texelParams.DualPlane = d; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Astc/AstcDecoderException.cs b/src/Ryujinx.Graphics.Texture/Astc/AstcDecoderException.cs new file mode 100644 index 00000000..fdc48267 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Astc/AstcDecoderException.cs @@ -0,0 +1,9 @@ +using System; + +namespace Ryujinx.Graphics.Texture.Astc +{ + public class AstcDecoderException : Exception + { + public AstcDecoderException(string exMsg) : base(exMsg) { } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/Astc/AstcPixel.cs b/src/Ryujinx.Graphics.Texture/Astc/AstcPixel.cs new file mode 100644 index 00000000..13197714 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Astc/AstcPixel.cs @@ -0,0 +1,68 @@ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture.Astc +{ + [StructLayout(LayoutKind.Sequential)] + struct AstcPixel + { + internal const int StructSize = 12; + + public short A; + public short R; + public short G; + public short B; + + private uint _bitDepthInt; + + private Span<byte> BitDepth => MemoryMarshal.CreateSpan(ref Unsafe.As<uint, byte>(ref _bitDepthInt), 4); + private Span<short> Components => MemoryMarshal.CreateSpan(ref A, 4); + + public AstcPixel(short a, short r, short g, short b) + { + A = a; + R = r; + G = g; + B = b; + + _bitDepthInt = 0x08080808; + } + + public void ClampByte() + { + R = Math.Min(Math.Max(R, (short)0), (short)255); + G = Math.Min(Math.Max(G, (short)0), (short)255); + B = Math.Min(Math.Max(B, (short)0), (short)255); + A = Math.Min(Math.Max(A, (short)0), (short)255); + } + + public short GetComponent(int index) + { + return Components[index]; + } + + public void SetComponent(int index, int value) + { + Components[index] = (short)value; + } + + public int Pack() + { + return A << 24 | + B << 16 | + G << 8 | + R << 0; + } + + // Adds more precision to the blue channel as described + // in C.2.14 + public static AstcPixel BlueContract(int a, int r, int g, int b) + { + return new AstcPixel((short)(a), + (short)((r + b) >> 1), + (short)((g + b) >> 1), + (short)(b)); + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Astc/BitStream128.cs b/src/Ryujinx.Graphics.Texture/Astc/BitStream128.cs new file mode 100644 index 00000000..3bf9769f --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Astc/BitStream128.cs @@ -0,0 +1,72 @@ +using Ryujinx.Common.Utilities; +using System; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Texture.Astc +{ + public struct BitStream128 + { + private Buffer16 _data; + public int BitsLeft { get; set; } + + public BitStream128(Buffer16 data) + { + _data = data; + BitsLeft = 128; + } + + public int ReadBits(int bitCount) + { + Debug.Assert(bitCount < 32); + + if (bitCount == 0) + { + return 0; + } + + int mask = (1 << bitCount) - 1; + int value = _data.As<int>() & mask; + + Span<ulong> span = _data.AsSpan<ulong>(); + + ulong carry = span[1] << (64 - bitCount); + span[0] = (span[0] >> bitCount) | carry; + span[1] >>= bitCount; + + BitsLeft -= bitCount; + + return value; + } + + public void WriteBits(int value, int bitCount) + { + Debug.Assert(bitCount < 32); + + if (bitCount == 0) return; + + ulong maskedValue = (uint)(value & ((1 << bitCount) - 1)); + + Span<ulong> span = _data.AsSpan<ulong>(); + + if (BitsLeft < 64) + { + ulong lowMask = maskedValue << BitsLeft; + span[0] |= lowMask; + } + + if (BitsLeft + bitCount > 64) + { + if (BitsLeft > 64) + { + span[1] |= maskedValue << (BitsLeft - 64); + } + else + { + span[1] |= maskedValue >> (64 - BitsLeft); + } + } + + BitsLeft += bitCount; + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/Astc/Bits.cs b/src/Ryujinx.Graphics.Texture/Astc/Bits.cs new file mode 100644 index 00000000..b140a20a --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Astc/Bits.cs @@ -0,0 +1,66 @@ +namespace Ryujinx.Graphics.Texture.Astc +{ + internal static class Bits + { + public static readonly ushort[] Replicate8_16Table; + public static readonly byte[] Replicate1_7Table; + + static Bits() + { + Replicate8_16Table = new ushort[0x200]; + Replicate1_7Table = new byte[0x200]; + + for (int i = 0; i < 0x200; i++) + { + Replicate8_16Table[i] = (ushort)Replicate(i, 8, 16); + Replicate1_7Table[i] = (byte)Replicate(i, 1, 7); + } + } + + public static int Replicate8_16(int value) + { + return Replicate8_16Table[value]; + } + + public static int Replicate1_7(int value) + { + return Replicate1_7Table[value]; + } + + public static int Replicate(int value, int numberBits, int toBit) + { + if (numberBits == 0) return 0; + if (toBit == 0) return 0; + + int tempValue = value & ((1 << numberBits) - 1); + int retValue = tempValue; + int resLength = numberBits; + + while (resLength < toBit) + { + int comp = 0; + if (numberBits > toBit - resLength) + { + int newShift = toBit - resLength; + comp = numberBits - newShift; + numberBits = newShift; + } + retValue <<= numberBits; + retValue |= tempValue >> comp; + resLength += numberBits; + } + + return retValue; + } + + // Transfers a bit as described in C.2.14 + public static void BitTransferSigned(ref int a, ref int b) + { + b >>= 1; + b |= a & 0x80; + a >>= 1; + a &= 0x3F; + if ((a & 0x20) != 0) a -= 0x40; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Astc/EndPointSet.cs b/src/Ryujinx.Graphics.Texture/Astc/EndPointSet.cs new file mode 100644 index 00000000..45e61ca2 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Astc/EndPointSet.cs @@ -0,0 +1,23 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture.Astc +{ + [StructLayout(LayoutKind.Sequential, Size = AstcPixel.StructSize * 8)] + internal struct EndPointSet + { + private AstcPixel _start; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Span<AstcPixel> Get(int index) + { + Debug.Assert(index < 4); + + ref AstcPixel start = ref Unsafe.Add(ref _start, index * 2); + + return MemoryMarshal.CreateSpan(ref start, 2); + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Astc/IntegerEncoded.cs b/src/Ryujinx.Graphics.Texture/Astc/IntegerEncoded.cs new file mode 100644 index 00000000..065de46b --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Astc/IntegerEncoded.cs @@ -0,0 +1,345 @@ +using System; +using System.Numerics; + +namespace Ryujinx.Graphics.Texture.Astc +{ + internal struct IntegerEncoded + { + internal const int StructSize = 8; + private static readonly IntegerEncoded[] Encodings; + + public enum EIntegerEncoding : byte + { + JustBits, + Quint, + Trit + } + + EIntegerEncoding _encoding; + public byte NumberBits { get; private set; } + public byte TritValue { get; private set; } + public byte QuintValue { get; private set; } + public int BitValue { get; private set; } + + static IntegerEncoded() + { + Encodings = new IntegerEncoded[0x100]; + + for (int i = 0; i < Encodings.Length; i++) + { + Encodings[i] = CreateEncodingCalc(i); + } + } + + public IntegerEncoded(EIntegerEncoding encoding, int numBits) + { + _encoding = encoding; + NumberBits = (byte)numBits; + BitValue = 0; + TritValue = 0; + QuintValue = 0; + } + + public bool MatchesEncoding(IntegerEncoded other) + { + return _encoding == other._encoding && NumberBits == other.NumberBits; + } + + public EIntegerEncoding GetEncoding() + { + return _encoding; + } + + public int GetBitLength(int numberVals) + { + int totalBits = NumberBits * numberVals; + if (_encoding == EIntegerEncoding.Trit) + { + totalBits += (numberVals * 8 + 4) / 5; + } + else if (_encoding == EIntegerEncoding.Quint) + { + totalBits += (numberVals * 7 + 2) / 3; + } + return totalBits; + } + + public static IntegerEncoded CreateEncoding(int maxVal) + { + return Encodings[maxVal]; + } + + private static IntegerEncoded CreateEncodingCalc(int maxVal) + { + while (maxVal > 0) + { + int check = maxVal + 1; + + // Is maxVal a power of two? + if ((check & (check - 1)) == 0) + { + return new IntegerEncoded(EIntegerEncoding.JustBits, BitOperations.PopCount((uint)maxVal)); + } + + // Is maxVal of the type 3*2^n - 1? + if ((check % 3 == 0) && ((check / 3) & ((check / 3) - 1)) == 0) + { + return new IntegerEncoded(EIntegerEncoding.Trit, BitOperations.PopCount((uint)(check / 3 - 1))); + } + + // Is maxVal of the type 5*2^n - 1? + if ((check % 5 == 0) && ((check / 5) & ((check / 5) - 1)) == 0) + { + return new IntegerEncoded(EIntegerEncoding.Quint, BitOperations.PopCount((uint)(check / 5 - 1))); + } + + // Apparently it can't be represented with a bounded integer sequence... + // just iterate. + maxVal--; + } + + return new IntegerEncoded(EIntegerEncoding.JustBits, 0); + } + + public static void DecodeTritBlock( + ref BitStream128 bitStream, + ref IntegerSequence listIntegerEncoded, + int numberBitsPerValue) + { + // Implement the algorithm in section C.2.12 + Span<int> m = stackalloc int[5]; + + m[0] = bitStream.ReadBits(numberBitsPerValue); + int encoded = bitStream.ReadBits(2); + m[1] = bitStream.ReadBits(numberBitsPerValue); + encoded |= bitStream.ReadBits(2) << 2; + m[2] = bitStream.ReadBits(numberBitsPerValue); + encoded |= bitStream.ReadBits(1) << 4; + m[3] = bitStream.ReadBits(numberBitsPerValue); + encoded |= bitStream.ReadBits(2) << 5; + m[4] = bitStream.ReadBits(numberBitsPerValue); + encoded |= bitStream.ReadBits(1) << 7; + + ReadOnlySpan<byte> encodings = GetTritEncoding(encoded); + + IntegerEncoded intEncoded = new IntegerEncoded(EIntegerEncoding.Trit, numberBitsPerValue); + + for (int i = 0; i < 5; i++) + { + intEncoded.BitValue = m[i]; + intEncoded.TritValue = encodings[i]; + + listIntegerEncoded.Add(ref intEncoded); + } + } + + public static void DecodeQuintBlock( + ref BitStream128 bitStream, + ref IntegerSequence listIntegerEncoded, + int numberBitsPerValue) + { + ReadOnlySpan<byte> interleavedBits = new byte[] { 3, 2, 2 }; + + // Implement the algorithm in section C.2.12 + Span<int> m = stackalloc int[3]; + ulong encoded = 0; + int encodedBitsRead = 0; + + for (int i = 0; i < m.Length; i++) + { + m[i] = bitStream.ReadBits(numberBitsPerValue); + + uint encodedBits = (uint)bitStream.ReadBits(interleavedBits[i]); + + encoded |= encodedBits << encodedBitsRead; + encodedBitsRead += interleavedBits[i]; + } + + ReadOnlySpan<byte> encodings = GetQuintEncoding((int)encoded); + + for (int i = 0; i < 3; i++) + { + IntegerEncoded intEncoded = new IntegerEncoded(EIntegerEncoding.Quint, numberBitsPerValue) + { + BitValue = m[i], + QuintValue = encodings[i] + }; + + listIntegerEncoded.Add(ref intEncoded); + } + } + + public static void DecodeIntegerSequence( + ref IntegerSequence decodeIntegerSequence, + ref BitStream128 bitStream, + int maxRange, + int numberValues) + { + // Determine encoding parameters + IntegerEncoded intEncoded = CreateEncoding(maxRange); + + // Start decoding + int numberValuesDecoded = 0; + while (numberValuesDecoded < numberValues) + { + switch (intEncoded.GetEncoding()) + { + case EIntegerEncoding.Quint: + { + DecodeQuintBlock(ref bitStream, ref decodeIntegerSequence, intEncoded.NumberBits); + numberValuesDecoded += 3; + + break; + } + + case EIntegerEncoding.Trit: + { + DecodeTritBlock(ref bitStream, ref decodeIntegerSequence, intEncoded.NumberBits); + numberValuesDecoded += 5; + + break; + } + + case EIntegerEncoding.JustBits: + { + intEncoded.BitValue = bitStream.ReadBits(intEncoded.NumberBits); + decodeIntegerSequence.Add(ref intEncoded); + numberValuesDecoded++; + + break; + } + } + } + } + + private static ReadOnlySpan<byte> GetTritEncoding(int index) + { + return TritEncodings.Slice(index * 5, 5); + } + + private static ReadOnlySpan<byte> GetQuintEncoding(int index) + { + return QuintEncodings.Slice(index * 3, 3); + } + + private static ReadOnlySpan<byte> TritEncodings => new byte[] + { + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, + 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, + 2, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, + 1, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 0, 0, + 0, 2, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0, 0, + 2, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, + 2, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 1, 0, 0, + 1, 1, 1, 0, 0, 2, 1, 1, 0, 0, 1, 1, 2, 0, 0, + 0, 2, 1, 0, 0, 1, 2, 1, 0, 0, 2, 2, 1, 0, 0, + 2, 1, 2, 0, 0, 0, 0, 0, 2, 2, 1, 0, 0, 2, 2, + 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0, + 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 0, 2, 1, 0, + 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 2, 1, 0, 1, 0, + 1, 0, 2, 1, 0, 0, 2, 0, 1, 0, 1, 2, 0, 1, 0, + 2, 2, 0, 1, 0, 2, 0, 2, 1, 0, 0, 2, 2, 1, 0, + 1, 2, 2, 1, 0, 2, 2, 2, 1, 0, 2, 0, 2, 1, 0, + 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 2, 0, 1, 1, 0, + 0, 1, 2, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, + 2, 1, 1, 1, 0, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, + 1, 2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 1, 2, 1, 0, + 0, 1, 0, 2, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, + 1, 0, 2, 2, 2, 0, 0, 0, 2, 0, 1, 0, 0, 2, 0, + 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 1, 0, 2, 0, + 1, 1, 0, 2, 0, 2, 1, 0, 2, 0, 1, 0, 2, 2, 0, + 0, 2, 0, 2, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2, 0, + 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 1, 2, 2, 2, 0, + 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 0, 1, 2, 0, + 1, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0, 1, 2, 2, 0, + 0, 1, 1, 2, 0, 1, 1, 1, 2, 0, 2, 1, 1, 2, 0, + 1, 1, 2, 2, 0, 0, 2, 1, 2, 0, 1, 2, 1, 2, 0, + 2, 2, 1, 2, 0, 2, 1, 2, 2, 0, 0, 2, 0, 2, 2, + 1, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, + 0, 0, 0, 0, 2, 1, 0, 0, 0, 2, 2, 0, 0, 0, 2, + 0, 0, 2, 0, 2, 0, 1, 0, 0, 2, 1, 1, 0, 0, 2, + 2, 1, 0, 0, 2, 1, 0, 2, 0, 2, 0, 2, 0, 0, 2, + 1, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, + 0, 2, 2, 0, 2, 1, 2, 2, 0, 2, 2, 2, 2, 0, 2, + 2, 0, 2, 0, 2, 0, 0, 1, 0, 2, 1, 0, 1, 0, 2, + 2, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 1, 1, 0, 2, + 1, 1, 1, 0, 2, 2, 1, 1, 0, 2, 1, 1, 2, 0, 2, + 0, 2, 1, 0, 2, 1, 2, 1, 0, 2, 2, 2, 1, 0, 2, + 2, 1, 2, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 2, 0, 1, + 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 2, 1, 0, 0, 1, + 1, 0, 2, 0, 1, 0, 2, 0, 0, 1, 1, 2, 0, 0, 1, + 2, 2, 0, 0, 1, 2, 0, 2, 0, 1, 0, 2, 2, 0, 1, + 1, 2, 2, 0, 1, 2, 2, 2, 0, 1, 2, 0, 2, 0, 1, + 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1, + 0, 1, 2, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, + 2, 1, 1, 0, 1, 1, 1, 2, 0, 1, 0, 2, 1, 0, 1, + 1, 2, 1, 0, 1, 2, 2, 1, 0, 1, 2, 1, 2, 0, 1, + 0, 0, 1, 2, 2, 1, 0, 1, 2, 2, 2, 0, 1, 2, 2, + 0, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, + 2, 0, 0, 1, 1, 0, 0, 2, 1, 1, 0, 1, 0, 1, 1, + 1, 1, 0, 1, 1, 2, 1, 0, 1, 1, 1, 0, 2, 1, 1, + 0, 2, 0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 1, + 2, 0, 2, 1, 1, 0, 2, 2, 1, 1, 1, 2, 2, 1, 1, + 2, 2, 2, 1, 1, 2, 0, 2, 1, 1, 0, 0, 1, 1, 1, + 1, 0, 1, 1, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, + 1, 1, 2, 1, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 1, + 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 1, 2, 2, + 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, + 0, 0, 0, 2, 1, 1, 0, 0, 2, 1, 2, 0, 0, 2, 1, + 0, 0, 2, 2, 1, 0, 1, 0, 2, 1, 1, 1, 0, 2, 1, + 2, 1, 0, 2, 1, 1, 0, 2, 2, 1, 0, 2, 0, 2, 1, + 1, 2, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 2, 2, 1, + 0, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, + 2, 0, 2, 2, 1, 0, 0, 1, 2, 1, 1, 0, 1, 2, 1, + 2, 0, 1, 2, 1, 0, 1, 2, 2, 1, 0, 1, 1, 2, 1, + 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, + 0, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, + 2, 1, 2, 2, 1, 0, 2, 1, 2, 2, 1, 2, 1, 2, 2, + 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 0, 0, 0, 1, 2, + 1, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 0, 2, 1, 2, + 0, 1, 0, 1, 2, 1, 1, 0, 1, 2, 2, 1, 0, 1, 2, + 1, 0, 2, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 2, + 2, 2, 0, 1, 2, 2, 0, 2, 1, 2, 0, 2, 2, 1, 2, + 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 0, 2, 1, 2, + 0, 0, 1, 1, 2, 1, 0, 1, 1, 2, 2, 0, 1, 1, 2, + 0, 1, 2, 1, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 2, + 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 0, 2, 1, 1, 2, + 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, + 0, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 1, 2, 2, 2 + }; + + private static ReadOnlySpan<byte> QuintEncodings => new byte[] + { + 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, + 0, 4, 0, 4, 4, 0, 4, 4, 4, 0, 1, 0, 1, 1, 0, + 2, 1, 0, 3, 1, 0, 4, 1, 0, 1, 4, 0, 4, 4, 1, + 4, 4, 4, 0, 2, 0, 1, 2, 0, 2, 2, 0, 3, 2, 0, + 4, 2, 0, 2, 4, 0, 4, 4, 2, 4, 4, 4, 0, 3, 0, + 1, 3, 0, 2, 3, 0, 3, 3, 0, 4, 3, 0, 3, 4, 0, + 4, 4, 3, 4, 4, 4, 0, 0, 1, 1, 0, 1, 2, 0, 1, + 3, 0, 1, 4, 0, 1, 0, 4, 1, 4, 0, 4, 0, 4, 4, + 0, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, + 1, 4, 1, 4, 1, 4, 1, 4, 4, 0, 2, 1, 1, 2, 1, + 2, 2, 1, 3, 2, 1, 4, 2, 1, 2, 4, 1, 4, 2, 4, + 2, 4, 4, 0, 3, 1, 1, 3, 1, 2, 3, 1, 3, 3, 1, + 4, 3, 1, 3, 4, 1, 4, 3, 4, 3, 4, 4, 0, 0, 2, + 1, 0, 2, 2, 0, 2, 3, 0, 2, 4, 0, 2, 0, 4, 2, + 2, 0, 4, 3, 0, 4, 0, 1, 2, 1, 1, 2, 2, 1, 2, + 3, 1, 2, 4, 1, 2, 1, 4, 2, 2, 1, 4, 3, 1, 4, + 0, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 4, 2, 2, + 2, 4, 2, 2, 2, 4, 3, 2, 4, 0, 3, 2, 1, 3, 2, + 2, 3, 2, 3, 3, 2, 4, 3, 2, 3, 4, 2, 2, 3, 4, + 3, 3, 4, 0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 3, + 4, 0, 3, 0, 4, 3, 0, 0, 4, 1, 0, 4, 0, 1, 3, + 1, 1, 3, 2, 1, 3, 3, 1, 3, 4, 1, 3, 1, 4, 3, + 0, 1, 4, 1, 1, 4, 0, 2, 3, 1, 2, 3, 2, 2, 3, + 3, 2, 3, 4, 2, 3, 2, 4, 3, 0, 2, 4, 1, 2, 4, + 0, 3, 3, 1, 3, 3, 2, 3, 3, 3, 3, 3, 4, 3, 3, + 3, 4, 3, 0, 3, 4, 1, 3, 4 + }; + } +} diff --git a/src/Ryujinx.Graphics.Texture/Astc/IntegerSequence.cs b/src/Ryujinx.Graphics.Texture/Astc/IntegerSequence.cs new file mode 100644 index 00000000..367b6809 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Astc/IntegerSequence.cs @@ -0,0 +1,31 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture.Astc +{ + [StructLayout(LayoutKind.Sequential, Size = IntegerEncoded.StructSize * Capacity + sizeof(int))] + internal struct IntegerSequence + { + private const int Capacity = 100; + + private int _length; + private IntegerEncoded _start; + + public Span<IntegerEncoded> List => MemoryMarshal.CreateSpan(ref _start, _length); + + public void Reset() => _length = 0; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Add(ref IntegerEncoded item) + { + Debug.Assert(_length < Capacity); + + int oldLength = _length; + _length++; + + List[oldLength] = item; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/BC6Decoder.cs b/src/Ryujinx.Graphics.Texture/BC6Decoder.cs new file mode 100644 index 00000000..819bf022 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/BC6Decoder.cs @@ -0,0 +1,819 @@ +using Ryujinx.Graphics.Texture.Utils; +using System; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture +{ + static class BC6Decoder + { + private const int HalfOne = 0x3C00; + + public static void Decode(Span<byte> output, ReadOnlySpan<byte> data, int width, int height, bool signed) + { + ReadOnlySpan<Block> blocks = MemoryMarshal.Cast<byte, Block>(data); + + Span<ulong> output64 = MemoryMarshal.Cast<byte, ulong>(output); + + int wInBlocks = (width + 3) / 4; + int hInBlocks = (height + 3) / 4; + + for (int y = 0; y < hInBlocks; y++) + { + int y2 = y * 4; + int bh = Math.Min(4, height - y2); + + for (int x = 0; x < wInBlocks; x++) + { + int x2 = x * 4; + int bw = Math.Min(4, width - x2); + + DecodeBlock(blocks[y * wInBlocks + x], output64.Slice(y2 * width + x2), bw, bh, width, signed); + } + } + } + + private static void DecodeBlock(Block block, Span<ulong> output, int w, int h, int width, bool signed) + { + int mode = (int)(block.Low & 3); + if ((mode & 2) != 0) + { + mode = (int)(block.Low & 0x1f); + } + + Span<RgbaColor32> endPoints = stackalloc RgbaColor32[4]; + int subsetCount = DecodeEndPoints(ref block, endPoints, mode, signed); + if (subsetCount == 0) + { + // Mode is invalid, the spec mandates that hardware fills the block with + // a opaque black color. + for (int ty = 0; ty < h; ty++) + { + int baseOffs = ty * width; + + for (int tx = 0; tx < w; tx++) + { + output[baseOffs + tx] = (ulong)HalfOne << 48; + } + } + + return; + } + + int partition; + int indexBitCount; + ulong indices; + + if (subsetCount > 1) + { + partition = (int)((block.High >> 13) & 0x1F); + indexBitCount = 3; + + int fixUpIndex = BC67Tables.FixUpIndices[subsetCount - 1][partition][1] * 3; + ulong lowMask = (ulong.MaxValue >> (65 - fixUpIndex)) << 3; + ulong highMask = ulong.MaxValue << (fixUpIndex + 3); + + indices = ((block.High >> 16) & highMask) | ((block.High >> 17) & lowMask) | ((block.High >> 18) & 3); + } + else + { + partition = 0; + indexBitCount = 4; + indices = (block.High & ~0xFUL) | ((block.High >> 1) & 7); + } + + ulong indexMask = (1UL << indexBitCount) - 1; + + for (int ty = 0; ty < h; ty++) + { + int baseOffs = ty * width; + + for (int tx = 0; tx < w; tx++) + { + int offs = baseOffs + tx; + int index = (int)(indices & indexMask); + int endPointBase = BC67Tables.PartitionTable[subsetCount - 1][partition][ty * 4 + tx] << 1; + + RgbaColor32 color1 = endPoints[endPointBase]; + RgbaColor32 color2 = endPoints[endPointBase + 1]; + + RgbaColor32 color = BC67Utils.Interpolate(color1, color2, index, indexBitCount); + + output[offs] = + (ulong)FinishUnquantize(color.R, signed) | + ((ulong)FinishUnquantize(color.G, signed) << 16) | + ((ulong)FinishUnquantize(color.B, signed) << 32) | + ((ulong)HalfOne << 48); + + indices >>= indexBitCount; + } + } + } + + private static int DecodeEndPoints(ref Block block, Span<RgbaColor32> endPoints, int mode, bool signed) + { + ulong low = block.Low; + ulong high = block.High; + + int r0 = 0, g0 = 0, b0 = 0, r1 = 0, g1 = 0, b1 = 0, r2 = 0, g2 = 0, b2 = 0, r3 = 0, g3 = 0, b3 = 0; + int subsetCount; + + switch (mode) + { + case 0: + r0 = (int)(low >> 5) & 0x3FF; + g0 = (int)(low >> 15) & 0x3FF; + b0 = (int)(low >> 25) & 0x3FF; + + if (signed) + { + r0 = SignExtend(r0, 10); + g0 = SignExtend(g0, 10); + b0 = SignExtend(b0, 10); + } + + r1 = r0 + SignExtend((int)(low >> 35), 5); + g1 = g0 + SignExtend((int)(low >> 45), 5); + b1 = b0 + SignExtend((int)(low >> 55), 5); + + r2 = r0 + SignExtend((int)(high >> 1), 5); + g2 = g0 + SignExtend((int)(((low << 2) & 0x10) | ((low >> 41) & 0xF)), 5); + b2 = b0 + SignExtend((int)(((low << 1) & 0x10) | ((high << 3) & 0x08) | (low >> 61)), 5); + + r3 = r0 + SignExtend((int)(high >> 7), 5); + g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5); + b3 = b0 + SignExtend((int)( + ((low) & 0x10) | + ((high >> 9) & 0x08) | + ((high >> 4) & 0x04) | + ((low >> 59) & 0x02) | + ((low >> 50) & 0x01)), 5); + + r0 = Unquantize(r0, 10, signed); + g0 = Unquantize(g0, 10, signed); + b0 = Unquantize(b0, 10, signed); + + r1 = Unquantize(r1 & 0x3FF, 10, signed); + g1 = Unquantize(g1 & 0x3FF, 10, signed); + b1 = Unquantize(b1 & 0x3FF, 10, signed); + + r2 = Unquantize(r2 & 0x3FF, 10, signed); + g2 = Unquantize(g2 & 0x3FF, 10, signed); + b2 = Unquantize(b2 & 0x3FF, 10, signed); + + r3 = Unquantize(r3 & 0x3FF, 10, signed); + g3 = Unquantize(g3 & 0x3FF, 10, signed); + b3 = Unquantize(b3 & 0x3FF, 10, signed); + + subsetCount = 2; + break; + case 1: + r0 = (int)(low >> 5) & 0x7F; + g0 = (int)(low >> 15) & 0x7F; + b0 = (int)(low >> 25) & 0x7F; + + if (signed) + { + r0 = SignExtend(r0, 7); + g0 = SignExtend(g0, 7); + b0 = SignExtend(b0, 7); + } + + r1 = r0 + SignExtend((int)(low >> 35), 6); + g1 = g0 + SignExtend((int)(low >> 45), 6); + b1 = b0 + SignExtend((int)(low >> 55), 6); + + r2 = r0 + SignExtend((int)(high >> 1), 6); + g2 = g0 + SignExtend((int)(((low << 3) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0x0F)), 6); + b2 = b0 + SignExtend((int)( + ((low >> 17) & 0x20) | + ((low >> 10) & 0x10) | + ((high << 3) & 0x08) | + (low >> 61)), 6); + + r3 = r0 + SignExtend((int)(high >> 7), 6); + g3 = g0 + SignExtend((int)(((low << 1) & 0x30) | ((low >> 51) & 0xF)), 6); + b3 = b0 + SignExtend((int)( + ((low >> 28) & 0x20) | + ((low >> 30) & 0x10) | + ((low >> 29) & 0x08) | + ((low >> 21) & 0x04) | + ((low >> 12) & 0x03)), 6); + + r0 = Unquantize(r0, 7, signed); + g0 = Unquantize(g0, 7, signed); + b0 = Unquantize(b0, 7, signed); + + r1 = Unquantize(r1 & 0x7F, 7, signed); + g1 = Unquantize(g1 & 0x7F, 7, signed); + b1 = Unquantize(b1 & 0x7F, 7, signed); + + r2 = Unquantize(r2 & 0x7F, 7, signed); + g2 = Unquantize(g2 & 0x7F, 7, signed); + b2 = Unquantize(b2 & 0x7F, 7, signed); + + r3 = Unquantize(r3 & 0x7F, 7, signed); + g3 = Unquantize(g3 & 0x7F, 7, signed); + b3 = Unquantize(b3 & 0x7F, 7, signed); + + subsetCount = 2; + break; + case 2: + r0 = (int)(((low >> 30) & 0x400) | ((low >> 5) & 0x3FF)); + g0 = (int)(((low >> 39) & 0x400) | ((low >> 15) & 0x3FF)); + b0 = (int)(((low >> 49) & 0x400) | ((low >> 25) & 0x3FF)); + + if (signed) + { + r0 = SignExtend(r0, 11); + g0 = SignExtend(g0, 11); + b0 = SignExtend(b0, 11); + } + + r1 = r0 + SignExtend((int)(low >> 35), 5); + g1 = g0 + SignExtend((int)(low >> 45), 4); + b1 = b0 + SignExtend((int)(low >> 55), 4); + + r2 = r0 + SignExtend((int)(high >> 1), 5); + g2 = g0 + SignExtend((int)(low >> 41), 4); + b2 = b0 + SignExtend((int)(((high << 3) & 8) | (low >> 61)), 4); + + r3 = r0 + SignExtend((int)(high >> 7), 5); + g3 = g0 + SignExtend((int)(low >> 51), 4); + b3 = b0 + SignExtend((int)( + ((high >> 9) & 8) | + ((high >> 4) & 4) | + ((low >> 59) & 2) | + ((low >> 50) & 1)), 4); + + r0 = Unquantize(r0, 11, signed); + g0 = Unquantize(g0, 11, signed); + b0 = Unquantize(b0, 11, signed); + + r1 = Unquantize(r1 & 0x7FF, 11, signed); + g1 = Unquantize(g1 & 0x7FF, 11, signed); + b1 = Unquantize(b1 & 0x7FF, 11, signed); + + r2 = Unquantize(r2 & 0x7FF, 11, signed); + g2 = Unquantize(g2 & 0x7FF, 11, signed); + b2 = Unquantize(b2 & 0x7FF, 11, signed); + + r3 = Unquantize(r3 & 0x7FF, 11, signed); + g3 = Unquantize(g3 & 0x7FF, 11, signed); + b3 = Unquantize(b3 & 0x7FF, 11, signed); + + subsetCount = 2; + break; + case 3: + r0 = (int)(low >> 5) & 0x3FF; + g0 = (int)(low >> 15) & 0x3FF; + b0 = (int)(low >> 25) & 0x3FF; + + r1 = (int)(low >> 35) & 0x3FF; + g1 = (int)(low >> 45) & 0x3FF; + b1 = (int)(((high << 9) & 0x200) | (low >> 55)); + + if (signed) + { + r0 = SignExtend(r0, 10); + g0 = SignExtend(g0, 10); + b0 = SignExtend(b0, 10); + + r1 = SignExtend(r1, 10); + g1 = SignExtend(g1, 10); + b1 = SignExtend(b1, 10); + } + + r0 = Unquantize(r0, 10, signed); + g0 = Unquantize(g0, 10, signed); + b0 = Unquantize(b0, 10, signed); + + r1 = Unquantize(r1, 10, signed); + g1 = Unquantize(g1, 10, signed); + b1 = Unquantize(b1, 10, signed); + + subsetCount = 1; + break; + case 6: + r0 = (int)(((low >> 29) & 0x400) | ((low >> 5) & 0x3FF)); + g0 = (int)(((low >> 40) & 0x400) | ((low >> 15) & 0x3FF)); + b0 = (int)(((low >> 49) & 0x400) | ((low >> 25) & 0x3FF)); + + if (signed) + { + r0 = SignExtend(r0, 11); + g0 = SignExtend(g0, 11); + b0 = SignExtend(b0, 11); + } + + r1 = r0 + SignExtend((int)(low >> 35), 4); + g1 = g0 + SignExtend((int)(low >> 45), 5); + b1 = b0 + SignExtend((int)(low >> 55), 4); + + r2 = r0 + SignExtend((int)(high >> 1), 4); + g2 = g0 + SignExtend((int)(((high >> 7) & 0x10) | ((low >> 41) & 0x0F)), 5); + b2 = b0 + SignExtend((int)(((high << 3) & 0x08) | ((low >> 61))), 4); + + r3 = r0 + SignExtend((int)(high >> 7), 4); + g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0x0F)), 5); + b3 = b0 + SignExtend((int)( + ((high >> 9) & 8) | + ((high >> 4) & 4) | + ((low >> 59) & 2) | + ((high >> 5) & 1)), 4); + + r0 = Unquantize(r0, 11, signed); + g0 = Unquantize(g0, 11, signed); + b0 = Unquantize(b0, 11, signed); + + r1 = Unquantize(r1 & 0x7FF, 11, signed); + g1 = Unquantize(g1 & 0x7FF, 11, signed); + b1 = Unquantize(b1 & 0x7FF, 11, signed); + + r2 = Unquantize(r2 & 0x7FF, 11, signed); + g2 = Unquantize(g2 & 0x7FF, 11, signed); + b2 = Unquantize(b2 & 0x7FF, 11, signed); + + r3 = Unquantize(r3 & 0x7FF, 11, signed); + g3 = Unquantize(g3 & 0x7FF, 11, signed); + b3 = Unquantize(b3 & 0x7FF, 11, signed); + + subsetCount = 2; + break; + case 7: + r0 = (int)(((low >> 34) & 0x400) | ((low >> 5) & 0x3FF)); + g0 = (int)(((low >> 44) & 0x400) | ((low >> 15) & 0x3FF)); + b0 = (int)(((high << 10) & 0x400) | ((low >> 25) & 0x3FF)); + + if (signed) + { + r0 = SignExtend(r0, 11); + g0 = SignExtend(g0, 11); + b0 = SignExtend(b0, 11); + } + + r1 = (r0 + SignExtend((int)(low >> 35), 9)) & 0x7FF; + g1 = (g0 + SignExtend((int)(low >> 45), 9)) & 0x7FF; + b1 = (b0 + SignExtend((int)(low >> 55), 9)) & 0x7FF; + + r0 = Unquantize(r0, 11, signed); + g0 = Unquantize(g0, 11, signed); + b0 = Unquantize(b0, 11, signed); + + r1 = Unquantize(r1, 11, signed); + g1 = Unquantize(g1, 11, signed); + b1 = Unquantize(b1, 11, signed); + + subsetCount = 1; + break; + case 10: + r0 = (int)(((low >> 29) & 0x400) | ((low >> 5) & 0x3FF)); + g0 = (int)(((low >> 39) & 0x400) | ((low >> 15) & 0x3FF)); + b0 = (int)(((low >> 50) & 0x400) | ((low >> 25) & 0x3FF)); + + if (signed) + { + r0 = SignExtend(r0, 11); + g0 = SignExtend(g0, 11); + b0 = SignExtend(b0, 11); + } + + r1 = r0 + SignExtend((int)(low >> 35), 4); + g1 = g0 + SignExtend((int)(low >> 45), 4); + b1 = b0 + SignExtend((int)(low >> 55), 5); + + r2 = r0 + SignExtend((int)(high >> 1), 4); + g2 = g0 + SignExtend((int)(low >> 41), 4); + b2 = b0 + SignExtend((int)(((low >> 36) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5); + + r3 = r0 + SignExtend((int)(high >> 7), 4); + g3 = g0 + SignExtend((int)(low >> 51), 4); + b3 = b0 + SignExtend((int)( + ((high >> 7) & 0x10) | + ((high >> 9) & 0x08) | + ((high >> 4) & 0x06) | + ((low >> 50) & 0x01)), 5); + + r0 = Unquantize(r0, 11, signed); + g0 = Unquantize(g0, 11, signed); + b0 = Unquantize(b0, 11, signed); + + r1 = Unquantize(r1 & 0x7FF, 11, signed); + g1 = Unquantize(g1 & 0x7FF, 11, signed); + b1 = Unquantize(b1 & 0x7FF, 11, signed); + + r2 = Unquantize(r2 & 0x7FF, 11, signed); + g2 = Unquantize(g2 & 0x7FF, 11, signed); + b2 = Unquantize(b2 & 0x7FF, 11, signed); + + r3 = Unquantize(r3 & 0x7FF, 11, signed); + g3 = Unquantize(g3 & 0x7FF, 11, signed); + b3 = Unquantize(b3 & 0x7FF, 11, signed); + + subsetCount = 2; + break; + case 11: + r0 = (int)(((low >> 32) & 0x800) | ((low >> 34) & 0x400) | ((low >> 5) & 0x3FF)); + g0 = (int)(((low >> 42) & 0x800) | ((low >> 44) & 0x400) | ((low >> 15) & 0x3FF)); + b0 = (int)(((low >> 52) & 0x800) | ((high << 10) & 0x400) | ((low >> 25) & 0x3FF)); + + if (signed) + { + r0 = SignExtend(r0, 12); + g0 = SignExtend(g0, 12); + b0 = SignExtend(b0, 12); + } + + r1 = (r0 + SignExtend((int)(low >> 35), 8)) & 0xFFF; + g1 = (g0 + SignExtend((int)(low >> 45), 8)) & 0xFFF; + b1 = (b0 + SignExtend((int)(low >> 55), 8)) & 0xFFF; + + r0 = Unquantize(r0, 12, signed); + g0 = Unquantize(g0, 12, signed); + b0 = Unquantize(b0, 12, signed); + + r1 = Unquantize(r1, 12, signed); + g1 = Unquantize(g1, 12, signed); + b1 = Unquantize(b1, 12, signed); + + subsetCount = 1; + break; + case 14: + r0 = (int)(low >> 5) & 0x1FF; + g0 = (int)(low >> 15) & 0x1FF; + b0 = (int)(low >> 25) & 0x1FF; + + if (signed) + { + r0 = SignExtend(r0, 9); + g0 = SignExtend(g0, 9); + b0 = SignExtend(b0, 9); + } + + r1 = r0 + SignExtend((int)(low >> 35), 5); + g1 = g0 + SignExtend((int)(low >> 45), 5); + b1 = b0 + SignExtend((int)(low >> 55), 5); + + r2 = r0 + SignExtend((int)(high >> 1), 5); + g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5); + b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5); + + r3 = r0 + SignExtend((int)(high >> 7), 5); + g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5); + b3 = b0 + SignExtend((int)( + ((low >> 30) & 0x10) | + ((high >> 9) & 0x08) | + ((high >> 4) & 0x04) | + ((low >> 59) & 0x02) | + ((low >> 50) & 0x01)), 5); + + r0 = Unquantize(r0, 9, signed); + g0 = Unquantize(g0, 9, signed); + b0 = Unquantize(b0, 9, signed); + + r1 = Unquantize(r1 & 0x1FF, 9, signed); + g1 = Unquantize(g1 & 0x1FF, 9, signed); + b1 = Unquantize(b1 & 0x1FF, 9, signed); + + r2 = Unquantize(r2 & 0x1FF, 9, signed); + g2 = Unquantize(g2 & 0x1FF, 9, signed); + b2 = Unquantize(b2 & 0x1FF, 9, signed); + + r3 = Unquantize(r3 & 0x1FF, 9, signed); + g3 = Unquantize(g3 & 0x1FF, 9, signed); + b3 = Unquantize(b3 & 0x1FF, 9, signed); + + subsetCount = 2; + break; + case 15: + r0 = (BitReverse6((int)(low >> 39) & 0x3F) << 10) | ((int)(low >> 5) & 0x3FF); + g0 = (BitReverse6((int)(low >> 49) & 0x3F) << 10) | ((int)(low >> 15) & 0x3FF); + b0 = ((BitReverse6((int)(low >> 59)) | (int)(high & 1)) << 10) | ((int)(low >> 25) & 0x3FF); + + if (signed) + { + r0 = SignExtend(r0, 16); + g0 = SignExtend(g0, 16); + b0 = SignExtend(b0, 16); + } + + r1 = (r0 + SignExtend((int)(low >> 35), 4)) & 0xFFFF; + g1 = (g0 + SignExtend((int)(low >> 45), 4)) & 0xFFFF; + b1 = (b0 + SignExtend((int)(low >> 55), 4)) & 0xFFFF; + + subsetCount = 1; + break; + case 18: + r0 = (int)(low >> 5) & 0xFF; + g0 = (int)(low >> 15) & 0xFF; + b0 = (int)(low >> 25) & 0xFF; + + if (signed) + { + r0 = SignExtend(r0, 8); + g0 = SignExtend(g0, 8); + b0 = SignExtend(b0, 8); + } + + r1 = r0 + SignExtend((int)(low >> 35), 6); + g1 = g0 + SignExtend((int)(low >> 45), 5); + b1 = b0 + SignExtend((int)(low >> 55), 5); + + r2 = r0 + SignExtend((int)(high >> 1), 6); + g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5); + b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 8) | (low >> 61)), 5); + + r3 = r0 + SignExtend((int)(high >> 7), 6); + g3 = g0 + SignExtend((int)(((low >> 9) & 0x10) | ((low >> 51) & 0xF)), 5); + b3 = b0 + SignExtend((int)( + ((low >> 30) & 0x18) | + ((low >> 21) & 0x04) | + ((low >> 59) & 0x02) | + ((low >> 50) & 0x01)), 5); + + r0 = Unquantize(r0, 8, signed); + g0 = Unquantize(g0, 8, signed); + b0 = Unquantize(b0, 8, signed); + + r1 = Unquantize(r1 & 0xFF, 8, signed); + g1 = Unquantize(g1 & 0xFF, 8, signed); + b1 = Unquantize(b1 & 0xFF, 8, signed); + + r2 = Unquantize(r2 & 0xFF, 8, signed); + g2 = Unquantize(g2 & 0xFF, 8, signed); + b2 = Unquantize(b2 & 0xFF, 8, signed); + + r3 = Unquantize(r3 & 0xFF, 8, signed); + g3 = Unquantize(g3 & 0xFF, 8, signed); + b3 = Unquantize(b3 & 0xFF, 8, signed); + + subsetCount = 2; + break; + case 22: + r0 = (int)(low >> 5) & 0xFF; + g0 = (int)(low >> 15) & 0xFF; + b0 = (int)(low >> 25) & 0xFF; + + if (signed) + { + r0 = SignExtend(r0, 8); + g0 = SignExtend(g0, 8); + b0 = SignExtend(b0, 8); + } + + r1 = r0 + SignExtend((int)(low >> 35), 5); + g1 = g0 + SignExtend((int)(low >> 45), 6); + b1 = b0 + SignExtend((int)(low >> 55), 5); + + r2 = r0 + SignExtend((int)(high >> 1), 5); + g2 = g0 + SignExtend((int)(((low >> 18) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 6); + b2 = b0 + SignExtend((int)(((low >> 10) & 0x10) | ((high << 3) & 0x08) | (low >> 61)), 5); + + r3 = r0 + SignExtend((int)(high >> 7), 5); + g3 = g0 + SignExtend((int)(((low >> 28) & 0x20) | ((low >> 36) & 0x10) | ((low >> 51) & 0x0F)), 6); + b3 = b0 + SignExtend((int)( + ((low >> 30) & 0x10) | + ((high >> 9) & 0x08) | + ((high >> 4) & 0x04) | + ((low >> 59) & 0x02) | + ((low >> 13) & 0x01)), 5); + + r0 = Unquantize(r0, 8, signed); + g0 = Unquantize(g0, 8, signed); + b0 = Unquantize(b0, 8, signed); + + r1 = Unquantize(r1 & 0xFF, 8, signed); + g1 = Unquantize(g1 & 0xFF, 8, signed); + b1 = Unquantize(b1 & 0xFF, 8, signed); + + r2 = Unquantize(r2 & 0xFF, 8, signed); + g2 = Unquantize(g2 & 0xFF, 8, signed); + b2 = Unquantize(b2 & 0xFF, 8, signed); + + r3 = Unquantize(r3 & 0xFF, 8, signed); + g3 = Unquantize(g3 & 0xFF, 8, signed); + b3 = Unquantize(b3 & 0xFF, 8, signed); + + subsetCount = 2; + break; + case 26: + r0 = (int)(low >> 5) & 0xFF; + g0 = (int)(low >> 15) & 0xFF; + b0 = (int)(low >> 25) & 0xFF; + + if (signed) + { + r0 = SignExtend(r0, 8); + g0 = SignExtend(g0, 8); + b0 = SignExtend(b0, 8); + } + + r1 = r0 + SignExtend((int)(low >> 35), 5); + g1 = g0 + SignExtend((int)(low >> 45), 5); + b1 = b0 + SignExtend((int)(low >> 55), 6); + + r2 = r0 + SignExtend((int)(high >> 1), 5); + g2 = g0 + SignExtend((int)(((low >> 20) & 0x10) | ((low >> 41) & 0xF)), 5); + b2 = b0 + SignExtend((int)( + ((low >> 18) & 0x20) | + ((low >> 10) & 0x10) | + ((high << 3) & 0x08) | + (low >> 61)), 6); + + r3 = r0 + SignExtend((int)(high >> 7), 5); + g3 = g0 + SignExtend((int)(((low >> 36) & 0x10) | ((low >> 51) & 0xF)), 5); + b3 = b0 + SignExtend((int)( + ((low >> 28) & 0x20) | + ((low >> 30) & 0x10) | + ((high >> 9) & 0x08) | + ((high >> 4) & 0x04) | + ((low >> 12) & 0x02) | + ((low >> 50) & 0x01)), 6); + + r0 = Unquantize(r0, 8, signed); + g0 = Unquantize(g0, 8, signed); + b0 = Unquantize(b0, 8, signed); + + r1 = Unquantize(r1 & 0xFF, 8, signed); + g1 = Unquantize(g1 & 0xFF, 8, signed); + b1 = Unquantize(b1 & 0xFF, 8, signed); + + r2 = Unquantize(r2 & 0xFF, 8, signed); + g2 = Unquantize(g2 & 0xFF, 8, signed); + b2 = Unquantize(b2 & 0xFF, 8, signed); + + r3 = Unquantize(r3 & 0xFF, 8, signed); + g3 = Unquantize(g3 & 0xFF, 8, signed); + b3 = Unquantize(b3 & 0xFF, 8, signed); + + subsetCount = 2; + break; + case 30: + r0 = (int)(low >> 5) & 0x3F; + g0 = (int)(low >> 15) & 0x3F; + b0 = (int)(low >> 25) & 0x3F; + + r1 = (int)(low >> 35) & 0x3F; + g1 = (int)(low >> 45) & 0x3F; + b1 = (int)(low >> 55) & 0x3F; + + r2 = (int)(high >> 1) & 0x3F; + g2 = (int)(((low >> 16) & 0x20) | ((low >> 20) & 0x10) | ((low >> 41) & 0xF)); + b2 = (int)(((low >> 17) & 0x20) | ((low >> 10) & 0x10) | ((high << 3) & 0x08) | (low >> 61)); + + r3 = (int)(high >> 7) & 0x3F; + g3 = (int)(((low >> 26) & 0x20) | ((low >> 7) & 0x10) | ((low >> 51) & 0xF)); + b3 = (int)( + ((low >> 28) & 0x20) | + ((low >> 30) & 0x10) | + ((low >> 29) & 0x08) | + ((low >> 21) & 0x04) | + ((low >> 12) & 0x03)); + + if (signed) + { + r0 = SignExtend(r0, 6); + g0 = SignExtend(g0, 6); + b0 = SignExtend(b0, 6); + + r1 = SignExtend(r1, 6); + g1 = SignExtend(g1, 6); + b1 = SignExtend(b1, 6); + + r2 = SignExtend(r2, 6); + g2 = SignExtend(g2, 6); + b2 = SignExtend(b2, 6); + + r3 = SignExtend(r3, 6); + g3 = SignExtend(g3, 6); + b3 = SignExtend(b3, 6); + } + + r0 = Unquantize(r0, 6, signed); + g0 = Unquantize(g0, 6, signed); + b0 = Unquantize(b0, 6, signed); + + r1 = Unquantize(r1, 6, signed); + g1 = Unquantize(g1, 6, signed); + b1 = Unquantize(b1, 6, signed); + + r2 = Unquantize(r2, 6, signed); + g2 = Unquantize(g2, 6, signed); + b2 = Unquantize(b2, 6, signed); + + r3 = Unquantize(r3, 6, signed); + g3 = Unquantize(g3, 6, signed); + b3 = Unquantize(b3, 6, signed); + + subsetCount = 2; + break; + default: + subsetCount = 0; + break; + } + + if (subsetCount > 0) + { + endPoints[0] = new RgbaColor32(r0, g0, b0, HalfOne); + endPoints[1] = new RgbaColor32(r1, g1, b1, HalfOne); + + if (subsetCount > 1) + { + endPoints[2] = new RgbaColor32(r2, g2, b2, HalfOne); + endPoints[3] = new RgbaColor32(r3, g3, b3, HalfOne); + } + } + + return subsetCount; + } + + private static int SignExtend(int value, int bits) + { + int shift = 32 - bits; + return (value << shift) >> shift; + } + + private static int Unquantize(int value, int bits, bool signed) + { + if (signed) + { + if (bits >= 16) + { + return value; + } + else + { + bool sign = value < 0; + + if (sign) + { + value = -value; + } + + if (value == 0) + { + return value; + } + else if (value >= ((1 << (bits - 1)) - 1)) + { + value = 0x7FFF; + } + else + { + value = ((value << 15) + 0x4000) >> (bits - 1); + } + + if (sign) + { + value = -value; + } + } + } + else + { + if (bits >= 15 || value == 0) + { + return value; + } + else if (value == ((1 << bits) - 1)) + { + return 0xFFFF; + } + else + { + return ((value << 16) + 0x8000) >> bits; + } + } + + return value; + } + + private static ushort FinishUnquantize(int value, bool signed) + { + if (signed) + { + value = value < 0 ? -((-value * 31) >> 5) : (value * 31) >> 5; + + int sign = 0; + if (value < 0) + { + sign = 0x8000; + value = -value; + } + + return (ushort)(sign | value); + } + else + { + return (ushort)((value * 31) >> 6); + } + } + + private static int BitReverse6(int value) + { + value = ((value >> 1) & 0x55) | ((value << 1) & 0xaa); + value = ((value >> 2) & 0x33) | ((value << 2) & 0xcc); + value = ((value >> 4) & 0x0f) | ((value << 4) & 0xf0); + return value >> 2; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/BC7Decoder.cs b/src/Ryujinx.Graphics.Texture/BC7Decoder.cs new file mode 100644 index 00000000..b865a559 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/BC7Decoder.cs @@ -0,0 +1,220 @@ +using Ryujinx.Graphics.Texture.Utils; +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture +{ + static class BC7Decoder + { + public static void Decode(Span<byte> output, ReadOnlySpan<byte> data, int width, int height) + { + ReadOnlySpan<Block> blocks = MemoryMarshal.Cast<byte, Block>(data); + + Span<uint> output32 = MemoryMarshal.Cast<byte, uint>(output); + + int wInBlocks = (width + 3) / 4; + int hInBlocks = (height + 3) / 4; + + for (int y = 0; y < hInBlocks; y++) + { + int y2 = y * 4; + int bh = Math.Min(4, height - y2); + + for (int x = 0; x < wInBlocks; x++) + { + int x2 = x * 4; + int bw = Math.Min(4, width - x2); + + DecodeBlock(blocks[y * wInBlocks + x], output32.Slice(y2 * width + x2), bw, bh, width); + } + } + } + + private static void DecodeBlock(Block block, Span<uint> output, int w, int h, int width) + { + int mode = BitOperations.TrailingZeroCount((byte)block.Low | 0x100); + if (mode == 8) + { + // Mode is invalid, the spec mandates that hardware fills the block with + // a transparent black color. + for (int ty = 0; ty < h; ty++) + { + int baseOffs = ty * width; + + for (int tx = 0; tx < w; tx++) + { + int offs = baseOffs + tx; + + output[offs] = 0; + } + } + + return; + } + + BC7ModeInfo modeInfo = BC67Tables.BC7ModeInfos[mode]; + + int offset = mode + 1; + int partition = (int)block.Decode(ref offset, modeInfo.PartitionBitCount); + int rotation = (int)block.Decode(ref offset, modeInfo.RotationBitCount); + int indexMode = (int)block.Decode(ref offset, modeInfo.IndexModeBitCount); + + Debug.Assert(partition < 64); + Debug.Assert(rotation < 4); + Debug.Assert(indexMode < 2); + + int endPointCount = modeInfo.SubsetCount * 2; + + Span<RgbaColor32> endPoints = stackalloc RgbaColor32[endPointCount]; + Span<byte> pValues = stackalloc byte[modeInfo.PBits]; + + endPoints.Fill(new RgbaColor32(0, 0, 0, 255)); + + for (int i = 0; i < endPointCount; i++) + { + endPoints[i].R = (int)block.Decode(ref offset, modeInfo.ColorDepth); + } + + for (int i = 0; i < endPointCount; i++) + { + endPoints[i].G = (int)block.Decode(ref offset, modeInfo.ColorDepth); + } + + for (int i = 0; i < endPointCount; i++) + { + endPoints[i].B = (int)block.Decode(ref offset, modeInfo.ColorDepth); + } + + if (modeInfo.AlphaDepth != 0) + { + for (int i = 0; i < endPointCount; i++) + { + endPoints[i].A = (int)block.Decode(ref offset, modeInfo.AlphaDepth); + } + } + + for (int i = 0; i < modeInfo.PBits; i++) + { + pValues[i] = (byte)block.Decode(ref offset, 1); + } + + for (int i = 0; i < endPointCount; i++) + { + int pBit = -1; + + if (modeInfo.PBits != 0) + { + int pIndex = (i * modeInfo.PBits) / endPointCount; + pBit = pValues[pIndex]; + } + + Unquantize(ref endPoints[i], modeInfo.ColorDepth, modeInfo.AlphaDepth, pBit); + } + + byte[] partitionTable = BC67Tables.PartitionTable[modeInfo.SubsetCount - 1][partition]; + byte[] fixUpTable = BC67Tables.FixUpIndices[modeInfo.SubsetCount - 1][partition]; + + Span<byte> colorIndices = stackalloc byte[16]; + + for (int i = 0; i < 16; i++) + { + byte subset = partitionTable[i]; + int bitCount = i == fixUpTable[subset] ? modeInfo.ColorIndexBitCount - 1 : modeInfo.ColorIndexBitCount; + + colorIndices[i] = (byte)block.Decode(ref offset, bitCount); + Debug.Assert(colorIndices[i] < 16); + } + + Span<byte> alphaIndices = stackalloc byte[16]; + + if (modeInfo.AlphaIndexBitCount != 0) + { + for (int i = 0; i < 16; i++) + { + int bitCount = i != 0 ? modeInfo.AlphaIndexBitCount : modeInfo.AlphaIndexBitCount - 1; + + alphaIndices[i] = (byte)block.Decode(ref offset, bitCount); + Debug.Assert(alphaIndices[i] < 16); + } + } + + for (int ty = 0; ty < h; ty++) + { + int baseOffs = ty * width; + + for (int tx = 0; tx < w; tx++) + { + int i = ty * 4 + tx; + + RgbaColor32 color; + + byte subset = partitionTable[i]; + + RgbaColor32 color1 = endPoints[subset * 2]; + RgbaColor32 color2 = endPoints[subset * 2 + 1]; + + if (modeInfo.AlphaIndexBitCount != 0) + { + if (indexMode == 0) + { + color = BC67Utils.Interpolate(color1, color2, colorIndices[i], alphaIndices[i], modeInfo.ColorIndexBitCount, modeInfo.AlphaIndexBitCount); + } + else + { + color = BC67Utils.Interpolate(color1, color2, alphaIndices[i], colorIndices[i], modeInfo.AlphaIndexBitCount, modeInfo.ColorIndexBitCount); + } + } + else + { + color = BC67Utils.Interpolate(color1, color2, colorIndices[i], colorIndices[i], modeInfo.ColorIndexBitCount, modeInfo.ColorIndexBitCount); + } + + if (rotation != 0) + { + int a = color.A; + + switch (rotation) + { + case 1: color.A = color.R; color.R = a; break; + case 2: color.A = color.G; color.G = a; break; + case 3: color.A = color.B; color.B = a; break; + } + } + + RgbaColor8 color8 = color.GetColor8(); + + output[baseOffs + tx] = color8.ToUInt32(); + } + } + } + + private static void Unquantize(ref RgbaColor32 color, int colorDepth, int alphaDepth, int pBit) + { + color.R = UnquantizeComponent(color.R, colorDepth, pBit); + color.G = UnquantizeComponent(color.G, colorDepth, pBit); + color.B = UnquantizeComponent(color.B, colorDepth, pBit); + color.A = alphaDepth != 0 ? UnquantizeComponent(color.A, alphaDepth, pBit) : 255; + } + + private static int UnquantizeComponent(int component, int bits, int pBit) + { + int shift = 8 - bits; + int value = component << shift; + + if (pBit >= 0) + { + Debug.Assert(pBit <= 1); + value |= value >> (bits + 1); + value |= pBit << (shift - 1); + } + else + { + value |= value >> bits; + } + + return value; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/BCnDecoder.cs b/src/Ryujinx.Graphics.Texture/BCnDecoder.cs new file mode 100644 index 00000000..b21fa4d1 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/BCnDecoder.cs @@ -0,0 +1,894 @@ +using Ryujinx.Common; +using System; +using System.Buffers.Binary; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Texture +{ + public static class BCnDecoder + { + private const int BlockWidth = 4; + private const int BlockHeight = 4; + + public static byte[] DecodeBC1(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; + } + + byte[] output = new byte[size]; + + Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4]; + + Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile); + Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output); + + Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile); + + Span<Vector128<byte>> outputLine0 = default; + Span<Vector128<byte>> outputLine1 = default; + Span<Vector128<byte>> outputLine2 = default; + Span<Vector128<byte>> outputLine3 = default; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < h; y++) + { + int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * width; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width)); + outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2)); + outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3)); + } + + for (int x = 0; x < w; x++) + { + int baseX = x * BlockWidth; + int copyWidth = Math.Min(BlockWidth, width - baseX); + + BC1DecodeTileRgb(tile, data); + + if ((copyWidth | copyHeight) == 4) + { + outputLine0[x] = tileAsVector128[0]; + outputLine1[x] = tileAsVector128[1]; + outputLine2[x] = tileAsVector128[2]; + outputLine3[x] = tileAsVector128[3]; + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; + + for (int tY = 0; tY < copyHeight; tY++) + { + tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); + } + } + + data = data.Slice(8); + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; + } + + byte[] output = new byte[size]; + + Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4]; + + Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile); + Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output); + + Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile); + + Span<Vector128<byte>> outputLine0 = default; + Span<Vector128<byte>> outputLine1 = default; + Span<Vector128<byte>> outputLine2 = default; + Span<Vector128<byte>> outputLine3 = default; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < h; y++) + { + int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * width; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width)); + outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2)); + outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3)); + } + + for (int x = 0; x < w; x++) + { + int baseX = x * BlockWidth; + int copyWidth = Math.Min(BlockWidth, width - baseX); + + BC23DecodeTileRgb(tile, data.Slice(8)); + + ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data); + + for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, block >>= 4) + { + tile[i] = (byte)((block & 0xf) | (block << 4)); + } + + if ((copyWidth | copyHeight) == 4) + { + outputLine0[x] = tileAsVector128[0]; + outputLine1[x] = tileAsVector128[1]; + outputLine2[x] = tileAsVector128[2]; + outputLine3[x] = tileAsVector128[3]; + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; + + for (int tY = 0; tY < copyHeight; tY++) + { + tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); + } + } + + data = data.Slice(16); + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; + } + + byte[] output = new byte[size]; + + Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4]; + Span<byte> rPal = stackalloc byte[8]; + + Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile); + Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output); + + Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile); + + Span<Vector128<byte>> outputLine0 = default; + Span<Vector128<byte>> outputLine1 = default; + Span<Vector128<byte>> outputLine2 = default; + Span<Vector128<byte>> outputLine3 = default; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < h; y++) + { + int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * width; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width)); + outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2)); + outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3)); + } + + for (int x = 0; x < w; x++) + { + int baseX = x * BlockWidth; + int copyWidth = Math.Min(BlockWidth, width - baseX); + + BC23DecodeTileRgb(tile, data.Slice(8)); + + ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data); + + rPal[0] = (byte)block; + rPal[1] = (byte)(block >> 8); + + BCnLerpAlphaUnorm(rPal); + BCnDecodeTileAlphaRgba(tile, rPal, block >> 16); + + if ((copyWidth | copyHeight) == 4) + { + outputLine0[x] = tileAsVector128[0]; + outputLine1[x] = tileAsVector128[1]; + outputLine2[x] = tileAsVector128[2]; + outputLine3[x] = tileAsVector128[3]; + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; + + for (int tY = 0; tY < copyHeight; tY++) + { + tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); + } + } + + data = data.Slice(16); + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += BitUtils.AlignUp(Math.Max(1, width >> l), 4) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers; + } + + // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned. + int alignedWidth = BitUtils.AlignUp(width, 4); + + byte[] output = new byte[size]; + Span<byte> outputSpan = new Span<byte>(output); + + ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data); + + Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight]; + Span<byte> rPal = stackalloc byte[8]; + + Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile); + + Span<uint> outputLine0 = default; + Span<uint> outputLine1 = default; + Span<uint> outputLine2 = default; + Span<uint> outputLine3 = default; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < h; y++) + { + int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth)); + outputLine2 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth * 2)); + outputLine3 = MemoryMarshal.Cast<byte, uint>(outputSpan.Slice(lineBaseOOffs + alignedWidth * 3)); + } + + for (int x = 0; x < w; x++) + { + int baseX = x * BlockWidth; + int copyWidth = Math.Min(BlockWidth, width - baseX); + + ulong block = data64[0]; + + rPal[0] = (byte)block; + rPal[1] = (byte)(block >> 8); + + if (signed) + { + BCnLerpAlphaSnorm(rPal); + } + else + { + BCnLerpAlphaUnorm(rPal); + } + + BCnDecodeTileAlpha(tile, rPal, block >> 16); + + if ((copyWidth | copyHeight) == 4) + { + outputLine0[x] = tileAsUint[0]; + outputLine1[x] = tileAsUint[1]; + outputLine2[x] = tileAsUint[2]; + outputLine3[x] = tileAsUint[3]; + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; + + for (int tY = 0; tY < copyHeight; tY++) + { + tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + alignedWidth * tY, copyWidth)); + } + } + + data64 = data64.Slice(1); + } + } + + imageBaseOOffs += alignedWidth * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + + alignedWidth = BitUtils.AlignUp(width, 4); + } + + return output; + } + + public static byte[] DecodeBC5(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += BitUtils.AlignUp(Math.Max(1, width >> l), 2) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 2; + } + + // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned. + int alignedWidth = BitUtils.AlignUp(width, 2); + + byte[] output = new byte[size]; + + ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data); + + Span<byte> rTile = stackalloc byte[BlockWidth * BlockHeight * 2]; + Span<byte> gTile = stackalloc byte[BlockWidth * BlockHeight * 2]; + Span<byte> rPal = stackalloc byte[8]; + Span<byte> gPal = stackalloc byte[8]; + + Span<ushort> outputAsUshort = MemoryMarshal.Cast<byte, ushort>(output); + + Span<uint> rTileAsUint = MemoryMarshal.Cast<byte, uint>(rTile); + Span<uint> gTileAsUint = MemoryMarshal.Cast<byte, uint>(gTile); + + Span<ulong> outputLine0 = default; + Span<ulong> outputLine1 = default; + Span<ulong> outputLine2 = default; + Span<ulong> outputLine3 = default; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < h; y++) + { + int baseY = y * BlockHeight; + int copyHeight = Math.Min(BlockHeight, height - baseY); + int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth; + + if (copyHeight == 4) + { + outputLine0 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs)); + outputLine1 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth)); + outputLine2 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth * 2)); + outputLine3 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort.Slice(lineBaseOOffs + alignedWidth * 3)); + } + + for (int x = 0; x < w; x++) + { + int baseX = x * BlockWidth; + int copyWidth = Math.Min(BlockWidth, width - baseX); + + ulong blockL = data64[0]; + ulong blockH = data64[1]; + + rPal[0] = (byte)blockL; + rPal[1] = (byte)(blockL >> 8); + gPal[0] = (byte)blockH; + gPal[1] = (byte)(blockH >> 8); + + if (signed) + { + BCnLerpAlphaSnorm(rPal); + BCnLerpAlphaSnorm(gPal); + } + else + { + BCnLerpAlphaUnorm(rPal); + BCnLerpAlphaUnorm(gPal); + } + + BCnDecodeTileAlpha(rTile, rPal, blockL >> 16); + BCnDecodeTileAlpha(gTile, gPal, blockH >> 16); + + if ((copyWidth | copyHeight) == 4) + { + outputLine0[x] = InterleaveBytes(rTileAsUint[0], gTileAsUint[0]); + outputLine1[x] = InterleaveBytes(rTileAsUint[1], gTileAsUint[1]); + outputLine2[x] = InterleaveBytes(rTileAsUint[2], gTileAsUint[2]); + outputLine3[x] = InterleaveBytes(rTileAsUint[3], gTileAsUint[3]); + } + else + { + int pixelBaseOOffs = lineBaseOOffs + baseX; + + for (int tY = 0; tY < copyHeight; tY++) + { + int line = pixelBaseOOffs + alignedWidth * tY; + + for (int tX = 0; tX < copyWidth; tX++) + { + int texel = tY * BlockWidth + tX; + + outputAsUshort[line + tX] = (ushort)(rTile[texel] | (gTile[texel] << 8)); + } + } + } + + data64 = data64.Slice(2); + } + } + + imageBaseOOffs += alignedWidth * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + + alignedWidth = BitUtils.AlignUp(width, 2); + } + + return output; + } + + public static byte[] DecodeBC6(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 8; + } + + byte[] output = new byte[size]; + + int inputOffset = 0; + int outputOffset = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + BC6Decoder.Decode(output.AsSpan().Slice(outputOffset), data.Slice(inputOffset), width, height, signed); + + inputOffset += w * h * 16; + outputOffset += width * height * 8; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodeBC7(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; + } + + byte[] output = new byte[size]; + + int inputOffset = 0; + int outputOffset = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + BC7Decoder.Decode(output.AsSpan().Slice(outputOffset), data.Slice(inputOffset), width, height); + + inputOffset += w * h * 16; + outputOffset += width * height * 4; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + private static ulong InterleaveBytes(uint left, uint right) + { + return InterleaveBytesWithZeros(left) | (InterleaveBytesWithZeros(right) << 8); + } + + private static ulong InterleaveBytesWithZeros(uint value) + { + ulong output = value; + output = (output ^ (output << 16)) & 0xffff0000ffffUL; + output = (output ^ (output << 8)) & 0xff00ff00ff00ffUL; + return output; + } + + private static void BCnLerpAlphaUnorm(Span<byte> alpha) + { + byte a0 = alpha[0]; + byte a1 = alpha[1]; + + if (a0 > a1) + { + alpha[2] = (byte)((6 * a0 + 1 * a1) / 7); + alpha[3] = (byte)((5 * a0 + 2 * a1) / 7); + alpha[4] = (byte)((4 * a0 + 3 * a1) / 7); + alpha[5] = (byte)((3 * a0 + 4 * a1) / 7); + alpha[6] = (byte)((2 * a0 + 5 * a1) / 7); + alpha[7] = (byte)((1 * a0 + 6 * a1) / 7); + } + else + { + alpha[2] = (byte)((4 * a0 + 1 * a1) / 5); + alpha[3] = (byte)((3 * a0 + 2 * a1) / 5); + alpha[4] = (byte)((2 * a0 + 3 * a1) / 5); + alpha[5] = (byte)((1 * a0 + 4 * a1) / 5); + alpha[6] = 0; + alpha[7] = 0xff; + } + } + + private static void BCnLerpAlphaSnorm(Span<byte> alpha) + { + sbyte a0 = (sbyte)alpha[0]; + sbyte a1 = (sbyte)alpha[1]; + + if (a0 > a1) + { + alpha[2] = (byte)((6 * a0 + 1 * a1) / 7); + alpha[3] = (byte)((5 * a0 + 2 * a1) / 7); + alpha[4] = (byte)((4 * a0 + 3 * a1) / 7); + alpha[5] = (byte)((3 * a0 + 4 * a1) / 7); + alpha[6] = (byte)((2 * a0 + 5 * a1) / 7); + alpha[7] = (byte)((1 * a0 + 6 * a1) / 7); + } + else + { + alpha[2] = (byte)((4 * a0 + 1 * a1) / 5); + alpha[3] = (byte)((3 * a0 + 2 * a1) / 5); + alpha[4] = (byte)((2 * a0 + 3 * a1) / 5); + alpha[5] = (byte)((1 * a0 + 4 * a1) / 5); + alpha[6] = 0x80; + alpha[7] = 0x7f; + } + } + + private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI) + { + if (Avx2.IsSupported) + { + Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output); + + Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u); + Vector128<uint> masks = Vector128.Create(7u); + + Vector128<byte> vClut; + + fixed (byte* pRPal = rPal) + { + vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte(); + } + + Vector128<uint> indices0 = Vector128.Create((uint)rI); + Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24)); + Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts); + Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts); + Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12); + Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12); + indices00 = Sse2.And(indices00, masks); + indices10 = Sse2.And(indices10, masks); + indices01 = Sse2.And(indices01, masks); + indices11 = Sse2.And(indices11, masks); + + Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32()); + Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32()); + + Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16()); + + outputAsVector128[0] = Ssse3.Shuffle(vClut, indices); + } + else + { + for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3) + { + output[i] = rPal[(int)(rI & 7)]; + } + } + } + + private unsafe static void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI) + { + if (Avx2.IsSupported) + { + Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output); + + Vector256<uint> shifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u); + + Vector128<uint> vClut128; + + fixed (byte* pRPal = rPal) + { + vClut128 = Sse2.LoadScalarVector128((ulong*)pRPal).AsUInt32(); + } + + Vector256<uint> vClut = Avx2.ConvertToVector256Int32(vClut128.AsByte()).AsUInt32(); + vClut = Avx2.ShiftLeftLogical(vClut, 24); + + Vector256<uint> indices0 = Vector256.Create((uint)rI); + Vector256<uint> indices1 = Vector256.Create((uint)(rI >> 24)); + + indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts); + indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts); + + outputAsVector256[0] = Avx2.Or(outputAsVector256[0], Avx2.PermuteVar8x32(vClut, indices0)); + outputAsVector256[1] = Avx2.Or(outputAsVector256[1], Avx2.PermuteVar8x32(vClut, indices1)); + } + else + { + for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, rI >>= 3) + { + output[i] = rPal[(int)(rI & 7)]; + } + } + } + + private unsafe static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input) + { + Span<uint> clut = stackalloc uint[4]; + + uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input); + uint c0 = (ushort)c0c1; + uint c1 = (ushort)(c0c1 >> 16); + + clut[0] = ConvertRgb565ToRgb888(c0) | 0xff000000; + clut[1] = ConvertRgb565ToRgb888(c1) | 0xff000000; + clut[2] = BC1LerpRgb2(clut[0], clut[1], c0, c1); + clut[3] = BC1LerpRgb3(clut[0], clut[1], c0, c1); + + BCnDecodeTileRgb(clut, output, input); + } + + private unsafe static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input) + { + Span<uint> clut = stackalloc uint[4]; + + uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input); + uint c0 = (ushort)c0c1; + uint c1 = (ushort)(c0c1 >> 16); + + clut[0] = ConvertRgb565ToRgb888(c0); + clut[1] = ConvertRgb565ToRgb888(c1); + clut[2] = BC23LerpRgb2(clut[0], clut[1]); + clut[3] = BC23LerpRgb3(clut[0], clut[1]); + + BCnDecodeTileRgb(clut, output, input); + } + + private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input) + { + if (Avx2.IsSupported) + { + Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output); + + Vector256<uint> shifts0 = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u); + Vector256<uint> shifts1 = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u); + Vector256<uint> masks = Vector256.Create(3u); + + Vector256<uint> vClut; + + fixed (uint* pClut = &clut[0]) + { + vClut = Sse2.LoadVector128(pClut).ToVector256Unsafe(); + } + + Vector256<uint> indices0; + + fixed (byte* pInput = input) + { + indices0 = Avx2.BroadcastScalarToVector256((uint*)(pInput + 4)); + } + + Vector256<uint> indices1 = indices0; + + indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts0); + indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts1); + indices0 = Avx2.And(indices0, masks); + indices1 = Avx2.And(indices1, masks); + + outputAsVector256[0] = Avx2.PermuteVar8x32(vClut, indices0); + outputAsVector256[1] = Avx2.PermuteVar8x32(vClut, indices1); + } + else + { + Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output); + + uint indices = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4)); + + for (int i = 0; i < BlockWidth * BlockHeight; i++, indices >>= 2) + { + outputAsUint[i] = clut[(int)(indices & 3)]; + } + } + } + + private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1) + { + if (c0 > c1) + { + return BC23LerpRgb2(color0, color1) | 0xff000000; + } + + uint carry = color0 & color1; + uint addHalve = ((color0 ^ color1) >> 1) & 0x7f7f7f; + return (addHalve + carry) | 0xff000000; + } + + private static uint BC23LerpRgb2(uint color0, uint color1) + { + uint r0 = (byte)color0; + uint g0 = color0 & 0xff00; + uint b0 = color0 & 0xff0000; + + uint r1 = (byte)color1; + uint g1 = color1 & 0xff00; + uint b1 = color1 & 0xff0000; + + uint mixR = (2 * r0 + r1) / 3; + uint mixG = (2 * g0 + g1) / 3; + uint mixB = (2 * b0 + b1) / 3; + + return mixR | (mixG & 0xff00) | (mixB & 0xff0000); + } + + private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1) + { + if (c0 > c1) + { + return BC23LerpRgb3(color0, color1) | 0xff000000; + } + + return 0; + } + + private static uint BC23LerpRgb3(uint color0, uint color1) + { + uint r0 = (byte)color0; + uint g0 = color0 & 0xff00; + uint b0 = color0 & 0xff0000; + + uint r1 = (byte)color1; + uint g1 = color1 & 0xff00; + uint b1 = color1 & 0xff0000; + + uint mixR = (2 * r1 + r0) / 3; + uint mixG = (2 * g1 + g0) / 3; + uint mixB = (2 * b1 + b0) / 3; + + return mixR | (mixG & 0xff00) | (mixB & 0xff0000); + } + + private static uint ConvertRgb565ToRgb888(uint value) + { + uint b = (value & 0x1f) << 19; + uint g = (value << 5) & 0xfc00; + uint r = (value >> 8) & 0xf8; + + b |= b >> 5; + g |= g >> 6; + r |= r >> 5; + + return r | (g & 0xff00) | (b & 0xff0000); + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/BCnEncoder.cs b/src/Ryujinx.Graphics.Texture/BCnEncoder.cs new file mode 100644 index 00000000..02b79c1b --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/BCnEncoder.cs @@ -0,0 +1,60 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.Texture.Encoders; +using System; + +namespace Ryujinx.Graphics.Texture +{ + public static class BCnEncoder + { + private const int BlockWidth = 4; + private const int BlockHeight = 4; + + public static byte[] EncodeBC7(byte[] data, int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(Math.Max(1, width >> l), BlockWidth); + int h = BitUtils.DivRoundUp(Math.Max(1, height >> l), BlockHeight); + + size += w * h * 16 * Math.Max(1, depth >> l) * layers; + } + + byte[] output = new byte[size]; + + int imageBaseIOffs = 0; + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int rgba8Size = width * height * depth * layers * 4; + + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + BC7Encoder.Encode( + output.AsMemory().Slice(imageBaseOOffs), + data.AsMemory().Slice(imageBaseIOffs), + width, + height, + EncodeMode.Fast | EncodeMode.Multithreaded); + + imageBaseIOffs += width * height * 4; + imageBaseOOffs += w * h * 16; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/BlockLinearConstants.cs b/src/Ryujinx.Graphics.Texture/BlockLinearConstants.cs new file mode 100644 index 00000000..d95691cf --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/BlockLinearConstants.cs @@ -0,0 +1,10 @@ +namespace Ryujinx.Graphics.Texture +{ + static class BlockLinearConstants + { + public const int GobStride = 64; + public const int GobHeight = 8; + + public const int GobSize = GobStride * GobHeight; + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/BlockLinearLayout.cs b/src/Ryujinx.Graphics.Texture/BlockLinearLayout.cs new file mode 100644 index 00000000..e098e959 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/BlockLinearLayout.cs @@ -0,0 +1,195 @@ +using Ryujinx.Common; +using System.Numerics; +using System.Runtime.CompilerServices; + +using static Ryujinx.Graphics.Texture.BlockLinearConstants; + +namespace Ryujinx.Graphics.Texture +{ + class BlockLinearLayout + { + private struct RobAndSliceSizes + { + public int RobSize; + public int SliceSize; + + public RobAndSliceSizes(int robSize, int sliceSize) + { + RobSize = robSize; + SliceSize = sliceSize; + } + } + + private int _texBpp; + + private int _bhMask; + private int _bdMask; + + private int _bhShift; + private int _bdShift; + private int _bppShift; + + private int _xShift; + + private int _robSize; + private int _sliceSize; + + // Variables for built in iteration. + private int _yPart; + private int _yzPart; + private int _zPart; + + public BlockLinearLayout( + int width, + int height, + int gobBlocksInY, + int gobBlocksInZ, + int bpp) + { + _texBpp = bpp; + + _bppShift = BitOperations.TrailingZeroCount(bpp); + + _bhMask = gobBlocksInY - 1; + _bdMask = gobBlocksInZ - 1; + + _bhShift = BitOperations.TrailingZeroCount(gobBlocksInY); + _bdShift = BitOperations.TrailingZeroCount(gobBlocksInZ); + + _xShift = BitOperations.TrailingZeroCount(GobSize * gobBlocksInY * gobBlocksInZ); + + RobAndSliceSizes rsSizes = GetRobAndSliceSizes(width, height, gobBlocksInY, gobBlocksInZ); + + _robSize = rsSizes.RobSize; + _sliceSize = rsSizes.SliceSize; + } + + private RobAndSliceSizes GetRobAndSliceSizes(int width, int height, int gobBlocksInY, int gobBlocksInZ) + { + int widthInGobs = BitUtils.DivRoundUp(width * _texBpp, GobStride); + + int robSize = GobSize * gobBlocksInY * gobBlocksInZ * widthInGobs; + + int sliceSize = BitUtils.DivRoundUp(height, gobBlocksInY * GobHeight) * robSize; + + return new RobAndSliceSizes(robSize, sliceSize); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffset(int x, int y, int z) + { + return GetOffsetWithLineOffset(x << _bppShift, y, z); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffsetWithLineOffset(int x, int y, int z) + { + int yh = y / GobHeight; + + int offset = (z >> _bdShift) * _sliceSize + (yh >> _bhShift) * _robSize; + + offset += (x / GobStride) << _xShift; + + offset += (yh & _bhMask) * GobSize; + + offset += ((z & _bdMask) * GobSize) << _bhShift; + + offset += ((x & 0x3f) >> 5) << 8; + offset += ((y & 0x07) >> 1) << 6; + offset += ((x & 0x1f) >> 4) << 5; + offset += ((y & 0x01) >> 0) << 4; + offset += ((x & 0x0f) >> 0) << 0; + + return offset; + } + + public (int offset, int size) GetRectangleRange(int x, int y, int width, int height) + { + // Justification: + // The 2D offset is a combination of separate x and y parts. + // Both components increase with input and never overlap bits. + // Therefore for each component, the minimum input value is the lowest that component can go. + // Minimum total value is minimum X component + minimum Y component. Similar goes for maximum. + + int start = GetOffset(x, y, 0); + int end = GetOffset(x + width - 1, y + height - 1, 0) + _texBpp; // Cover the last pixel. + return (start, end - start); + } + + public bool LayoutMatches(BlockLinearLayout other) + { + return _robSize == other._robSize && + _sliceSize == other._sliceSize && + _texBpp == other._texBpp && + _bhMask == other._bhMask && + _bdMask == other._bdMask; + } + + // Functions for built in iteration. + // Components of the offset can be updated separately, and combined to save some time. + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void SetY(int y) + { + int yh = y / GobHeight; + int offset = (yh >> _bhShift) * _robSize; + + offset += (yh & _bhMask) * GobSize; + + offset += ((y & 0x07) >> 1) << 6; + offset += ((y & 0x01) >> 0) << 4; + + _yPart = offset; + _yzPart = offset + _zPart; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void SetZ(int z) + { + int offset = (z >> _bdShift) * _sliceSize; + + offset += ((z & _bdMask) * GobSize) << _bhShift; + + _zPart = offset; + _yzPart = offset + _yPart; + } + + /// <summary> + /// Optimized conversion for line offset in bytes to an absolute offset. Input x must be divisible by 16. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffsetWithLineOffset16(int x) + { + int offset = (x / GobStride) << _xShift; + + offset += ((x & 0x3f) >> 5) << 8; + offset += ((x & 0x1f) >> 4) << 5; + + return offset + _yzPart; + } + + /// <summary> + /// Optimized conversion for line offset in bytes to an absolute offset. Input x must be divisible by 64. + /// </summary> + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffsetWithLineOffset64(int x) + { + int offset = (x / GobStride) << _xShift; + + return offset + _yzPart; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffset(int x) + { + x <<= _bppShift; + int offset = (x / GobStride) << _xShift; + + offset += ((x & 0x3f) >> 5) << 8; + offset += ((x & 0x1f) >> 4) << 5; + offset += (x & 0x0f); + + return offset + _yzPart; + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/Bpp12Pixel.cs b/src/Ryujinx.Graphics.Texture/Bpp12Pixel.cs new file mode 100644 index 00000000..5a38259e --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Bpp12Pixel.cs @@ -0,0 +1,11 @@ +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture +{ + [StructLayout(LayoutKind.Sequential, Pack = 1, Size = 12)] + public struct Bpp12Pixel + { + private ulong _elem1; + private uint _elem2; + } +} diff --git a/src/Ryujinx.Graphics.Texture/ETC2Decoder.cs b/src/Ryujinx.Graphics.Texture/ETC2Decoder.cs new file mode 100644 index 00000000..21ff4be4 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/ETC2Decoder.cs @@ -0,0 +1,682 @@ +using Ryujinx.Common; +using System; +using System.Buffers.Binary; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture +{ + public static class ETC2Decoder + { + private const uint AlphaMask = 0xff000000u; + + private const int BlockWidth = 4; + private const int BlockHeight = 4; + + private static readonly int[][] _etc1Lut = + { + new int[] { 2, 8, -2, -8 }, + new int[] { 5, 17, -5, -17 }, + new int[] { 9, 29, -9, -29 }, + new int[] { 13, 42, -13, -42 }, + new int[] { 18, 60, -18, -60 }, + new int[] { 24, 80, -24, -80 }, + new int[] { 33, 106, -33, -106 }, + new int[] { 47, 183, -47, -183 } + }; + + private static readonly int[] _etc2Lut = + { + 3, 6, 11, 16, 23, 32, 41, 64 + }; + + private static readonly int[][] _etc2AlphaLut = + { + new int[] { -3, -6, -9, -15, 2, 5, 8, 14 }, + new int[] { -3, -7, -10, -13, 2, 6, 9, 12 }, + new int[] { -2, -5, -8, -13, 1, 4, 7, 12 }, + new int[] { -2, -4, -6, -13, 1, 3, 5, 12 }, + new int[] { -3, -6, -8, -12, 2, 5, 7, 11 }, + new int[] { -3, -7, -9, -11, 2, 6, 8, 10 }, + new int[] { -4, -7, -8, -11, 3, 6, 7, 10 }, + new int[] { -3, -5, -8, -11, 2, 4, 7, 10 }, + new int[] { -2, -6, -8, -10, 1, 5, 7, 9 }, + new int[] { -2, -5, -8, -10, 1, 4, 7, 9 }, + new int[] { -2, -4, -8, -10, 1, 3, 7, 9 }, + new int[] { -2, -5, -7, -10, 1, 4, 6, 9 }, + new int[] { -3, -4, -7, -10, 2, 3, 6, 9 }, + new int[] { -1, -2, -3, -10, 0, 1, 2, 9 }, + new int[] { -4, -6, -8, -9, 3, 5, 7, 8 }, + new int[] { -3, -5, -7, -9, 2, 4, 6, 8 } + }; + + public static byte[] DecodeRgb(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) + { + ReadOnlySpan<ulong> dataUlong = MemoryMarshal.Cast<byte, ulong>(data); + + int inputOffset = 0; + + byte[] output = new byte[CalculateOutputSize(width, height, depth, levels, layers)]; + + Span<uint> outputUint = MemoryMarshal.Cast<byte, uint>(output); + Span<uint> tile = stackalloc uint[BlockWidth * BlockHeight]; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int wInBlocks = BitUtils.DivRoundUp(width, BlockWidth); + int hInBlocks = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < hInBlocks; y++) + { + int ty = y * BlockHeight; + int bh = Math.Min(BlockHeight, height - ty); + + for (int x = 0; x < wInBlocks; x++) + { + int tx = x * BlockWidth; + int bw = Math.Min(BlockWidth, width - tx); + + ulong colorBlock = dataUlong[inputOffset++]; + + DecodeBlock(tile, colorBlock); + + for (int py = 0; py < bh; py++) + { + int oOffsBase = imageBaseOOffs + ((ty + py) * width) + tx; + + for (int px = 0; px < bw; px++) + { + int oOffs = oOffsBase + px; + + outputUint[oOffs] = tile[py * BlockWidth + px] | AlphaMask; + } + } + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodePta(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) + { + ReadOnlySpan<ulong> dataUlong = MemoryMarshal.Cast<byte, ulong>(data); + + int inputOffset = 0; + + byte[] output = new byte[CalculateOutputSize(width, height, depth, levels, layers)]; + + Span<uint> outputUint = MemoryMarshal.Cast<byte, uint>(output); + Span<uint> tile = stackalloc uint[BlockWidth * BlockHeight]; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int wInBlocks = BitUtils.DivRoundUp(width, BlockWidth); + int hInBlocks = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < hInBlocks; y++) + { + int ty = y * BlockHeight; + int bh = Math.Min(BlockHeight, height - ty); + + for (int x = 0; x < wInBlocks; x++) + { + int tx = x * BlockWidth; + int bw = Math.Min(BlockWidth, width - tx); + + ulong colorBlock = dataUlong[inputOffset++]; + + DecodeBlockPta(tile, colorBlock); + + for (int py = 0; py < bh; py++) + { + int oOffsBase = imageBaseOOffs + ((ty + py) * width) + tx; + + tile.Slice(py * BlockWidth, bw).CopyTo(outputUint.Slice(oOffsBase, bw)); + } + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + public static byte[] DecodeRgba(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) + { + ReadOnlySpan<ulong> dataUlong = MemoryMarshal.Cast<byte, ulong>(data); + + int inputOffset = 0; + + byte[] output = new byte[CalculateOutputSize(width, height, depth, levels, layers)]; + + Span<uint> outputUint = MemoryMarshal.Cast<byte, uint>(output); + Span<uint> tile = stackalloc uint[BlockWidth * BlockHeight]; + + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int wInBlocks = BitUtils.DivRoundUp(width, BlockWidth); + int hInBlocks = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + for (int y = 0; y < hInBlocks; y++) + { + int ty = y * BlockHeight; + int bh = Math.Min(BlockHeight, height - ty); + + for (int x = 0; x < wInBlocks; x++) + { + int tx = x * BlockWidth; + int bw = Math.Min(BlockWidth, width - tx); + + ulong alphaBlock = dataUlong[inputOffset]; + ulong colorBlock = dataUlong[inputOffset + 1]; + + inputOffset += 2; + + DecodeBlock(tile, colorBlock); + + byte alphaBase = (byte)alphaBlock; + int[] alphaTable = _etc2AlphaLut[(alphaBlock >> 8) & 0xf]; + int alphaMultiplier = (int)(alphaBlock >> 12) & 0xf; + ulong alphaIndices = BinaryPrimitives.ReverseEndianness(alphaBlock); + + if (alphaMultiplier != 0) + { + for (int py = 0; py < bh; py++) + { + int oOffsBase = imageBaseOOffs + ((ty + py) * width) + tx; + + for (int px = 0; px < bw; px++) + { + int oOffs = oOffsBase + px; + int alphaIndex = (int)((alphaIndices >> (((px * BlockHeight + py) ^ 0xf) * 3)) & 7); + + byte a = Saturate(alphaBase + alphaTable[alphaIndex] * alphaMultiplier); + + outputUint[oOffs] = tile[py * BlockWidth + px] | ((uint)a << 24); + } + } + } + else + { + uint a = (uint)alphaBase << 24; + + for (int py = 0; py < bh; py++) + { + int oOffsBase = imageBaseOOffs + ((ty + py) * width) + tx; + + for (int px = 0; px < bw; px++) + { + int oOffs = oOffsBase + px; + + outputUint[oOffs] = tile[py * BlockWidth + px] | a; + } + } + } + } + } + + imageBaseOOffs += width * height; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + + private static void DecodeBlock(Span<uint> tile, ulong block) + { + uint blockLow = (uint)(block >> 0); + uint blockHigh = (uint)(block >> 32); + + uint r1, g1, b1; + uint r2, g2, b2; + + bool differentialMode = (blockLow & 0x2000000) != 0; + + if (differentialMode) + { + (r1, g1, b1, r2, g2, b2) = UnpackRgb555DiffEndPoints(blockLow); + + if (r2 > 31) + { + DecodeBlock59T(tile, blockLow, blockHigh); + } + else if (g2 > 31) + { + DecodeBlock58H(tile, blockLow, blockHigh); + } + else if (b2 > 31) + { + DecodeBlock57P(tile, block); + } + else + { + r1 |= r1 >> 5; + g1 |= g1 >> 5; + b1 |= b1 >> 5; + + r2 = (r2 << 3) | (r2 >> 2); + g2 = (g2 << 3) | (g2 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + + DecodeBlockETC1(tile, blockLow, blockHigh, r1, g1, b1, r2, g2, b2); + } + } + else + { + r1 = (blockLow & 0x0000f0) >> 0; + g1 = (blockLow & 0x00f000) >> 8; + b1 = (blockLow & 0xf00000) >> 16; + + r2 = (blockLow & 0x00000f) << 4; + g2 = (blockLow & 0x000f00) >> 4; + b2 = (blockLow & 0x0f0000) >> 12; + + r1 |= r1 >> 4; + g1 |= g1 >> 4; + b1 |= b1 >> 4; + + r2 |= r2 >> 4; + g2 |= g2 >> 4; + b2 |= b2 >> 4; + + DecodeBlockETC1(tile, blockLow, blockHigh, r1, g1, b1, r2, g2, b2); + } + } + + private static void DecodeBlockPta(Span<uint> tile, ulong block) + { + uint blockLow = (uint)(block >> 0); + uint blockHigh = (uint)(block >> 32); + + (uint r1, uint g1, uint b1, uint r2, uint g2, uint b2) = UnpackRgb555DiffEndPoints(blockLow); + + bool fullyOpaque = (blockLow & 0x2000000) != 0; + + if (fullyOpaque) + { + if (r2 > 31) + { + DecodeBlock59T(tile, blockLow, blockHigh); + } + else if (g2 > 31) + { + DecodeBlock58H(tile, blockLow, blockHigh); + } + else if (b2 > 31) + { + DecodeBlock57P(tile, block); + } + else + { + r1 |= r1 >> 5; + g1 |= g1 >> 5; + b1 |= b1 >> 5; + + r2 = (r2 << 3) | (r2 >> 2); + g2 = (g2 << 3) | (g2 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + + DecodeBlockETC1(tile, blockLow, blockHigh, r1, g1, b1, r2, g2, b2); + } + + for (int i = 0; i < tile.Length; i++) + { + tile[i] |= AlphaMask; + } + } + else + { + if (r2 > 31) + { + DecodeBlock59T(tile, blockLow, blockHigh, AlphaMask); + } + else if (g2 > 31) + { + DecodeBlock58H(tile, blockLow, blockHigh, AlphaMask); + } + else if (b2 > 31) + { + DecodeBlock57P(tile, block); + + for (int i = 0; i < tile.Length; i++) + { + tile[i] |= AlphaMask; + } + } + else + { + r1 |= r1 >> 5; + g1 |= g1 >> 5; + b1 |= b1 >> 5; + + r2 = (r2 << 3) | (r2 >> 2); + g2 = (g2 << 3) | (g2 >> 2); + b2 = (b2 << 3) | (b2 >> 2); + + DecodeBlockETC1(tile, blockLow, blockHigh, r1, g1, b1, r2, g2, b2, AlphaMask); + } + } + } + + private static (uint, uint, uint, uint, uint, uint) UnpackRgb555DiffEndPoints(uint blockLow) + { + uint r1 = (blockLow & 0x0000f8) >> 0; + uint g1 = (blockLow & 0x00f800) >> 8; + uint b1 = (blockLow & 0xf80000) >> 16; + + uint r2 = (uint)((sbyte)(r1 >> 3) + ((sbyte)((blockLow & 0x000007) << 5) >> 5)); + uint g2 = (uint)((sbyte)(g1 >> 3) + ((sbyte)((blockLow & 0x000700) >> 3) >> 5)); + uint b2 = (uint)((sbyte)(b1 >> 3) + ((sbyte)((blockLow & 0x070000) >> 11) >> 5)); + + return (r1, g1, b1, r2, g2, b2); + } + + private static void DecodeBlock59T(Span<uint> tile, uint blockLow, uint blockHigh, uint alphaMask = 0) + { + uint r1 = (blockLow & 3) | ((blockLow >> 1) & 0xc); + uint g1 = (blockLow >> 12) & 0xf; + uint b1 = (blockLow >> 8) & 0xf; + + uint r2 = (blockLow >> 20) & 0xf; + uint g2 = (blockLow >> 16) & 0xf; + uint b2 = (blockLow >> 28) & 0xf; + + r1 |= r1 << 4; + g1 |= g1 << 4; + b1 |= b1 << 4; + + r2 |= r2 << 4; + g2 |= g2 << 4; + b2 |= b2 << 4; + + int dist = _etc2Lut[((blockLow >> 24) & 1) | ((blockLow >> 25) & 6)]; + + Span<uint> palette = stackalloc uint[4]; + + palette[0] = Pack(r1, g1, b1); + palette[1] = Pack(r2, g2, b2, dist); + palette[2] = Pack(r2, g2, b2); + palette[3] = Pack(r2, g2, b2, -dist); + + blockHigh = BinaryPrimitives.ReverseEndianness(blockHigh); + + for (int y = 0; y < BlockHeight; y++) + { + for (int x = 0; x < BlockWidth; x++) + { + int offset = (y * 4) + x; + int index = (x * 4) + y; + + int paletteIndex = (int)((blockHigh >> index) & 1) | (int)((blockHigh >> (index + 15)) & 2); + + tile[offset] = palette[paletteIndex]; + + if (alphaMask != 0) + { + if (paletteIndex == 2) + { + tile[offset] = 0; + } + else + { + tile[offset] |= alphaMask; + } + } + } + } + } + + private static void DecodeBlock58H(Span<uint> tile, uint blockLow, uint blockHigh, uint alphaMask = 0) + { + uint r1 = (blockLow >> 3) & 0xf; + uint g1 = ((blockLow << 1) & 0xe) | ((blockLow >> 12) & 1); + uint b1 = ((blockLow >> 23) & 1) | ((blockLow >> 7) & 6) | ((blockLow >> 8) & 8); + + uint r2 = (blockLow >> 19) & 0xf; + uint g2 = ((blockLow >> 31) & 1) | ((blockLow >> 15) & 0xe); + uint b2 = (blockLow >> 27) & 0xf; + + uint rgb1 = Pack4Be(r1, g1, b1); + uint rgb2 = Pack4Be(r2, g2, b2); + + r1 |= r1 << 4; + g1 |= g1 << 4; + b1 |= b1 << 4; + + r2 |= r2 << 4; + g2 |= g2 << 4; + b2 |= b2 << 4; + + int dist = _etc2Lut[(rgb1 >= rgb2 ? 1u : 0u) | ((blockLow >> 23) & 2) | ((blockLow >> 24) & 4)]; + + Span<uint> palette = stackalloc uint[4]; + + palette[0] = Pack(r1, g1, b1, dist); + palette[1] = Pack(r1, g1, b1, -dist); + palette[2] = Pack(r2, g2, b2, dist); + palette[3] = Pack(r2, g2, b2, -dist); + + blockHigh = BinaryPrimitives.ReverseEndianness(blockHigh); + + for (int y = 0; y < BlockHeight; y++) + { + for (int x = 0; x < BlockWidth; x++) + { + int offset = (y * 4) + x; + int index = (x * 4) + y; + + int paletteIndex = (int)((blockHigh >> index) & 1) | (int)((blockHigh >> (index + 15)) & 2); + + tile[offset] = palette[paletteIndex]; + + if (alphaMask != 0) + { + if (paletteIndex == 2) + { + tile[offset] = 0; + } + else + { + tile[offset] |= alphaMask; + } + } + } + } + } + + private static void DecodeBlock57P(Span<uint> tile, ulong block) + { + int r0 = (int)((block >> 1) & 0x3f); + int g0 = (int)(((block >> 9) & 0x3f) | ((block & 1) << 6)); + int b0 = (int)(((block >> 31) & 1) | ((block >> 15) & 6) | ((block >> 16) & 0x18) | ((block >> 3) & 0x20)); + + int rh = (int)(((block >> 24) & 1) | ((block >> 25) & 0x3e)); + int gh = (int)((block >> 33) & 0x7f); + int bh = (int)(((block >> 43) & 0x1f) | ((block >> 27) & 0x20)); + + int rv = (int)(((block >> 53) & 7) | ((block >> 37) & 0x38)); + int gv = (int)(((block >> 62) & 3) | ((block >> 46) & 0x7c)); + int bv = (int)((block >> 56) & 0x3f); + + r0 = (r0 << 2) | (r0 >> 4); + g0 = (g0 << 1) | (g0 >> 6); + b0 = (b0 << 2) | (b0 >> 4); + + rh = (rh << 2) | (rh >> 4); + gh = (gh << 1) | (gh >> 6); + bh = (bh << 2) | (bh >> 4); + + rv = (rv << 2) | (rv >> 4); + gv = (gv << 1) | (gv >> 6); + bv = (bv << 2) | (bv >> 4); + + for (int y = 0; y < BlockHeight; y++) + { + for (int x = 0; x < BlockWidth; x++) + { + int offset = y * BlockWidth + x; + + byte r = Saturate(((x * (rh - r0)) + (y * (rv - r0)) + (r0 * 4) + 2) >> 2); + byte g = Saturate(((x * (gh - g0)) + (y * (gv - g0)) + (g0 * 4) + 2) >> 2); + byte b = Saturate(((x * (bh - b0)) + (y * (bv - b0)) + (b0 * 4) + 2) >> 2); + + tile[offset] = Pack(r, g, b); + } + } + } + + private static void DecodeBlockETC1( + Span<uint> tile, + uint blockLow, + uint blockHigh, + uint r1, + uint g1, + uint b1, + uint r2, + uint g2, + uint b2, + uint alphaMask = 0) + { + int[] table1 = _etc1Lut[(blockLow >> 29) & 7]; + int[] table2 = _etc1Lut[(blockLow >> 26) & 7]; + + bool flip = (blockLow & 0x1000000) != 0; + + if (!flip) + { + for (int y = 0; y < BlockHeight; y++) + { + for (int x = 0; x < BlockWidth / 2; x++) + { + uint color1 = CalculatePixel(r1, g1, b1, x + 0, y, blockHigh, table1, alphaMask); + uint color2 = CalculatePixel(r2, g2, b2, x + 2, y, blockHigh, table2, alphaMask); + + int offset1 = y * BlockWidth + x; + int offset2 = y * BlockWidth + x + 2; + + tile[offset1] = color1; + tile[offset2] = color2; + } + } + } + else + { + for (int y = 0; y < BlockHeight / 2; y++) + { + for (int x = 0; x < BlockWidth; x++) + { + uint color1 = CalculatePixel(r1, g1, b1, x, y + 0, blockHigh, table1, alphaMask); + uint color2 = CalculatePixel(r2, g2, b2, x, y + 2, blockHigh, table2, alphaMask); + + int offset1 = (y * BlockWidth) + x; + int offset2 = ((y + 2) * BlockWidth) + x; + + tile[offset1] = color1; + tile[offset2] = color2; + } + } + } + } + + private static uint CalculatePixel(uint r, uint g, uint b, int x, int y, uint block, int[] table, uint alphaMask) + { + int index = x * BlockHeight + y; + uint msb = block << 1; + uint tableIndex = index < 8 + ? ((block >> (index + 24)) & 1) + ((msb >> (index + 8)) & 2) + : ((block >> (index + 8)) & 1) + ((msb >> (index - 8)) & 2); + + if (alphaMask != 0) + { + if (tableIndex == 0) + { + return Pack(r, g, b) | alphaMask; + } + else if (tableIndex == 2) + { + return 0; + } + else + { + return Pack(r, g, b, table[tableIndex]) | alphaMask; + } + } + + return Pack(r, g, b, table[tableIndex]); + } + + private static uint Pack(uint r, uint g, uint b, int offset) + { + r = Saturate((int)(r + offset)); + g = Saturate((int)(g + offset)); + b = Saturate((int)(b + offset)); + + return Pack(r, g, b); + } + + private static uint Pack(uint r, uint g, uint b) + { + return r | (g << 8) | (b << 16); + } + + private static uint Pack4Be(uint r, uint g, uint b) + { + return (r << 8) | (g << 4) | b; + } + + private static byte Saturate(int value) + { + return value > byte.MaxValue ? byte.MaxValue : value < byte.MinValue ? byte.MinValue : (byte)value; + } + + private static int CalculateOutputSize(int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; + } + + return size; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs b/src/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs new file mode 100644 index 00000000..35d36bce --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs @@ -0,0 +1,1005 @@ +using Ryujinx.Graphics.Texture.Utils; +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Threading.Tasks; + +namespace Ryujinx.Graphics.Texture.Encoders +{ + static class BC7Encoder + { + private const int MinColorVarianceForModeChange = 160; + + public static void Encode(Memory<byte> outputStorage, ReadOnlyMemory<byte> data, int width, int height, EncodeMode mode) + { + int widthInBlocks = (width + 3) / 4; + int heightInBlocks = (height + 3) / 4; + + bool fastMode = (mode & EncodeMode.ModeMask) == EncodeMode.Fast; + + if (mode.HasFlag(EncodeMode.Multithreaded)) + { + Parallel.For(0, heightInBlocks, (yInBlocks) => + { + Span<ulong> output = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span); + int y = yInBlocks * 4; + + for (int xInBlocks = 0; xInBlocks < widthInBlocks; xInBlocks++) + { + int x = xInBlocks * 4; + Block block = CompressBlock(data.Span, x, y, width, height, fastMode); + + int offset = (yInBlocks * widthInBlocks + xInBlocks) * 2; + output[offset] = block.Low; + output[offset + 1] = block.High; + } + }); + } + else + { + Span<ulong> output = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span); + int offset = 0; + + for (int y = 0; y < height; y += 4) + { + for (int x = 0; x < width; x += 4) + { + Block block = CompressBlock(data.Span, x, y, width, height, fastMode); + + output[offset++] = block.Low; + output[offset++] = block.High; + } + } + } + } + + private static readonly int[] _mostFrequentPartitions = new int[] + { + 0, 13, 2, 1, 15, 14, 10, 23 + }; + + private static Block CompressBlock(ReadOnlySpan<byte> data, int x, int y, int width, int height, bool fastMode) + { + int w = Math.Min(4, width - x); + int h = Math.Min(4, height - y); + + var dataUint = MemoryMarshal.Cast<byte, uint>(data); + + int baseOffset = y * width + x; + + Span<uint> tile = stackalloc uint[w * h]; + + for (int ty = 0; ty < h; ty++) + { + int rowOffset = baseOffset + ty * width; + + for (int tx = 0; tx < w; tx++) + { + tile[ty * w + tx] = dataUint[rowOffset + tx]; + } + } + + return fastMode ? EncodeFast(tile, w, h) : EncodeExhaustive(tile, w, h); + } + + private static Block EncodeFast(ReadOnlySpan<uint> tile, int w, int h) + { + (RgbaColor8 minColor, RgbaColor8 maxColor) = BC67Utils.GetMinMaxColors(tile, w, h); + + bool alphaNotOne = minColor.A != 255 || maxColor.A != 255; + int variance = BC67Utils.SquaredDifference(minColor.GetColor32(), maxColor.GetColor32()); + int selectedMode; + int indexMode = 0; + + if (alphaNotOne) + { + bool constantAlpha = minColor.A == maxColor.A; + if (constantAlpha) + { + selectedMode = variance > MinColorVarianceForModeChange ? 7 : 6; + } + else + { + if (variance > MinColorVarianceForModeChange) + { + Span<uint> uniqueRGB = stackalloc uint[16]; + Span<uint> uniqueAlpha = stackalloc uint[16]; + + int uniqueRGBCount = 0; + int uniqueAlphaCount = 0; + + uint rgbMask = new RgbaColor8(255, 255, 255, 0).ToUInt32(); + uint alphaMask = new RgbaColor8(0, 0, 0, 255).ToUInt32(); + + for (int i = 0; i < tile.Length; i++) + { + uint c = tile[i]; + + if (!uniqueRGB.Slice(0, uniqueRGBCount).Contains(c & rgbMask)) + { + uniqueRGB[uniqueRGBCount++] = c & rgbMask; + } + + if (!uniqueAlpha.Slice(0, uniqueAlphaCount).Contains(c & alphaMask)) + { + uniqueAlpha[uniqueAlphaCount++] = c & alphaMask; + } + } + + selectedMode = 4; + indexMode = uniqueRGBCount > uniqueAlphaCount ? 1 : 0; + } + else + { + selectedMode = 5; + } + } + } + else + { + if (variance > MinColorVarianceForModeChange) + { + selectedMode = 1; + } + else + { + selectedMode = 6; + } + } + + int selectedPartition = 0; + + if (selectedMode == 1 || selectedMode == 7) + { + int partitionSelectionLowestError = int.MaxValue; + + for (int i = 0; i < _mostFrequentPartitions.Length; i++) + { + int p = _mostFrequentPartitions[i]; + int error = GetEndPointSelectionErrorFast(tile, 2, p, w, h, partitionSelectionLowestError); + if (error < partitionSelectionLowestError) + { + partitionSelectionLowestError = error; + selectedPartition = p; + } + } + } + + return Encode(selectedMode, selectedPartition, 0, indexMode, fastMode: true, tile, w, h, out _); + } + + private static Block EncodeExhaustive(ReadOnlySpan<uint> tile, int w, int h) + { + Block bestBlock = default; + int lowestError = int.MaxValue; + int lowestErrorSubsets = int.MaxValue; + + for (int m = 0; m < 8; m++) + { + for (int r = 0; r < (m == 4 || m == 5 ? 4 : 1); r++) + { + for (int im = 0; im < (m == 4 ? 2 : 1); im++) + { + for (int p = 0; p < 1 << BC67Tables.BC7ModeInfos[m].PartitionBitCount; p++) + { + Block block = Encode(m, p, r, im, fastMode: false, tile, w, h, out int maxError); + if (maxError < lowestError || (maxError == lowestError && BC67Tables.BC7ModeInfos[m].SubsetCount < lowestErrorSubsets)) + { + lowestError = maxError; + lowestErrorSubsets = BC67Tables.BC7ModeInfos[m].SubsetCount; + bestBlock = block; + } + } + } + } + } + + return bestBlock; + } + + private static Block Encode( + int mode, + int partition, + int rotation, + int indexMode, + bool fastMode, + ReadOnlySpan<uint> tile, + int w, + int h, + out int errorSum) + { + BC7ModeInfo modeInfo = BC67Tables.BC7ModeInfos[mode]; + int subsetCount = modeInfo.SubsetCount; + int partitionBitCount = modeInfo.PartitionBitCount; + int rotationBitCount = modeInfo.RotationBitCount; + int indexModeBitCount = modeInfo.IndexModeBitCount; + int colorDepth = modeInfo.ColorDepth; + int alphaDepth = modeInfo.AlphaDepth; + int pBits = modeInfo.PBits; + int colorIndexBitCount = modeInfo.ColorIndexBitCount; + int alphaIndexBitCount = modeInfo.AlphaIndexBitCount; + bool separateAlphaIndices = alphaIndexBitCount != 0; + + uint alphaMask; + + if (separateAlphaIndices) + { + alphaMask = rotation switch + { + 1 => new RgbaColor8(255, 0, 0, 0).ToUInt32(), + 2 => new RgbaColor8(0, 255, 0, 0).ToUInt32(), + 3 => new RgbaColor8(0, 0, 255, 0).ToUInt32(), + _ => new RgbaColor8(0, 0, 0, 255).ToUInt32() + }; + } + else + { + alphaMask = new RgbaColor8(0, 0, 0, 0).ToUInt32(); + } + + if (indexMode != 0) + { + alphaMask = ~alphaMask; + } + + // + // Select color palette. + // + + Span<uint> endPoints0 = stackalloc uint[subsetCount]; + Span<uint> endPoints1 = stackalloc uint[subsetCount]; + + SelectEndPoints( + tile, + w, + h, + endPoints0, + endPoints1, + subsetCount, + partition, + colorIndexBitCount, + colorDepth, + alphaDepth, + ~alphaMask, + fastMode); + + if (separateAlphaIndices) + { + SelectEndPoints( + tile, + w, + h, + endPoints0, + endPoints1, + subsetCount, + partition, + alphaIndexBitCount, + colorDepth, + alphaDepth, + alphaMask, + fastMode); + } + + Span<int> pBitValues = stackalloc int[pBits]; + + for (int i = 0; i < pBits; i++) + { + int pBit; + + if (pBits == subsetCount) + { + pBit = GetPBit(endPoints0[i], endPoints1[i], colorDepth, alphaDepth); + } + else + { + int subset = i >> 1; + uint color = (i & 1) == 0 ? endPoints0[subset] : endPoints1[subset]; + pBit = GetPBit(color, colorDepth, alphaDepth); + } + + pBitValues[i] = pBit; + } + + int colorIndexCount = 1 << colorIndexBitCount; + int alphaIndexCount = 1 << alphaIndexBitCount; + + Span<byte> colorIndices = stackalloc byte[16]; + Span<byte> alphaIndices = stackalloc byte[16]; + + errorSum = BC67Utils.SelectIndices( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + colorIndices, + subsetCount, + partition, + colorIndexBitCount, + colorIndexCount, + colorDepth, + alphaDepth, + pBits, + alphaMask); + + if (separateAlphaIndices) + { + errorSum += BC67Utils.SelectIndices( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + alphaIndices, + subsetCount, + partition, + alphaIndexBitCount, + alphaIndexCount, + colorDepth, + alphaDepth, + pBits, + ~alphaMask); + } + + Span<bool> colorSwapSubset = stackalloc bool[3]; + + for (int i = 0; i < 3; i++) + { + colorSwapSubset[i] = colorIndices[BC67Tables.FixUpIndices[subsetCount - 1][partition][i]] >= (colorIndexCount >> 1); + } + + bool alphaSwapSubset = alphaIndices[0] >= (alphaIndexCount >> 1); + + Block block = new Block(); + + int offset = 0; + + block.Encode(1UL << mode, ref offset, mode + 1); + block.Encode((ulong)partition, ref offset, partitionBitCount); + block.Encode((ulong)rotation, ref offset, rotationBitCount); + block.Encode((ulong)indexMode, ref offset, indexModeBitCount); + + for (int comp = 0; comp < 3; comp++) + { + int rotatedComp = comp; + + if (((comp + 1) & 3) == rotation) + { + rotatedComp = 3; + } + + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]); + RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]); + + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + if (indexMode == 0 ? colorSwapSubset[subset] : alphaSwapSubset) + { + block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth); + block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth); + } + else + { + block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth); + block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth); + } + } + } + + if (alphaDepth != 0) + { + int rotatedComp = (rotation - 1) & 3; + + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]); + RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]); + + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + if (separateAlphaIndices && indexMode == 0 ? alphaSwapSubset : colorSwapSubset[subset]) + { + block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth); + block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth); + } + else + { + block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth); + block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth); + } + } + } + + for (int i = 0; i < pBits; i++) + { + block.Encode((ulong)pBitValues[i], ref offset, 1); + } + + byte[] fixUpTable = BC67Tables.FixUpIndices[subsetCount - 1][partition]; + + for (int i = 0; i < 16; i++) + { + int subset = BC67Tables.PartitionTable[subsetCount - 1][partition][i]; + byte index = colorIndices[i]; + + if (colorSwapSubset[subset]) + { + index = (byte)(index ^ (colorIndexCount - 1)); + } + + int finalIndexBitCount = i == fixUpTable[subset] ? colorIndexBitCount - 1 : colorIndexBitCount; + + Debug.Assert(index < (1 << finalIndexBitCount)); + + block.Encode(index, ref offset, finalIndexBitCount); + } + + if (separateAlphaIndices) + { + for (int i = 0; i < 16; i++) + { + byte index = alphaIndices[i]; + + if (alphaSwapSubset) + { + index = (byte)(index ^ (alphaIndexCount - 1)); + } + + int finalIndexBitCount = i == 0 ? alphaIndexBitCount - 1 : alphaIndexBitCount; + + Debug.Assert(index < (1 << finalIndexBitCount)); + + block.Encode(index, ref offset, finalIndexBitCount); + } + } + + return block; + } + + private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan<uint> tile, int subsetCount, int partition, int w, int h, int maxError) + { + byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition]; + + Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount]; + Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount]; + + BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount); + + Span<uint> endPoints0 = stackalloc uint[subsetCount]; + Span<uint> endPoints1 = stackalloc uint[subsetCount]; + + SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue); + + Span<RgbaColor32> palette = stackalloc RgbaColor32[8]; + + int errorSum = 0; + + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32(); + int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A; + if (sum != 0) + { + blockDir = (blockDir << 6) / new RgbaColor32(sum); + } + + uint c0 = endPoints0[subset]; + uint c1 = endPoints1[subset]; + + int pBit0 = GetPBit(c0, 6, 0); + int pBit1 = GetPBit(c1, 6, 0); + + c0 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32(); + c1 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32(); + + if (Sse41.IsSupported) + { + Vector128<byte> c0Rep = Vector128.Create(c0).AsByte(); + Vector128<byte> c1Rep = Vector128.Create(c1).AsByte(); + + Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128<byte> rWeights; + Vector128<byte> lWeights; + + fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1]) + { + rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte(); + } + + Vector128<byte> iWeights = Sse2.UnpackLow(rWeights, lWeights); + Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + + static Vector128<short> ShiftRoundToNearest(Vector128<short> x) + { + return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6); + } + + Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + + for (int i = 0; i < tile.Length; i++) + { + if (partitionTable[i] != subset) + { + continue; + } + + uint c = tile[i]; + + Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128<short> delta0 = Sse2.Subtract(color, pal0); + Vector128<short> delta1 = Sse2.Subtract(color, pal1); + Vector128<short> delta2 = Sse2.Subtract(color, pal2); + Vector128<short> delta3 = Sse2.Subtract(color, pal3); + + Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + + Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + + Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + + Vector128<ushort> min = Sse41.MinHorizontal(delta); + + errorSum += min.GetElement(0); + } + } + else + { + RgbaColor32 e032 = RgbaColor8.FromUInt32(c0).GetColor32(); + RgbaColor32 e132 = RgbaColor8.FromUInt32(c1).GetColor32(); + + palette[0] = e032; + palette[palette.Length - 1] = e132; + + for (int i = 1; i < palette.Length - 1; i++) + { + palette[i] = BC67Utils.Interpolate(e032, e132, i, 3); + } + + for (int i = 0; i < tile.Length; i++) + { + if (partitionTable[i] != subset) + { + continue; + } + + uint c = tile[i]; + RgbaColor32 color = Unsafe.As<uint, RgbaColor8>(ref c).GetColor32(); + + int bestMatchScore = int.MaxValue; + + for (int j = 0; j < palette.Length; j++) + { + int score = BC67Utils.SquaredDifference(color, palette[j]); + + if (score < bestMatchScore) + { + bestMatchScore = score; + } + } + + errorSum += bestMatchScore; + } + } + + // No point in continuing if we are already above maximum. + if (errorSum >= maxError) + { + return int.MaxValue; + } + } + + return errorSum; + } + + private static void SelectEndPoints( + ReadOnlySpan<uint> tile, + int w, + int h, + Span<uint> endPoints0, + Span<uint> endPoints1, + int subsetCount, + int partition, + int indexBitCount, + int colorDepth, + int alphaDepth, + uint writeMask, + bool fastMode) + { + byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition]; + + Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount]; + Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount]; + + BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount); + + uint inverseMask = ~writeMask; + + for (int i = 0; i < subsetCount; i++) + { + Unsafe.As<RgbaColor8, uint>(ref minColors[i]) |= inverseMask; + Unsafe.As<RgbaColor8, uint>(ref maxColors[i]) |= inverseMask; + } + + if (fastMode) + { + SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, writeMask); + } + else + { + Span<RgbaColor8> colors = stackalloc RgbaColor8[subsetCount * 16]; + Span<byte> counts = stackalloc byte[subsetCount]; + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++) + { + int subset = partitionTable[ty * 4 + tx]; + RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++] | inverseMask); + + static void AddIfNew(Span<RgbaColor8> values, RgbaColor8 value, int subset, ref byte count) + { + for (int i = 0; i < count; i++) + { + if (values[subset * 16 + i] == value) + { + return; + } + } + + values[subset * 16 + count++] = value; + } + + AddIfNew(colors, color, subset, ref counts[subset]); + } + } + + for (int subset = 0; subset < subsetCount; subset++) + { + int offset = subset * 16; + + RgbaColor8 minColor = minColors[subset]; + RgbaColor8 maxColor = maxColors[subset]; + + ReadOnlySpan<RgbaColor8> subsetColors = colors.Slice(offset, counts[subset]); + + (RgbaColor8 e0, RgbaColor8 e1) = SelectEndPoints(subsetColors, minColor, maxColor, indexBitCount, colorDepth, alphaDepth, inverseMask); + + endPoints0[subset] = (endPoints0[subset] & inverseMask) | (e0.ToUInt32() & writeMask); + endPoints1[subset] = (endPoints1[subset] & inverseMask) | (e1.ToUInt32() & writeMask); + } + } + } + + private static unsafe void SelectEndPointsFast( + ReadOnlySpan<byte> partitionTable, + ReadOnlySpan<uint> tile, + int w, + int h, + int subsetCount, + ReadOnlySpan<RgbaColor8> minColors, + ReadOnlySpan<RgbaColor8> maxColors, + Span<uint> endPoints0, + Span<uint> endPoints1, + uint writeMask) + { + uint inverseMask = ~writeMask; + + if (Sse41.IsSupported && w == 4 && h == 4) + { + Vector128<byte> row0, row1, row2, row3; + Vector128<short> ones = Vector128<short>.AllBitsSet; + + fixed (uint* pTile = tile) + { + row0 = Sse2.LoadVector128(pTile).AsByte(); + row1 = Sse2.LoadVector128(pTile + 4).AsByte(); + row2 = Sse2.LoadVector128(pTile + 8).AsByte(); + row3 = Sse2.LoadVector128(pTile + 12).AsByte(); + } + + Vector128<byte> partitionMask; + + fixed (byte* pPartitionTable = partitionTable) + { + partitionMask = Sse2.LoadVector128(pPartitionTable); + } + + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32(); + int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A; + if (sum != 0) + { + blockDir = (blockDir << 6) / new RgbaColor32(sum); + } + + Vector128<byte> bd = Vector128.Create(blockDir.GetColor8().ToUInt32()).AsByte(); + + Vector128<short> delta0 = Ssse3.MultiplyAddAdjacent(row0, bd.AsSByte()); + Vector128<short> delta1 = Ssse3.MultiplyAddAdjacent(row1, bd.AsSByte()); + Vector128<short> delta2 = Ssse3.MultiplyAddAdjacent(row2, bd.AsSByte()); + Vector128<short> delta3 = Ssse3.MultiplyAddAdjacent(row3, bd.AsSByte()); + + Vector128<short> delta01 = Ssse3.HorizontalAdd(delta0, delta1); + Vector128<short> delta23 = Ssse3.HorizontalAdd(delta2, delta3); + + Vector128<byte> subsetMask = Sse2.Xor(Sse2.CompareEqual(partitionMask, Vector128.Create((byte)subset)), ones.AsByte()); + + Vector128<short> subsetMask01 = Sse2.UnpackLow(subsetMask, subsetMask).AsInt16(); + Vector128<short> subsetMask23 = Sse2.UnpackHigh(subsetMask, subsetMask).AsInt16(); + + Vector128<ushort> min01 = Sse41.MinHorizontal(Sse2.Or(delta01, subsetMask01).AsUInt16()); + Vector128<ushort> min23 = Sse41.MinHorizontal(Sse2.Or(delta23, subsetMask23).AsUInt16()); + Vector128<ushort> max01 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask01, delta01), ones).AsUInt16()); + Vector128<ushort> max23 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask23, delta23), ones).AsUInt16()); + + uint minPos01 = min01.AsUInt32().GetElement(0); + uint minPos23 = min23.AsUInt32().GetElement(0); + uint maxPos01 = max01.AsUInt32().GetElement(0); + uint maxPos23 = max23.AsUInt32().GetElement(0); + + uint minDistColor = (ushort)minPos23 < (ushort)minPos01 + ? tile[(int)(minPos23 >> 16) + 8] + : tile[(int)(minPos01 >> 16)]; + + // Note that we calculate the maximum as the minimum of the inverse, so less here is actually greater. + uint maxDistColor = (ushort)maxPos23 < (ushort)maxPos01 + ? tile[(int)(maxPos23 >> 16) + 8] + : tile[(int)(maxPos01 >> 16)]; + + endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor & writeMask); + endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor & writeMask); + } + } + else + { + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32(); + blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0); + + int minDist = int.MaxValue; + int maxDist = int.MinValue; + + RgbaColor8 minDistColor = default; + RgbaColor8 maxDistColor = default; + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++, i++) + { + if (partitionTable[ty * 4 + tx] != subset) + { + continue; + } + + RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]); + int dist = RgbaColor32.Dot(color.GetColor32(), blockDir); + + if (minDist > dist) + { + minDist = dist; + minDistColor = color; + } + + if (maxDist < dist) + { + maxDist = dist; + maxDistColor = color; + } + } + } + + endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor.ToUInt32() & writeMask); + endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor.ToUInt32() & writeMask); + } + } + } + + private static (RgbaColor8, RgbaColor8) SelectEndPoints( + ReadOnlySpan<RgbaColor8> values, + RgbaColor8 minValue, + RgbaColor8 maxValue, + int indexBitCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + int n = values.Length; + int numInterpolatedColors = 1 << indexBitCount; + int numInterpolatedColorsMinus1 = numInterpolatedColors - 1; + + if (n == 0) + { + return (default, default); + } + + minValue = BC67Utils.Quantize(minValue, colorDepth, alphaDepth); + maxValue = BC67Utils.Quantize(maxValue, colorDepth, alphaDepth); + + RgbaColor32 blockDir = maxValue.GetColor32() - minValue.GetColor32(); + blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0); + + int minDist = int.MaxValue; + int maxDist = 0; + + for (int i = 0; i < values.Length; i++) + { + RgbaColor8 color = values[i]; + int dist = RgbaColor32.Dot(BC67Utils.Quantize(color, colorDepth, alphaDepth).GetColor32(), blockDir); + + if (minDist >= dist) + { + minDist = dist; + } + + if (maxDist <= dist) + { + maxDist = dist; + } + } + + Span<RgbaColor8> palette = stackalloc RgbaColor8[numInterpolatedColors]; + + int distRange = Math.Max(1, maxDist - minDist); + + RgbaColor32 nV = new RgbaColor32(n); + + int bestErrorSum = int.MaxValue; + RgbaColor8 bestE0 = default; + RgbaColor8 bestE1 = default; + + Span<int> indices = stackalloc int[n]; + Span<RgbaColor32> colors = stackalloc RgbaColor32[n]; + + for (int maxIndex = numInterpolatedColorsMinus1; maxIndex >= 1; maxIndex--) + { + int sumX = 0; + int sumXX = 0; + int sumXXIncrement = 0; + + for (int i = 0; i < values.Length; i++) + { + RgbaColor32 color = values[i].GetColor32(); + + int dist = RgbaColor32.Dot(color, blockDir); + + int normalizedValue = ((dist - minDist) << 6) / distRange; + int texelIndex = (normalizedValue * maxIndex + 32) >> 6; + + indices[i] = texelIndex; + colors[i] = color; + + sumX += texelIndex; + sumXX += texelIndex * texelIndex; + sumXXIncrement += 1 + texelIndex * 2; + } + + for (int start = 0; start < numInterpolatedColors - maxIndex; start++) + { + RgbaColor32 sumY = new RgbaColor32(0); + RgbaColor32 sumXY = new RgbaColor32(0); + + for (int i = 0; i < indices.Length; i++) + { + RgbaColor32 y = colors[i]; + + sumY += y; + sumXY += new RgbaColor32(start + indices[i]) * y; + } + + RgbaColor32 sumXV = new RgbaColor32(sumX); + RgbaColor32 sumXXV = new RgbaColor32(sumXX); + RgbaColor32 m = RgbaColor32.DivideGuarded((nV * sumXY - sumXV * sumY) << 6, nV * sumXXV - sumXV * sumXV, 0); + RgbaColor32 b = ((sumY << 6) - m * sumXV) / nV; + + RgbaColor8 candidateE0 = (b >> 6).GetColor8(); + RgbaColor8 candidateE1 = ((b + m * new RgbaColor32(numInterpolatedColorsMinus1)) >> 6).GetColor8(); + + int pBit0 = GetPBit(candidateE0.ToUInt32(), colorDepth, alphaDepth); + int pBit1 = GetPBit(candidateE1.ToUInt32(), colorDepth, alphaDepth); + + int errorSum = BC67Utils.SelectIndices( + MemoryMarshal.Cast<RgbaColor8, uint>(values), + candidateE0.ToUInt32(), + candidateE1.ToUInt32(), + pBit0, + pBit1, + indexBitCount, + numInterpolatedColors, + colorDepth, + alphaDepth, + alphaMask); + + if (errorSum <= bestErrorSum) + { + bestErrorSum = errorSum; + bestE0 = candidateE0; + bestE1 = candidateE1; + } + + sumX += n; + sumXX += sumXXIncrement; + sumXXIncrement += 2 * n; + } + } + + return (bestE0, bestE1); + } + + private static int GetPBit(uint color, int colorDepth, int alphaDepth) + { + uint mask = 0x808080u >> colorDepth; + + if (alphaDepth != 0) + { + // If alpha is 0, let's assume the color information is not too important and prefer + // to preserve alpha instead. + if ((color >> 24) == 0) + { + return 0; + } + + mask |= 0x80000000u >> alphaDepth; + } + + color &= 0x7f7f7f7fu; + color += mask >> 1; + + int onesCount = BitOperations.PopCount(color & mask); + return onesCount >= 2 ? 1 : 0; + } + + private static int GetPBit(uint c0, uint c1, int colorDepth, int alphaDepth) + { + // Giving preference to the first endpoint yields better results, + // might be a side effect of the endpoint selection algorithm? + return GetPBit(c0, colorDepth, alphaDepth); + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs b/src/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs new file mode 100644 index 00000000..5734d301 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs @@ -0,0 +1,10 @@ +namespace Ryujinx.Graphics.Texture.Encoders +{ + enum EncodeMode + { + Fast, + Exhaustive, + ModeMask = 0xff, + Multithreaded = 1 << 8 + } +} diff --git a/src/Ryujinx.Graphics.Texture/LayoutConverter.cs b/src/Ryujinx.Graphics.Texture/LayoutConverter.cs new file mode 100644 index 00000000..09eaf300 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/LayoutConverter.cs @@ -0,0 +1,591 @@ +using Ryujinx.Common; +using System; +using System.Runtime.Intrinsics; +using static Ryujinx.Graphics.Texture.BlockLinearConstants; + +namespace Ryujinx.Graphics.Texture +{ + public static class LayoutConverter + { + public const int HostStrideAlignment = 4; + + public static void ConvertBlockLinearToLinear( + Span<byte> dst, + int width, + int height, + int stride, + int bytesPerPixel, + int gobBlocksInY, + ReadOnlySpan<byte> data) + { + int gobHeight = gobBlocksInY * GobHeight; + + int strideTrunc = BitUtils.AlignDown(width * bytesPerPixel, 16); + int strideTrunc64 = BitUtils.AlignDown(width * bytesPerPixel, 64); + + int xStart = strideTrunc / bytesPerPixel; + + int outStrideGap = stride - width * bytesPerPixel; + + int alignment = GobStride / bytesPerPixel; + + int wAligned = BitUtils.AlignUp(width, alignment); + + BlockLinearLayout layoutConverter = new BlockLinearLayout(wAligned, height, gobBlocksInY, 1, bytesPerPixel); + + unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged + { + fixed (byte* outputPtr = output, dataPtr = data) + { + byte* outPtr = outputPtr; + + for (int y = 0; y < height; y++) + { + layoutConverter.SetY(y); + + for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64) + { + byte* offset = dataPtr + layoutConverter.GetOffsetWithLineOffset64(x); + byte* offset2 = offset + 0x20; + byte* offset3 = offset + 0x100; + byte* offset4 = offset + 0x120; + + Vector128<byte> value = *(Vector128<byte>*)offset; + Vector128<byte> value2 = *(Vector128<byte>*)offset2; + Vector128<byte> value3 = *(Vector128<byte>*)offset3; + Vector128<byte> value4 = *(Vector128<byte>*)offset4; + + *(Vector128<byte>*)outPtr = value; + *(Vector128<byte>*)(outPtr + 16) = value2; + *(Vector128<byte>*)(outPtr + 32) = value3; + *(Vector128<byte>*)(outPtr + 48) = value4; + } + + for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16) + { + byte* offset = dataPtr + layoutConverter.GetOffsetWithLineOffset16(x); + + *(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset; + } + + for (int x = xStart; x < width; x++, outPtr += bytesPerPixel) + { + byte* offset = dataPtr + layoutConverter.GetOffset(x); + + *(T*)outPtr = *(T*)offset; + } + + outPtr += outStrideGap; + } + } + return true; + } + + bool _ = bytesPerPixel switch + { + 1 => Convert<byte>(dst, data), + 2 => Convert<ushort>(dst, data), + 4 => Convert<uint>(dst, data), + 8 => Convert<ulong>(dst, data), + 12 => Convert<Bpp12Pixel>(dst, data), + 16 => Convert<Vector128<byte>>(dst, data), + _ => throw new NotSupportedException($"Unable to convert ${bytesPerPixel} bpp pixel format.") + }; + } + + public static byte[] ConvertBlockLinearToLinear( + int width, + int height, + int depth, + int sliceDepth, + int levels, + int layers, + int blockWidth, + int blockHeight, + int bytesPerPixel, + int gobBlocksInY, + int gobBlocksInZ, + int gobBlocksInTileX, + SizeInfo sizeInfo, + ReadOnlySpan<byte> data) + { + int outSize = GetTextureSize( + width, + height, + sliceDepth, + levels, + layers, + blockWidth, + blockHeight, + bytesPerPixel); + + byte[] output = new byte[outSize]; + + int outOffs = 0; + + int mipGobBlocksInY = gobBlocksInY; + int mipGobBlocksInZ = gobBlocksInZ; + + int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX; + int gobHeight = gobBlocksInY * GobHeight; + + for (int level = 0; level < levels; level++) + { + int w = Math.Max(1, width >> level); + int h = Math.Max(1, height >> level); + int d = Math.Max(1, depth >> level); + + w = BitUtils.DivRoundUp(w, blockWidth); + h = BitUtils.DivRoundUp(h, blockHeight); + + while (h <= (mipGobBlocksInY >> 1) * GobHeight && mipGobBlocksInY != 1) + { + mipGobBlocksInY >>= 1; + } + + while (d <= (mipGobBlocksInZ >> 1) && mipGobBlocksInZ != 1) + { + mipGobBlocksInZ >>= 1; + } + + int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16); + int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64); + + int xStart = strideTrunc / bytesPerPixel; + + int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + + int outStrideGap = stride - w * bytesPerPixel; + + int alignment = gobWidth; + + if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight) + { + alignment = GobStride / bytesPerPixel; + } + + int wAligned = BitUtils.AlignUp(w, alignment); + + BlockLinearLayout layoutConverter = new BlockLinearLayout( + wAligned, + h, + mipGobBlocksInY, + mipGobBlocksInZ, + bytesPerPixel); + + int sd = Math.Max(1, sliceDepth >> level); + + unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged + { + fixed (byte* outputPtr = output, dataPtr = data) + { + byte* outPtr = outputPtr + outOffs; + for (int layer = 0; layer < layers; layer++) + { + byte* inBaseOffset = dataPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level)); + + for (int z = 0; z < sd; z++) + { + layoutConverter.SetZ(z); + for (int y = 0; y < h; y++) + { + layoutConverter.SetY(y); + + for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64) + { + byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x); + byte* offset2 = offset + 0x20; + byte* offset3 = offset + 0x100; + byte* offset4 = offset + 0x120; + + Vector128<byte> value = *(Vector128<byte>*)offset; + Vector128<byte> value2 = *(Vector128<byte>*)offset2; + Vector128<byte> value3 = *(Vector128<byte>*)offset3; + Vector128<byte> value4 = *(Vector128<byte>*)offset4; + + *(Vector128<byte>*)outPtr = value; + *(Vector128<byte>*)(outPtr + 16) = value2; + *(Vector128<byte>*)(outPtr + 32) = value3; + *(Vector128<byte>*)(outPtr + 48) = value4; + } + + for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16) + { + byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x); + + *(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset; + } + + for (int x = xStart; x < w; x++, outPtr += bytesPerPixel) + { + byte* offset = inBaseOffset + layoutConverter.GetOffset(x); + + *(T*)outPtr = *(T*)offset; + } + + outPtr += outStrideGap; + } + } + } + outOffs += stride * h * d * layers; + } + return true; + } + + bool _ = bytesPerPixel switch + { + 1 => Convert<byte>(output, data), + 2 => Convert<ushort>(output, data), + 4 => Convert<uint>(output, data), + 8 => Convert<ulong>(output, data), + 12 => Convert<Bpp12Pixel>(output, data), + 16 => Convert<Vector128<byte>>(output, data), + _ => throw new NotSupportedException($"Unable to convert ${bytesPerPixel} bpp pixel format.") + }; + } + return output; + } + + public static byte[] ConvertLinearStridedToLinear( + int width, + int height, + int blockWidth, + int blockHeight, + int lineSize, + int stride, + int bytesPerPixel, + ReadOnlySpan<byte> data) + { + int w = BitUtils.DivRoundUp(width, blockWidth); + int h = BitUtils.DivRoundUp(height, blockHeight); + + int outStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + lineSize = Math.Min(lineSize, outStride); + + byte[] output = new byte[h * outStride]; + Span<byte> outSpan = output; + + int outOffs = 0; + int inOffs = 0; + + for (int y = 0; y < h; y++) + { + data.Slice(inOffs, lineSize).CopyTo(outSpan.Slice(outOffs, lineSize)); + + inOffs += stride; + outOffs += outStride; + } + + return output; + } + + public static void ConvertLinearToBlockLinear( + Span<byte> dst, + int width, + int height, + int stride, + int bytesPerPixel, + int gobBlocksInY, + ReadOnlySpan<byte> data) + { + int gobHeight = gobBlocksInY * GobHeight; + + int strideTrunc = BitUtils.AlignDown(width * bytesPerPixel, 16); + int strideTrunc64 = BitUtils.AlignDown(width * bytesPerPixel, 64); + + int xStart = strideTrunc / bytesPerPixel; + + int inStrideGap = stride - width * bytesPerPixel; + + int alignment = GobStride / bytesPerPixel; + + int wAligned = BitUtils.AlignUp(width, alignment); + + BlockLinearLayout layoutConverter = new BlockLinearLayout(wAligned, height, gobBlocksInY, 1, bytesPerPixel); + + unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged + { + fixed (byte* outputPtr = output, dataPtr = data) + { + byte* inPtr = dataPtr; + + for (int y = 0; y < height; y++) + { + layoutConverter.SetY(y); + + for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64) + { + byte* offset = outputPtr + layoutConverter.GetOffsetWithLineOffset64(x); + byte* offset2 = offset + 0x20; + byte* offset3 = offset + 0x100; + byte* offset4 = offset + 0x120; + + Vector128<byte> value = *(Vector128<byte>*)inPtr; + Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16); + Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32); + Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48); + + *(Vector128<byte>*)offset = value; + *(Vector128<byte>*)offset2 = value2; + *(Vector128<byte>*)offset3 = value3; + *(Vector128<byte>*)offset4 = value4; + } + + for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16) + { + byte* offset = outputPtr + layoutConverter.GetOffsetWithLineOffset16(x); + + *(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr; + } + + for (int x = xStart; x < width; x++, inPtr += bytesPerPixel) + { + byte* offset = outputPtr + layoutConverter.GetOffset(x); + + *(T*)offset = *(T*)inPtr; + } + + inPtr += inStrideGap; + } + } + return true; + } + + bool _ = bytesPerPixel switch + { + 1 => Convert<byte>(dst, data), + 2 => Convert<ushort>(dst, data), + 4 => Convert<uint>(dst, data), + 8 => Convert<ulong>(dst, data), + 12 => Convert<Bpp12Pixel>(dst, data), + 16 => Convert<Vector128<byte>>(dst, data), + _ => throw new NotSupportedException($"Unable to convert ${bytesPerPixel} bpp pixel format.") + }; + } + + public static ReadOnlySpan<byte> ConvertLinearToBlockLinear( + Span<byte> output, + int width, + int height, + int depth, + int sliceDepth, + int levels, + int layers, + int blockWidth, + int blockHeight, + int bytesPerPixel, + int gobBlocksInY, + int gobBlocksInZ, + int gobBlocksInTileX, + SizeInfo sizeInfo, + ReadOnlySpan<byte> data) + { + if (output.Length == 0) + { + output = new byte[sizeInfo.TotalSize]; + } + + int inOffs = 0; + + int mipGobBlocksInY = gobBlocksInY; + int mipGobBlocksInZ = gobBlocksInZ; + + int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX; + int gobHeight = gobBlocksInY * GobHeight; + + for (int level = 0; level < levels; level++) + { + int w = Math.Max(1, width >> level); + int h = Math.Max(1, height >> level); + int d = Math.Max(1, depth >> level); + + w = BitUtils.DivRoundUp(w, blockWidth); + h = BitUtils.DivRoundUp(h, blockHeight); + + while (h <= (mipGobBlocksInY >> 1) * GobHeight && mipGobBlocksInY != 1) + { + mipGobBlocksInY >>= 1; + } + + while (d <= (mipGobBlocksInZ >> 1) && mipGobBlocksInZ != 1) + { + mipGobBlocksInZ >>= 1; + } + + int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16); + int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64); + + int xStart = strideTrunc / bytesPerPixel; + + int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + + int inStrideGap = stride - w * bytesPerPixel; + + int alignment = gobWidth; + + if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight) + { + alignment = GobStride / bytesPerPixel; + } + + int wAligned = BitUtils.AlignUp(w, alignment); + + BlockLinearLayout layoutConverter = new BlockLinearLayout( + wAligned, + h, + mipGobBlocksInY, + mipGobBlocksInZ, + bytesPerPixel); + + int sd = Math.Max(1, sliceDepth >> level); + + unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged + { + fixed (byte* outputPtr = output, dataPtr = data) + { + byte* inPtr = dataPtr + inOffs; + for (int layer = 0; layer < layers; layer++) + { + byte* outBaseOffset = outputPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level)); + + for (int z = 0; z < sd; z++) + { + layoutConverter.SetZ(z); + for (int y = 0; y < h; y++) + { + layoutConverter.SetY(y); + + for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64) + { + byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x); + byte* offset2 = offset + 0x20; + byte* offset3 = offset + 0x100; + byte* offset4 = offset + 0x120; + + Vector128<byte> value = *(Vector128<byte>*)inPtr; + Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16); + Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32); + Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48); + + *(Vector128<byte>*)offset = value; + *(Vector128<byte>*)offset2 = value2; + *(Vector128<byte>*)offset3 = value3; + *(Vector128<byte>*)offset4 = value4; + } + + for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16) + { + byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x); + + *(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr; + } + + for (int x = xStart; x < w; x++, inPtr += bytesPerPixel) + { + byte* offset = outBaseOffset + layoutConverter.GetOffset(x); + + *(T*)offset = *(T*)inPtr; + } + + inPtr += inStrideGap; + } + } + } + inOffs += stride * h * d * layers; + } + return true; + } + + bool _ = bytesPerPixel switch + { + 1 => Convert<byte>(output, data), + 2 => Convert<ushort>(output, data), + 4 => Convert<uint>(output, data), + 8 => Convert<ulong>(output, data), + 12 => Convert<Bpp12Pixel>(output, data), + 16 => Convert<Vector128<byte>>(output, data), + _ => throw new NotSupportedException($"Unable to convert ${bytesPerPixel} bpp pixel format.") + }; + } + + return output; + } + + public static ReadOnlySpan<byte> ConvertLinearToLinearStrided( + Span<byte> output, + int width, + int height, + int blockWidth, + int blockHeight, + int stride, + int bytesPerPixel, + ReadOnlySpan<byte> data) + { + int w = BitUtils.DivRoundUp(width, blockWidth); + int h = BitUtils.DivRoundUp(height, blockHeight); + + int inStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + int lineSize = width * bytesPerPixel; + + if (inStride == stride) + { + if (output.Length != 0) + { + data.CopyTo(output); + return output; + } + else + { + return data; + } + } + + if (output.Length == 0) + { + output = new byte[h * stride]; + } + + int inOffs = 0; + int outOffs = 0; + + for (int y = 0; y < h; y++) + { + data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize)); + + inOffs += inStride; + outOffs += stride; + } + + return output; + } + + private static int GetTextureSize( + int width, + int height, + int depth, + int levels, + int layers, + int blockWidth, + int blockHeight, + int bytesPerPixel) + { + int layerSize = 0; + + for (int level = 0; level < levels; level++) + { + int w = Math.Max(1, width >> level); + int h = Math.Max(1, height >> level); + int d = Math.Max(1, depth >> level); + + w = BitUtils.DivRoundUp(w, blockWidth); + h = BitUtils.DivRoundUp(h, blockHeight); + + int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + + layerSize += stride * h * d; + } + + return layerSize * layers; + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/OffsetCalculator.cs b/src/Ryujinx.Graphics.Texture/OffsetCalculator.cs new file mode 100644 index 00000000..d7472e2f --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/OffsetCalculator.cs @@ -0,0 +1,141 @@ +using Ryujinx.Common; +using System; +using System.Runtime.CompilerServices; +using static Ryujinx.Graphics.Texture.BlockLinearConstants; + +namespace Ryujinx.Graphics.Texture +{ + public class OffsetCalculator + { + private int _width; + private int _height; + private int _stride; + private bool _isLinear; + private int _bytesPerPixel; + + private BlockLinearLayout _layoutConverter; + + // Variables for built in iteration. + private int _yPart; + + public OffsetCalculator( + int width, + int height, + int stride, + bool isLinear, + int gobBlocksInY, + int gobBlocksInZ, + int bytesPerPixel) + { + _width = width; + _height = height; + _stride = stride; + _isLinear = isLinear; + _bytesPerPixel = bytesPerPixel; + + int wAlignment = GobStride / bytesPerPixel; + + int wAligned = BitUtils.AlignUp(width, wAlignment); + + if (!isLinear) + { + _layoutConverter = new BlockLinearLayout( + wAligned, + height, + gobBlocksInY, + gobBlocksInZ, + bytesPerPixel); + } + } + + public OffsetCalculator( + int width, + int height, + int stride, + bool isLinear, + int gobBlocksInY, + int bytesPerPixel) : this(width, height, stride, isLinear, gobBlocksInY, 1, bytesPerPixel) + { + } + + public void SetY(int y) + { + if (_isLinear) + { + _yPart = y * _stride; + } + else + { + _layoutConverter.SetY(y); + } + } + + public int GetOffset(int x, int y) + { + if (_isLinear) + { + return x * _bytesPerPixel + y * _stride; + } + else + { + return _layoutConverter.GetOffset(x, y, 0); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffset(int x) + { + if (_isLinear) + { + return x * _bytesPerPixel + _yPart; + } + else + { + return _layoutConverter.GetOffset(x); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffsetWithLineOffset64(int x) + { + if (_isLinear) + { + return x + _yPart; + } + else + { + return _layoutConverter.GetOffsetWithLineOffset64(x); + } + } + + public (int offset, int size) GetRectangleRange(int x, int y, int width, int height) + { + if (_isLinear) + { + int start = y * Math.Abs(_stride) + x * _bytesPerPixel; + int end = (y + height - 1) * Math.Abs(_stride) + (x + width) * _bytesPerPixel; + return (y * _stride + x * _bytesPerPixel, end - start); + } + else + { + return _layoutConverter.GetRectangleRange(x, y, width, height); + } + } + + public bool LayoutMatches(OffsetCalculator other) + { + if (_isLinear) + { + return other._isLinear && + _width == other._width && + _height == other._height && + _stride == other._stride && + _bytesPerPixel == other._bytesPerPixel; + } + else + { + return !other._isLinear && _layoutConverter.LayoutMatches(other._layoutConverter); + } + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/PixelConverter.cs b/src/Ryujinx.Graphics.Texture/PixelConverter.cs new file mode 100644 index 00000000..add25cd3 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/PixelConverter.cs @@ -0,0 +1,216 @@ +using Ryujinx.Common; +using System; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Texture +{ + public static class PixelConverter + { + private static (int remainder, int outRemainder, int height) GetLineRemainders(int length, int width, int bpp, int outBpp) + { + int stride = BitUtils.AlignUp(width * bpp, LayoutConverter.HostStrideAlignment); + int remainder = stride / bpp - width; + + int outStride = BitUtils.AlignUp(width * outBpp, LayoutConverter.HostStrideAlignment); + int outRemainder = outStride / outBpp - width; + + return (remainder, outRemainder, length / stride); + } + + public unsafe static byte[] ConvertR4G4ToR4G4B4A4(ReadOnlySpan<byte> data, int width) + { + byte[] output = new byte[data.Length * 2]; + + (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 1, 2); + + Span<ushort> outputSpan = MemoryMarshal.Cast<byte, ushort>(output); + + if (remainder == 0) + { + int start = 0; + + if (Sse41.IsSupported) + { + int sizeTrunc = data.Length & ~7; + start = sizeTrunc; + + fixed (byte* inputPtr = data, outputPtr = output) + { + for (ulong offset = 0; offset < (ulong)sizeTrunc; offset += 8) + { + Sse2.Store(outputPtr + offset * 2, Sse41.ConvertToVector128Int16(inputPtr + offset).AsByte()); + } + } + } + + for (int i = start; i < data.Length; i++) + { + outputSpan[i] = (ushort)data[i]; + } + } + else + { + int offset = 0; + int outOffset = 0; + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + outputSpan[outOffset++] = data[offset++]; + } + + offset += remainder; + outOffset += outRemainder; + } + } + + return output; + } + + public unsafe static byte[] ConvertR5G6B5ToR8G8B8A8(ReadOnlySpan<byte> data, int width) + { + byte[] output = new byte[data.Length * 2]; + int offset = 0; + int outOffset = 0; + + (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 2, 4); + + ReadOnlySpan<ushort> inputSpan = MemoryMarshal.Cast<byte, ushort>(data); + Span<uint> outputSpan = MemoryMarshal.Cast<byte, uint>(output); + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + uint packed = inputSpan[offset++]; + + uint outputPacked = 0xff000000; + outputPacked |= (packed << 3) & 0x000000f8; + outputPacked |= (packed << 8) & 0x00f80000; + + // Replicate 5 bit components. + outputPacked |= (outputPacked >> 5) & 0x00070007; + + // Include and replicate 6 bit component. + outputPacked |= ((packed << 5) & 0x0000fc00) | ((packed >> 1) & 0x00000300); + + outputSpan[outOffset++] = outputPacked; + } + + offset += remainder; + outOffset += outRemainder; + } + + return output; + } + + public unsafe static byte[] ConvertR5G5B5ToR8G8B8A8(ReadOnlySpan<byte> data, int width, bool forceAlpha) + { + byte[] output = new byte[data.Length * 2]; + int offset = 0; + int outOffset = 0; + + (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 2, 4); + + ReadOnlySpan<ushort> inputSpan = MemoryMarshal.Cast<byte, ushort>(data); + Span<uint> outputSpan = MemoryMarshal.Cast<byte, uint>(output); + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + uint packed = inputSpan[offset++]; + + uint a = forceAlpha ? 1 : (packed >> 15); + + uint outputPacked = a * 0xff000000; + outputPacked |= (packed << 3) & 0x000000f8; + outputPacked |= (packed << 6) & 0x0000f800; + outputPacked |= (packed << 9) & 0x00f80000; + + // Replicate 5 bit components. + outputPacked |= (outputPacked >> 5) & 0x00070707; + + outputSpan[outOffset++] = outputPacked; + } + + offset += remainder; + outOffset += outRemainder; + } + + return output; + } + + public unsafe static byte[] ConvertA1B5G5R5ToR8G8B8A8(ReadOnlySpan<byte> data, int width) + { + byte[] output = new byte[data.Length * 2]; + int offset = 0; + int outOffset = 0; + + (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 2, 4); + + ReadOnlySpan<ushort> inputSpan = MemoryMarshal.Cast<byte, ushort>(data); + Span<uint> outputSpan = MemoryMarshal.Cast<byte, uint>(output); + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + uint packed = inputSpan[offset++]; + + uint a = packed >> 15; + + uint outputPacked = a * 0xff000000; + outputPacked |= (packed >> 8) & 0x000000f8; + outputPacked |= (packed << 5) & 0x0000f800; + outputPacked |= (packed << 18) & 0x00f80000; + + // Replicate 5 bit components. + outputPacked |= (outputPacked >> 5) & 0x00070707; + + outputSpan[outOffset++] = outputPacked; + } + + offset += remainder; + outOffset += outRemainder; + } + + return output; + } + + public unsafe static byte[] ConvertR4G4B4A4ToR8G8B8A8(ReadOnlySpan<byte> data, int width) + { + byte[] output = new byte[data.Length * 2]; + int offset = 0; + int outOffset = 0; + + (int remainder, int outRemainder, int height) = GetLineRemainders(data.Length, width, 2, 4); + + ReadOnlySpan<ushort> inputSpan = MemoryMarshal.Cast<byte, ushort>(data); + Span<uint> outputSpan = MemoryMarshal.Cast<byte, uint>(output); + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + uint packed = inputSpan[offset++]; + + uint outputPacked = packed & 0x0000000f; + outputPacked |= (packed << 4) & 0x00000f00; + outputPacked |= (packed << 8) & 0x000f0000; + outputPacked |= (packed << 12) & 0x0f000000; + + outputSpan[outOffset++] = outputPacked * 0x11; + } + + offset += remainder; + outOffset += outRemainder; + } + + return output; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Region.cs b/src/Ryujinx.Graphics.Texture/Region.cs new file mode 100644 index 00000000..e59888a0 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Region.cs @@ -0,0 +1,14 @@ +namespace Ryujinx.Graphics.Texture +{ + public readonly struct Region + { + public int Offset { get; } + public int Size { get; } + + public Region(int offset, int size) + { + Offset = offset; + Size = size; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Ryujinx.Graphics.Texture.csproj b/src/Ryujinx.Graphics.Texture/Ryujinx.Graphics.Texture.csproj new file mode 100644 index 00000000..70e3453c --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Ryujinx.Graphics.Texture.csproj @@ -0,0 +1,11 @@ +<Project Sdk="Microsoft.NET.Sdk"> + <PropertyGroup> + <TargetFramework>net7.0</TargetFramework> + <AllowUnsafeBlocks>true</AllowUnsafeBlocks> + </PropertyGroup> + + <ItemGroup> + <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" /> + </ItemGroup> + +</Project> diff --git a/src/Ryujinx.Graphics.Texture/Size.cs b/src/Ryujinx.Graphics.Texture/Size.cs new file mode 100644 index 00000000..21c45b38 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Size.cs @@ -0,0 +1,16 @@ +namespace Ryujinx.Graphics.Texture +{ + public readonly struct Size + { + public int Width { get; } + public int Height { get; } + public int Depth { get; } + + public Size(int width, int height, int depth) + { + Width = width; + Height = height; + Depth = depth; + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/SizeCalculator.cs b/src/Ryujinx.Graphics.Texture/SizeCalculator.cs new file mode 100644 index 00000000..5568784f --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/SizeCalculator.cs @@ -0,0 +1,287 @@ +using Ryujinx.Common; +using System; + +using static Ryujinx.Graphics.Texture.BlockLinearConstants; + +namespace Ryujinx.Graphics.Texture +{ + public static class SizeCalculator + { + private const int StrideAlignment = 32; + + private static int Calculate3DOffsetCount(int levels, int depth) + { + int offsetCount = depth; + + while (--levels > 0) + { + depth = Math.Max(1, depth >> 1); + offsetCount += depth; + } + + return offsetCount; + } + + public static SizeInfo GetBlockLinearTextureSize( + int width, + int height, + int depth, + int levels, + int layers, + int blockWidth, + int blockHeight, + int bytesPerPixel, + int gobBlocksInY, + int gobBlocksInZ, + int gobBlocksInTileX, + int gpuLayerSize = 0) + { + bool is3D = depth > 1; + + int layerSize = 0; + + int[] allOffsets = new int[is3D ? Calculate3DOffsetCount(levels, depth) : levels * layers * depth]; + int[] mipOffsets = new int[levels]; + int[] sliceSizes = new int[levels]; + int[] levelSizes = new int[levels]; + + int mipGobBlocksInY = gobBlocksInY; + int mipGobBlocksInZ = gobBlocksInZ; + + int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX; + int gobHeight = gobBlocksInY * GobHeight; + + int depthLevelOffset = 0; + + for (int level = 0; level < levels; level++) + { + int w = Math.Max(1, width >> level); + int h = Math.Max(1, height >> level); + int d = Math.Max(1, depth >> level); + + w = BitUtils.DivRoundUp(w, blockWidth); + h = BitUtils.DivRoundUp(h, blockHeight); + + while (h <= (mipGobBlocksInY >> 1) * GobHeight && mipGobBlocksInY != 1) + { + mipGobBlocksInY >>= 1; + } + + while (d <= (mipGobBlocksInZ >> 1) && mipGobBlocksInZ != 1) + { + mipGobBlocksInZ >>= 1; + } + + int widthInGobs = BitUtils.DivRoundUp(w * bytesPerPixel, GobStride); + + int alignment = gobBlocksInTileX; + + if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight) + { + alignment = 1; + } + + widthInGobs = BitUtils.AlignUp(widthInGobs, alignment); + + int totalBlocksOfGobsInZ = BitUtils.DivRoundUp(d, mipGobBlocksInZ); + int totalBlocksOfGobsInY = BitUtils.DivRoundUp(BitUtils.DivRoundUp(h, GobHeight), mipGobBlocksInY); + + int robSize = widthInGobs * mipGobBlocksInY * mipGobBlocksInZ * GobSize; + + if (is3D) + { + int gobSize = mipGobBlocksInY * GobSize; + + int sliceSize = totalBlocksOfGobsInY * widthInGobs * gobSize; + + int baseOffset = layerSize; + + int mask = gobBlocksInZ - 1; + + for (int z = 0; z < d; z++) + { + int zLow = z & mask; + int zHigh = z & ~mask; + + allOffsets[z + depthLevelOffset] = baseOffset + zLow * gobSize + zHigh * sliceSize; + } + } + + mipOffsets[level] = layerSize; + sliceSizes[level] = totalBlocksOfGobsInY * robSize; + levelSizes[level] = totalBlocksOfGobsInZ * sliceSizes[level]; + + layerSize += levelSizes[level]; + + depthLevelOffset += d; + } + + if (layers > 1) + { + layerSize = AlignLayerSize( + layerSize, + height, + depth, + blockHeight, + gobBlocksInY, + gobBlocksInZ, + gobBlocksInTileX); + } + + int totalSize; + + if (layerSize < gpuLayerSize) + { + totalSize = (layers - 1) * gpuLayerSize + layerSize; + layerSize = gpuLayerSize; + } + else + { + totalSize = layerSize * layers; + } + + if (!is3D) + { + for (int layer = 0; layer < layers; layer++) + { + int baseIndex = layer * levels; + int baseOffset = layer * layerSize; + + for (int level = 0; level < levels; level++) + { + allOffsets[baseIndex + level] = baseOffset + mipOffsets[level]; + } + } + } + + return new SizeInfo(mipOffsets, allOffsets, sliceSizes, levelSizes, depth, levels, layerSize, totalSize, is3D); + } + + public static SizeInfo GetLinearTextureSize(int stride, int height, int blockHeight) + { + // Non-2D or mipmapped linear textures are not supported by the Switch GPU, + // so we only need to handle a single case (2D textures without mipmaps). + int totalSize = stride * BitUtils.DivRoundUp(height, blockHeight); + + return new SizeInfo(totalSize); + } + + private static int AlignLayerSize( + int size, + int height, + int depth, + int blockHeight, + int gobBlocksInY, + int gobBlocksInZ, + int gobBlocksInTileX) + { + if (gobBlocksInTileX < 2) + { + height = BitUtils.DivRoundUp(height, blockHeight); + + while (height <= (gobBlocksInY >> 1) * GobHeight && gobBlocksInY != 1) + { + gobBlocksInY >>= 1; + } + + while (depth <= (gobBlocksInZ >> 1) && gobBlocksInZ != 1) + { + gobBlocksInZ >>= 1; + } + + int blockOfGobsSize = gobBlocksInY * gobBlocksInZ * GobSize; + + int sizeInBlockOfGobs = size / blockOfGobsSize; + + if (size != sizeInBlockOfGobs * blockOfGobsSize) + { + size = (sizeInBlockOfGobs + 1) * blockOfGobsSize; + } + } + else + { + int alignment = (gobBlocksInTileX * GobSize) * gobBlocksInY * gobBlocksInZ; + + size = BitUtils.AlignUp(size, alignment); + } + + return size; + } + + public static Size GetBlockLinearAlignedSize( + int width, + int height, + int depth, + int blockWidth, + int blockHeight, + int bytesPerPixel, + int gobBlocksInY, + int gobBlocksInZ, + int gobBlocksInTileX) + { + width = BitUtils.DivRoundUp(width, blockWidth); + height = BitUtils.DivRoundUp(height, blockHeight); + + int gobWidth = (GobStride / bytesPerPixel) * gobBlocksInTileX; + int gobHeight = gobBlocksInY * GobHeight; + + int alignment = gobWidth; + + if (depth < gobBlocksInZ || width <= gobWidth || height <= gobHeight) + { + alignment = GobStride / bytesPerPixel; + } + + // Height has already been divided by block height, so pass it as 1. + (gobBlocksInY, gobBlocksInZ) = GetMipGobBlockSizes(height, depth, 1, gobBlocksInY, gobBlocksInZ); + + int blockOfGobsHeight = gobBlocksInY * GobHeight; + int blockOfGobsDepth = gobBlocksInZ; + + width = BitUtils.AlignUp(width, alignment); + height = BitUtils.AlignUp(height, blockOfGobsHeight); + depth = BitUtils.AlignUp(depth, blockOfGobsDepth); + + return new Size(width, height, depth); + } + + public static Size GetLinearAlignedSize( + int width, + int height, + int blockWidth, + int blockHeight, + int bytesPerPixel) + { + width = BitUtils.DivRoundUp(width, blockWidth); + height = BitUtils.DivRoundUp(height, blockHeight); + + int widthAlignment = StrideAlignment / bytesPerPixel; + + width = BitUtils.AlignUp(width, widthAlignment); + + return new Size(width, height, 1); + } + + public static (int, int) GetMipGobBlockSizes( + int height, + int depth, + int blockHeight, + int gobBlocksInY, + int gobBlocksInZ) + { + height = BitUtils.DivRoundUp(height, blockHeight); + + while (height <= (gobBlocksInY >> 1) * GobHeight && gobBlocksInY != 1) + { + gobBlocksInY >>= 1; + } + + while (depth <= (gobBlocksInZ >> 1) && gobBlocksInZ != 1) + { + gobBlocksInZ >>= 1; + } + + return (gobBlocksInY, gobBlocksInZ); + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/SizeInfo.cs b/src/Ryujinx.Graphics.Texture/SizeInfo.cs new file mode 100644 index 00000000..eb573728 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/SizeInfo.cs @@ -0,0 +1,119 @@ +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Texture +{ + public readonly struct SizeInfo + { + private readonly int[] _mipOffsets; + + private readonly int _levels; + private readonly int _depth; + private readonly bool _is3D; + + public readonly int[] AllOffsets; + public readonly int[] SliceSizes; + public readonly int[] LevelSizes; + public int LayerSize { get; } + public int TotalSize { get; } + + public SizeInfo(int size) + { + _mipOffsets = new int[] { 0 }; + AllOffsets = new int[] { 0 }; + SliceSizes = new int[] { size }; + LevelSizes = new int[] { size }; + _depth = 1; + _levels = 1; + LayerSize = size; + TotalSize = size; + _is3D = false; + } + + internal SizeInfo( + int[] mipOffsets, + int[] allOffsets, + int[] sliceSizes, + int[] levelSizes, + int depth, + int levels, + int layerSize, + int totalSize, + bool is3D) + { + _mipOffsets = mipOffsets; + AllOffsets = allOffsets; + SliceSizes = sliceSizes; + LevelSizes = levelSizes; + _depth = depth; + _levels = levels; + LayerSize = layerSize; + TotalSize = totalSize; + _is3D = is3D; + } + + public int GetMipOffset(int level) + { + if ((uint)level >= _mipOffsets.Length) + { + throw new ArgumentOutOfRangeException(nameof(level)); + } + + return _mipOffsets[level]; + } + + public bool FindView(int offset, out int firstLayer, out int firstLevel) + { + int index = Array.BinarySearch(AllOffsets, offset); + + if (index < 0) + { + firstLayer = 0; + firstLevel = 0; + + return false; + } + + if (_is3D) + { + firstLayer = index; + firstLevel = 0; + + int levelDepth = _depth; + + while (firstLayer >= levelDepth) + { + firstLayer -= levelDepth; + firstLevel++; + levelDepth = Math.Max(levelDepth >> 1, 1); + } + } + else + { + firstLayer = index / _levels; + firstLevel = index - (firstLayer * _levels); + } + + return true; + } + + public IEnumerable<Region> AllRegions() + { + if (_is3D) + { + for (int i = 0; i < _mipOffsets.Length; i++) + { + int maxSize = TotalSize - _mipOffsets[i]; + yield return new Region(_mipOffsets[i], Math.Min(maxSize, LevelSizes[i])); + } + } + else + { + for (int i = 0; i < AllOffsets.Length; i++) + { + yield return new Region(AllOffsets[i], SliceSizes[i % _levels]); + } + } + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs b/src/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs new file mode 100644 index 00000000..d890652c --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Utils/BC67Tables.cs @@ -0,0 +1,297 @@ +namespace Ryujinx.Graphics.Texture.Utils +{ + static class BC67Tables + { + public static readonly BC7ModeInfo[] BC7ModeInfos = new BC7ModeInfo[] + { + new BC7ModeInfo(3, 4, 6, 0, 0, 3, 0, 4, 0), + new BC7ModeInfo(2, 6, 2, 0, 0, 3, 0, 6, 0), + new BC7ModeInfo(3, 6, 0, 0, 0, 2, 0, 5, 0), + new BC7ModeInfo(2, 6, 4, 0, 0, 2, 0, 7, 0), + new BC7ModeInfo(1, 0, 0, 2, 1, 2, 3, 5, 6), + new BC7ModeInfo(1, 0, 0, 2, 0, 2, 2, 7, 8), + new BC7ModeInfo(1, 0, 2, 0, 0, 4, 0, 7, 7), + new BC7ModeInfo(2, 6, 4, 0, 0, 2, 0, 5, 5) + }; + + public static readonly byte[][] Weights = + { + new byte[] { 0, 21, 43, 64 }, + new byte[] { 0, 9, 18, 27, 37, 46, 55, 64 }, + new byte[] { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 } + }; + + public static readonly byte[][] InverseWeights = + { + new byte[] { 64, 43, 21, 0 }, + new byte[] { 64, 55, 46, 37, 27, 18, 9, 0 }, + new byte[] { 64, 60, 55, 51, 47, 43, 38, 34, 30, 26, 21, 17, 13, 9, 4, 0 } + }; + + public static readonly byte[][][] FixUpIndices = new byte[3][][] + { + new byte[64][] + { + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 } + }, + new byte[64][] + { + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 8, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 6, 0 }, new byte[] { 0, 8, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 6, 0 }, + new byte[] { 0, 6, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 6, 0 }, new byte[] { 0, 8, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 15, 0 } + }, + new byte[64][] + { + new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 15, 8 }, new byte[] { 0, 15, 3 }, + new byte[] { 0, 8, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 15, 8 }, + new byte[] { 0, 8, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 6, 15 }, new byte[] { 0, 6, 15 }, + new byte[] { 0, 6, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, + new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 15, 3 }, + new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 6, 15 }, new byte[] { 0, 10, 8 }, + new byte[] { 0, 5, 3 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 8, 6 }, new byte[] { 0, 6, 10 }, + new byte[] { 0, 8, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 15, 10 }, new byte[] { 0, 15, 8 }, + new byte[] { 0, 8, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 5, 10 }, + new byte[] { 0, 6, 10 }, new byte[] { 0, 10, 8 }, new byte[] { 0, 8, 9 }, new byte[] { 0, 15, 10 }, + new byte[] { 0, 15, 6 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 8 }, new byte[] { 0, 5, 15 }, + new byte[] { 0, 15, 3 }, new byte[] { 0, 15, 6 }, new byte[] { 0, 15, 6 }, new byte[] { 0, 15, 8 }, + new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 5, 15 }, + new byte[] { 0, 5, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 10, 15 }, + new byte[] { 0, 5, 15 }, new byte[] { 0, 10, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 13, 15 }, + new byte[] { 0, 15, 3 }, new byte[] { 0, 12, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 } + } + }; + + public static readonly byte[][][] PartitionTable = new byte[3][][] + { + new byte[64][] + { + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 0 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 1 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 2 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 3 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 4 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 5 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 6 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 7 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 8 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 9 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 10 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 11 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 12 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 13 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 14 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 15 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 16 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 17 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 18 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 19 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 20 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 21 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 22 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 23 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 24 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 25 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 26 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 27 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 28 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 29 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 30 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 31 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 32 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 33 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 34 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 35 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 36 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 37 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 38 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 39 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 40 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 41 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 42 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 43 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 44 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 45 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 46 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 47 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 48 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 49 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 50 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 51 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 52 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 53 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 54 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 55 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 56 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 57 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 58 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 59 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 60 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 61 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 62 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } // 63 + }, + new byte[64][] + { + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // 0 + new byte[16] { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, // 1 + new byte[16] { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, // 2 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // 3 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, // 4 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 5 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 6 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // 7 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, // 8 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 9 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 10 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, // 11 + new byte[16] { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 12 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, // 13 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 14 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, // 15 + new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, // 16 + new byte[16] { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // 17 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 }, // 18 + new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // 19 + new byte[16] { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // 20 + new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, // 21 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // 22 + new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, // 23 + new byte[16] { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // 24 + new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // 25 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, // 26 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, // 27 + new byte[16] { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, // 28 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // 29 + new byte[16] { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, // 30 + new byte[16] { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // 31 + new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, // 32 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, // 33 + new byte[16] { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, // 34 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, // 35 + new byte[16] { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, // 36 + new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, // 37 + new byte[16] { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, // 38 + new byte[16] { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, // 39 + new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, // 40 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, // 41 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, // 42 + new byte[16] { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, // 43 + new byte[16] { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, // 44 + new byte[16] { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, // 45 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, // 46 + new byte[16] { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, // 47 + new byte[16] { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, // 48 + new byte[16] { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, // 49 + new byte[16] { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, // 50 + new byte[16] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, // 51 + new byte[16] { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, // 52 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // 53 + new byte[16] { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // 54 + new byte[16] { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, // 55 + new byte[16] { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // 56 + new byte[16] { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, // 57 + new byte[16] { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, // 58 + new byte[16] { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, // 59 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // 60 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // 61 + new byte[16] { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, // 62 + new byte[16] { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 } // 63 + }, + new byte[64][] + { + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2 }, // 0 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1 }, // 1 + new byte[16] { 0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // 2 + new byte[16] { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1 }, // 3 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2 }, // 4 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2 }, // 5 + new byte[16] { 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1 }, // 6 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // 7 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 }, // 8 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2 }, // 9 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // 10 + new byte[16] { 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2 }, // 11 + new byte[16] { 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2 }, // 12 + new byte[16] { 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2 }, // 13 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // 14 + new byte[16] { 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0 }, // 15 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2 }, // 16 + new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0 }, // 17 + new byte[16] { 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 }, // 18 + new byte[16] { 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1 }, // 19 + new byte[16] { 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2 }, // 20 + new byte[16] { 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1 }, // 21 + new byte[16] { 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2 }, // 22 + new byte[16] { 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0 }, // 23 + new byte[16] { 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0 }, // 24 + new byte[16] { 0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2 }, // 25 + new byte[16] { 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0 }, // 26 + new byte[16] { 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1 }, // 27 + new byte[16] { 0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2 }, // 28 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2 }, // 29 + new byte[16] { 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1 }, // 30 + new byte[16] { 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1 }, // 31 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // 32 + new byte[16] { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1 }, // 33 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2 }, // 34 + new byte[16] { 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0 }, // 35 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0 }, // 36 + new byte[16] { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0 }, // 37 + new byte[16] { 0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0 }, // 38 + new byte[16] { 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1 }, // 39 + new byte[16] { 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1 }, // 40 + new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // 41 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1 }, // 42 + new byte[16] { 0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2 }, // 43 + new byte[16] { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1 }, // 44 + new byte[16] { 0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1 }, // 45 + new byte[16] { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1 }, // 46 + new byte[16] { 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }, // 47 + new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2 }, // 48 + new byte[16] { 0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1 }, // 49 + new byte[16] { 0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2 }, // 50 + new byte[16] { 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2 }, // 51 + new byte[16] { 0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2 }, // 52 + new byte[16] { 0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2 }, // 53 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2 }, // 54 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2 }, // 55 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2 }, // 56 + new byte[16] { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2 }, // 57 + new byte[16] { 0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2 }, // 58 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2 }, // 59 + new byte[16] { 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1 }, // 60 + new byte[16] { 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2 }, // 61 + new byte[16] { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, // 62 + new byte[16] { 0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0 } // 63 + } + }; + } +} diff --git a/src/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs b/src/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs new file mode 100644 index 00000000..e6c3f6e7 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Utils/BC67Utils.cs @@ -0,0 +1,1327 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Texture.Utils +{ + static class BC67Utils + { + private static byte[][] _quantizationLut; + private static byte[][] _quantizationLutNoPBit; + + static BC67Utils() + { + _quantizationLut = new byte[5][]; + _quantizationLutNoPBit = new byte[5][]; + + for (int depth = 4; depth < 9; depth++) + { + byte[] lut = new byte[512]; + byte[] lutNoPBit = new byte[256]; + + for (int i = 0; i < lut.Length; i++) + { + lut[i] = QuantizeComponentForLut((byte)i, depth, i >> 8); + + if (i < lutNoPBit.Length) + { + lutNoPBit[i] = QuantizeComponentForLut((byte)i, depth); + } + } + + _quantizationLut[depth - 4] = lut; + _quantizationLutNoPBit[depth - 4] = lutNoPBit; + } + } + + public static (RgbaColor8, RgbaColor8) GetMinMaxColors(ReadOnlySpan<uint> tile, int w, int h) + { + if (Sse41.IsSupported && w == 4 && h == 4) + { + GetMinMaxColorsOneSubset4x4Sse41(tile, out RgbaColor8 minColor, out RgbaColor8 maxColor); + + return (minColor, maxColor); + } + else + { + RgbaColor8 minColor = new RgbaColor8(255, 255, 255, 255); + RgbaColor8 maxColor = default; + + for (int i = 0; i < tile.Length; i++) + { + RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]); + + minColor.R = Math.Min(minColor.R, color.R); + minColor.G = Math.Min(minColor.G, color.G); + minColor.B = Math.Min(minColor.B, color.B); + minColor.A = Math.Min(minColor.A, color.A); + + maxColor.R = Math.Max(maxColor.R, color.R); + maxColor.G = Math.Max(maxColor.G, color.G); + maxColor.B = Math.Max(maxColor.B, color.B); + maxColor.A = Math.Max(maxColor.A, color.A); + } + + return (minColor, maxColor); + } + } + + public static void GetMinMaxColors( + ReadOnlySpan<byte> partitionTable, + ReadOnlySpan<uint> tile, + int w, + int h, + Span<RgbaColor8> minColors, + Span<RgbaColor8> maxColors, + int subsetCount) + { + if (Sse41.IsSupported && w == 4 && h == 4) + { + if (subsetCount == 1) + { + GetMinMaxColorsOneSubset4x4Sse41(tile, out minColors[0], out maxColors[0]); + return; + } + else if (subsetCount == 2) + { + GetMinMaxColorsTwoSubsets4x4Sse41(partitionTable, tile, minColors, maxColors); + return; + } + } + + minColors.Fill(new RgbaColor8(255, 255, 255, 255)); + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++) + { + int subset = partitionTable[ty * w + tx]; + RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++]); + + minColors[subset].R = Math.Min(minColors[subset].R, color.R); + minColors[subset].G = Math.Min(minColors[subset].G, color.G); + minColors[subset].B = Math.Min(minColors[subset].B, color.B); + minColors[subset].A = Math.Min(minColors[subset].A, color.A); + + maxColors[subset].R = Math.Max(maxColors[subset].R, color.R); + maxColors[subset].G = Math.Max(maxColors[subset].G, color.G); + maxColors[subset].B = Math.Max(maxColors[subset].B, color.B); + maxColors[subset].A = Math.Max(maxColors[subset].A, color.A); + } + } + } + + private static unsafe void GetMinMaxColorsOneSubset4x4Sse41(ReadOnlySpan<uint> tile, out RgbaColor8 minColor, out RgbaColor8 maxColor) + { + Vector128<byte> min = Vector128<byte>.AllBitsSet; + Vector128<byte> max = Vector128<byte>.Zero; + Vector128<byte> row0, row1, row2, row3; + + fixed (uint* pTile = tile) + { + row0 = Sse2.LoadVector128(pTile).AsByte(); + row1 = Sse2.LoadVector128(pTile + 4).AsByte(); + row2 = Sse2.LoadVector128(pTile + 8).AsByte(); + row3 = Sse2.LoadVector128(pTile + 12).AsByte(); + } + + min = Sse2.Min(min, row0); + max = Sse2.Max(max, row0); + min = Sse2.Min(min, row1); + max = Sse2.Max(max, row1); + min = Sse2.Min(min, row2); + max = Sse2.Max(max, row2); + min = Sse2.Min(min, row3); + max = Sse2.Max(max, row3); + + minColor = HorizontalMin(min); + maxColor = HorizontalMax(max); + } + + private static unsafe void GetMinMaxColorsTwoSubsets4x4Sse41( + ReadOnlySpan<byte> partitionTable, + ReadOnlySpan<uint> tile, + Span<RgbaColor8> minColors, + Span<RgbaColor8> maxColors) + { + Vector128<byte> partitionMask; + + fixed (byte* pPartitionTable = partitionTable) + { + partitionMask = Sse2.LoadVector128(pPartitionTable); + } + + Vector128<byte> subset0Mask = Sse2.CompareEqual(partitionMask, Vector128<byte>.Zero); + + Vector128<byte> subset0MaskRep16Low = Sse2.UnpackLow(subset0Mask, subset0Mask); + Vector128<byte> subset0MaskRep16High = Sse2.UnpackHigh(subset0Mask, subset0Mask); + + Vector128<byte> subset0Mask0 = Sse2.UnpackLow(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte(); + Vector128<byte> subset0Mask1 = Sse2.UnpackHigh(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte(); + Vector128<byte> subset0Mask2 = Sse2.UnpackLow(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte(); + Vector128<byte> subset0Mask3 = Sse2.UnpackHigh(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte(); + + Vector128<byte> min0 = Vector128<byte>.AllBitsSet; + Vector128<byte> min1 = Vector128<byte>.AllBitsSet; + Vector128<byte> max0 = Vector128<byte>.Zero; + Vector128<byte> max1 = Vector128<byte>.Zero; + + Vector128<byte> row0, row1, row2, row3; + + fixed (uint* pTile = tile) + { + row0 = Sse2.LoadVector128(pTile).AsByte(); + row1 = Sse2.LoadVector128(pTile + 4).AsByte(); + row2 = Sse2.LoadVector128(pTile + 8).AsByte(); + row3 = Sse2.LoadVector128(pTile + 12).AsByte(); + } + + min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row0, subset0Mask0)); + min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row1, subset0Mask1)); + min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row2, subset0Mask2)); + min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row3, subset0Mask3)); + + min1 = Sse2.Min(min1, Sse2.Or(row0, subset0Mask0)); + min1 = Sse2.Min(min1, Sse2.Or(row1, subset0Mask1)); + min1 = Sse2.Min(min1, Sse2.Or(row2, subset0Mask2)); + min1 = Sse2.Min(min1, Sse2.Or(row3, subset0Mask3)); + + max0 = Sse2.Max(max0, Sse2.And(row0, subset0Mask0)); + max0 = Sse2.Max(max0, Sse2.And(row1, subset0Mask1)); + max0 = Sse2.Max(max0, Sse2.And(row2, subset0Mask2)); + max0 = Sse2.Max(max0, Sse2.And(row3, subset0Mask3)); + + max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask0, row0)); + max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask1, row1)); + max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask2, row2)); + max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask3, row3)); + + minColors[0] = HorizontalMin(min0); + minColors[1] = HorizontalMin(min1); + maxColors[0] = HorizontalMax(max0); + maxColors[1] = HorizontalMax(max1); + } + + private static RgbaColor8 HorizontalMin(Vector128<byte> x) + { + x = Sse2.Min(x, Sse2.Shuffle(x.AsInt32(), 0x31).AsByte()); + x = Sse2.Min(x, Sse2.Shuffle(x.AsInt32(), 2).AsByte()); + return RgbaColor8.FromUInt32(x.AsUInt32().GetElement(0)); + } + + private static RgbaColor8 HorizontalMax(Vector128<byte> x) + { + x = Sse2.Max(x, Sse2.Shuffle(x.AsInt32(), 0x31).AsByte()); + x = Sse2.Max(x, Sse2.Shuffle(x.AsInt32(), 2).AsByte()); + return RgbaColor8.FromUInt32(x.AsUInt32().GetElement(0)); + } + + public static int SelectIndices( + ReadOnlySpan<uint> values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + if (Sse41.IsSupported) + { + if (indexBitCount == 2) + { + return Select2BitIndicesSse41( + values, + endPoint0, + endPoint1, + pBit0, + pBit1, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + alphaMask); + } + else if (indexBitCount == 3) + { + return Select3BitIndicesSse41( + values, + endPoint0, + endPoint1, + pBit0, + pBit1, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + alphaMask); + } + else if (indexBitCount == 4) + { + return Select4BitIndicesOneSubsetSse41( + values, + endPoint0, + endPoint1, + pBit0, + pBit1, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + alphaMask); + } + } + + return SelectIndicesFallback( + values, + endPoint0, + endPoint1, + pBit0, + pBit1, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + alphaMask); + } + + private static unsafe int Select2BitIndicesSse41( + ReadOnlySpan<uint> values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128<byte> rWeights; + Vector128<byte> lWeights; + + fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0]) + { + rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte(); + } + + Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights); + Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + + Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + + for (int i = 0; i < values.Length; i++) + { + uint c = values[i] | alphaMask; + + Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128<short> delta0 = Sse2.Subtract(color, pal0); + Vector128<short> delta1 = Sse2.Subtract(color, pal1); + + Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + + Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + + Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01); + + Vector128<ushort> min = Sse41.MinHorizontal(delta); + + ushort error = min.GetElement(0); + + errorSum += error; + } + + return errorSum; + } + + private static unsafe int Select3BitIndicesSse41( + ReadOnlySpan<uint> values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128<byte> rWeights; + Vector128<byte> lWeights; + + fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1]) + { + rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte(); + } + + Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights); + Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + + Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + + for (int i = 0; i < values.Length; i++) + { + uint c = values[i] | alphaMask; + + Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128<short> delta0 = Sse2.Subtract(color, pal0); + Vector128<short> delta1 = Sse2.Subtract(color, pal1); + Vector128<short> delta2 = Sse2.Subtract(color, pal2); + Vector128<short> delta3 = Sse2.Subtract(color, pal3); + + Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + + Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + + Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + + Vector128<ushort> min = Sse41.MinHorizontal(delta); + + ushort error = min.GetElement(0); + + errorSum += error; + } + + return errorSum; + } + + private static unsafe int Select4BitIndicesOneSubsetSse41( + ReadOnlySpan<uint> values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128<byte> rWeights; + Vector128<byte> lWeights; + + fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2]) + { + rWeights = Sse2.LoadVector128(pWeights); + lWeights = Sse2.LoadVector128(pInvWeights); + } + + Vector128<byte> iWeightsLow = Sse2.UnpackLow(lWeights, rWeights); + Vector128<byte> iWeightsHigh = Sse2.UnpackHigh(lWeights, rWeights); + Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); + Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); + Vector128<byte> iWeights45 = Sse2.UnpackLow(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); + Vector128<byte> iWeights67 = Sse2.UnpackHigh(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); + Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128<byte> iWeights4 = Sse2.UnpackLow(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); + Vector128<byte> iWeights5 = Sse2.UnpackHigh(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); + Vector128<byte> iWeights6 = Sse2.UnpackLow(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); + Vector128<byte> iWeights7 = Sse2.UnpackHigh(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); + + Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + Vector128<short> pal4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights4.AsSByte())); + Vector128<short> pal5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights5.AsSByte())); + Vector128<short> pal6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights6.AsSByte())); + Vector128<short> pal7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights7.AsSByte())); + + for (int i = 0; i < values.Length; i++) + { + uint c = values[i] | alphaMask; + + Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128<short> delta0 = Sse2.Subtract(color, pal0); + Vector128<short> delta1 = Sse2.Subtract(color, pal1); + Vector128<short> delta2 = Sse2.Subtract(color, pal2); + Vector128<short> delta3 = Sse2.Subtract(color, pal3); + Vector128<short> delta4 = Sse2.Subtract(color, pal4); + Vector128<short> delta5 = Sse2.Subtract(color, pal5); + Vector128<short> delta6 = Sse2.Subtract(color, pal6); + Vector128<short> delta7 = Sse2.Subtract(color, pal7); + + Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + Vector128<int> deltaSum4 = Sse2.MultiplyAddAdjacent(delta4, delta4); + Vector128<int> deltaSum5 = Sse2.MultiplyAddAdjacent(delta5, delta5); + Vector128<int> deltaSum6 = Sse2.MultiplyAddAdjacent(delta6, delta6); + Vector128<int> deltaSum7 = Sse2.MultiplyAddAdjacent(delta7, delta7); + + Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + Vector128<int> deltaSum45 = Ssse3.HorizontalAdd(deltaSum4, deltaSum5); + Vector128<int> deltaSum67 = Ssse3.HorizontalAdd(deltaSum6, deltaSum7); + + Vector128<ushort> delta0123 = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + Vector128<ushort> delta4567 = Sse41.PackUnsignedSaturate(deltaSum45, deltaSum67); + + Vector128<ushort> min0123 = Sse41.MinHorizontal(delta0123); + Vector128<ushort> min4567 = Sse41.MinHorizontal(delta4567); + + ushort minPos0123 = min0123.GetElement(0); + ushort minPos4567 = min4567.GetElement(0); + + if (minPos4567 < minPos0123) + { + errorSum += minPos4567; + } + else + { + errorSum += minPos0123; + } + } + + return errorSum; + } + + private static int SelectIndicesFallback( + ReadOnlySpan<uint> values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + int errorSum = 0; + + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + Span<uint> palette = stackalloc uint[indexCount]; + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette; + Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette; + + palette[0] = c0.ToUInt32(); + palette[indexCount - 1] = c1.ToUInt32(); + + for (int j = 1; j < indexCount - 1; j++) + { + palette[j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32(); + } + + for (int i = 0; i < values.Length; i++) + { + uint color = values[i] | alphaMask; + + int bestMatchScore = int.MaxValue; + int bestMatchIndex = 0; + + for (int j = 0; j < indexCount; j++) + { + int score = SquaredDifference( + RgbaColor8.FromUInt32(color).GetColor32(), + RgbaColor8.FromUInt32(palette[j]).GetColor32()); + + if (score < bestMatchScore) + { + bestMatchScore = score; + bestMatchIndex = j; + } + } + + errorSum += bestMatchScore; + } + + return errorSum; + } + + public static int SelectIndices( + ReadOnlySpan<uint> tile, + int w, + int h, + ReadOnlySpan<uint> endPoints0, + ReadOnlySpan<uint> endPoints1, + ReadOnlySpan<int> pBitValues, + Span<byte> indices, + int subsetCount, + int partition, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + if (Sse41.IsSupported) + { + if (indexBitCount == 2) + { + return Select2BitIndicesSse41( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + indices, + subsetCount, + partition, + colorDepth, + alphaDepth, + pBits, + alphaMask); + } + else if (indexBitCount == 3) + { + return Select3BitIndicesSse41( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + indices, + subsetCount, + partition, + colorDepth, + alphaDepth, + pBits, + alphaMask); + } + else if (indexBitCount == 4) + { + Debug.Assert(subsetCount == 1); + + return Select4BitIndicesOneSubsetSse41( + tile, + w, + h, + endPoints0[0], + endPoints1[0], + pBitValues, + indices, + partition, + colorDepth, + alphaDepth, + pBits, + alphaMask); + } + } + + return SelectIndicesFallback( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + indices, + subsetCount, + partition, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + pBits, + alphaMask); + } + + private static unsafe int Select2BitIndicesSse41( + ReadOnlySpan<uint> tile, + int w, + int h, + ReadOnlySpan<uint> endPoints0, + ReadOnlySpan<uint> endPoints1, + ReadOnlySpan<int> pBitValues, + Span<byte> indices, + int subsetCount, + int partition, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition]; + + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + for (int subset = 0; subset < subsetCount; subset++) + { + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1); + + Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128<byte> rWeights; + Vector128<byte> lWeights; + + fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0]) + { + rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte(); + } + + Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights); + Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + + Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++, i++) + { + int tileOffset = ty * 4 + tx; + if (partitionTable[tileOffset] != subset) + { + continue; + } + + uint c = tile[i] | alphaMask; + + Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128<short> delta0 = Sse2.Subtract(color, pal0); + Vector128<short> delta1 = Sse2.Subtract(color, pal1); + + Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + + Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + + Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01); + + Vector128<ushort> min = Sse41.MinHorizontal(delta); + + uint minPos = min.AsUInt32().GetElement(0); + ushort error = (ushort)minPos; + uint index = minPos >> 16; + + indices[tileOffset] = (byte)index; + errorSum += error; + } + } + } + + return errorSum; + } + + private static unsafe int Select3BitIndicesSse41( + ReadOnlySpan<uint> tile, + int w, + int h, + ReadOnlySpan<uint> endPoints0, + ReadOnlySpan<uint> endPoints1, + ReadOnlySpan<int> pBitValues, + Span<byte> indices, + int subsetCount, + int partition, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition]; + + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + for (int subset = 0; subset < subsetCount; subset++) + { + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1); + + Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128<byte> rWeights; + Vector128<byte> lWeights; + + fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1]) + { + rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte(); + } + + Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights); + Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + + Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++, i++) + { + int tileOffset = ty * 4 + tx; + if (partitionTable[tileOffset] != subset) + { + continue; + } + + uint c = tile[i] | alphaMask; + + Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128<short> delta0 = Sse2.Subtract(color, pal0); + Vector128<short> delta1 = Sse2.Subtract(color, pal1); + Vector128<short> delta2 = Sse2.Subtract(color, pal2); + Vector128<short> delta3 = Sse2.Subtract(color, pal3); + + Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + + Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + + Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + + Vector128<ushort> min = Sse41.MinHorizontal(delta); + + uint minPos = min.AsUInt32().GetElement(0); + ushort error = (ushort)minPos; + uint index = minPos >> 16; + + indices[tileOffset] = (byte)index; + errorSum += error; + } + } + } + + return errorSum; + } + + private static unsafe int Select4BitIndicesOneSubsetSse41( + ReadOnlySpan<uint> tile, + int w, + int h, + uint endPoint0, + uint endPoint1, + ReadOnlySpan<int> pBitValues, + Span<byte> indices, + int partition, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + int pBit0 = -1, pBit1 = -1; + + if (pBits != 0) + { + pBit0 = pBitValues[0]; + pBit1 = pBitValues[1]; + } + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128<byte> rWeights; + Vector128<byte> lWeights; + + fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2]) + { + rWeights = Sse2.LoadVector128(pWeights); + lWeights = Sse2.LoadVector128(pInvWeights); + } + + Vector128<byte> iWeightsLow = Sse2.UnpackLow(lWeights, rWeights); + Vector128<byte> iWeightsHigh = Sse2.UnpackHigh(lWeights, rWeights); + Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); + Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); + Vector128<byte> iWeights45 = Sse2.UnpackLow(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); + Vector128<byte> iWeights67 = Sse2.UnpackHigh(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); + Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128<byte> iWeights4 = Sse2.UnpackLow(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); + Vector128<byte> iWeights5 = Sse2.UnpackHigh(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); + Vector128<byte> iWeights6 = Sse2.UnpackLow(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); + Vector128<byte> iWeights7 = Sse2.UnpackHigh(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); + + Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + Vector128<short> pal4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights4.AsSByte())); + Vector128<short> pal5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights5.AsSByte())); + Vector128<short> pal6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights6.AsSByte())); + Vector128<short> pal7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights7.AsSByte())); + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++, i++) + { + uint c = tile[i] | alphaMask; + + Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128<short> delta0 = Sse2.Subtract(color, pal0); + Vector128<short> delta1 = Sse2.Subtract(color, pal1); + Vector128<short> delta2 = Sse2.Subtract(color, pal2); + Vector128<short> delta3 = Sse2.Subtract(color, pal3); + Vector128<short> delta4 = Sse2.Subtract(color, pal4); + Vector128<short> delta5 = Sse2.Subtract(color, pal5); + Vector128<short> delta6 = Sse2.Subtract(color, pal6); + Vector128<short> delta7 = Sse2.Subtract(color, pal7); + + Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + Vector128<int> deltaSum4 = Sse2.MultiplyAddAdjacent(delta4, delta4); + Vector128<int> deltaSum5 = Sse2.MultiplyAddAdjacent(delta5, delta5); + Vector128<int> deltaSum6 = Sse2.MultiplyAddAdjacent(delta6, delta6); + Vector128<int> deltaSum7 = Sse2.MultiplyAddAdjacent(delta7, delta7); + + Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + Vector128<int> deltaSum45 = Ssse3.HorizontalAdd(deltaSum4, deltaSum5); + Vector128<int> deltaSum67 = Ssse3.HorizontalAdd(deltaSum6, deltaSum7); + + Vector128<ushort> delta0123 = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + Vector128<ushort> delta4567 = Sse41.PackUnsignedSaturate(deltaSum45, deltaSum67); + + Vector128<ushort> min0123 = Sse41.MinHorizontal(delta0123); + Vector128<ushort> min4567 = Sse41.MinHorizontal(delta4567); + + uint minPos0123 = min0123.AsUInt32().GetElement(0); + uint minPos4567 = min4567.AsUInt32().GetElement(0); + + if ((ushort)minPos4567 < (ushort)minPos0123) + { + errorSum += (ushort)minPos4567; + indices[ty * 4 + tx] = (byte)(8 + (minPos4567 >> 16)); + } + else + { + errorSum += (ushort)minPos0123; + indices[ty * 4 + tx] = (byte)(minPos0123 >> 16); + } + } + } + + return errorSum; + } + + private static Vector128<short> ShiftRoundToNearest(Vector128<short> x) + { + return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6); + } + + private static int SelectIndicesFallback( + ReadOnlySpan<uint> tile, + int w, + int h, + ReadOnlySpan<uint> endPoints0, + ReadOnlySpan<uint> endPoints1, + ReadOnlySpan<int> pBitValues, + Span<byte> indices, + int subsetCount, + int partition, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + int errorSum = 0; + + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + Span<uint> palette = stackalloc uint[subsetCount * indexCount]; + + for (int subset = 0; subset < subsetCount; subset++) + { + int palBase = subset * indexCount; + + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1); + + Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette; + Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette; + + palette[palBase + 0] = c0.ToUInt32(); + palette[palBase + indexCount - 1] = c1.ToUInt32(); + + for (int j = 1; j < indexCount - 1; j++) + { + palette[palBase + j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32(); + } + } + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++) + { + int subset = BC67Tables.PartitionTable[subsetCount - 1][partition][ty * 4 + tx]; + uint color = tile[i++] | alphaMask; + + int bestMatchScore = int.MaxValue; + int bestMatchIndex = 0; + + for (int j = 0; j < indexCount; j++) + { + int score = SquaredDifference( + RgbaColor8.FromUInt32(color).GetColor32(), + RgbaColor8.FromUInt32(palette[subset * indexCount + j]).GetColor32()); + + if (score < bestMatchScore) + { + bestMatchScore = score; + bestMatchIndex = j; + } + } + + indices[ty * 4 + tx] = (byte)bestMatchIndex; + errorSum += bestMatchScore; + } + } + + return errorSum; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int SquaredDifference(RgbaColor32 color1, RgbaColor32 color2) + { + RgbaColor32 delta = color1 - color2; + return RgbaColor32.Dot(delta, delta); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor8 Interpolate(RgbaColor8 color1, RgbaColor8 color2, int weightIndex, int indexBitCount) + { + return Interpolate(color1.GetColor32(), color2.GetColor32(), weightIndex, indexBitCount).GetColor8(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 Interpolate(RgbaColor32 color1, RgbaColor32 color2, int weightIndex, int indexBitCount) + { + Debug.Assert(indexBitCount >= 2 && indexBitCount <= 4); + + int weight = (((weightIndex << 7) / ((1 << indexBitCount) - 1)) + 1) >> 1; + + RgbaColor32 weightV = new RgbaColor32(weight); + RgbaColor32 invWeightV = new RgbaColor32(64 - weight); + + return (color1 * invWeightV + color2 * weightV + new RgbaColor32(32)) >> 6; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 Interpolate( + RgbaColor32 color1, + RgbaColor32 color2, + int colorWeightIndex, + int alphaWeightIndex, + int colorIndexBitCount, + int alphaIndexBitCount) + { + Debug.Assert(colorIndexBitCount >= 2 && colorIndexBitCount <= 4); + Debug.Assert(alphaIndexBitCount >= 2 && alphaIndexBitCount <= 4); + + int colorWeight = BC67Tables.Weights[colorIndexBitCount - 2][colorWeightIndex]; + int alphaWeight = BC67Tables.Weights[alphaIndexBitCount - 2][alphaWeightIndex]; + + RgbaColor32 weightV = new RgbaColor32(colorWeight); + weightV.A = alphaWeight; + RgbaColor32 invWeightV = new RgbaColor32(64) - weightV; + + return (color1 * invWeightV + color2 * weightV + new RgbaColor32(32)) >> 6; + } + + public static RgbaColor8 Quantize(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1) + { + if (alphaBits == 0) + { + int colorShift = 8 - colorBits; + + uint c; + + if (pBit >= 0) + { + byte[] lutColor = _quantizationLut[colorBits - 4]; + + Debug.Assert(pBit <= 1); + int high = pBit << 8; + uint mask = (0xffu >> (colorBits + 1)) * 0x10101; + + c = lutColor[color.R | high]; + c |= (uint)lutColor[color.G | high] << 8; + c |= (uint)lutColor[color.B | high] << 16; + + c <<= colorShift; + c |= (c >> (colorBits + 1)) & mask; + c |= ((uint)pBit * 0x10101) << (colorShift - 1); + } + else + { + byte[] lutColor = _quantizationLutNoPBit[colorBits - 4]; + + uint mask = (0xffu >> colorBits) * 0x10101; + + c = lutColor[color.R]; + c |= (uint)lutColor[color.G] << 8; + c |= (uint)lutColor[color.B] << 16; + + c <<= colorShift; + c |= (c >> colorBits) & mask; + } + + c |= (uint)color.A << 24; + + return RgbaColor8.FromUInt32(c); + } + + return QuantizeFallback(color, colorBits, alphaBits, pBit); + } + + private static RgbaColor8 QuantizeFallback(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1) + { + byte r = UnquantizeComponent(QuantizeComponent(color.R, colorBits, pBit), colorBits, pBit); + byte g = UnquantizeComponent(QuantizeComponent(color.G, colorBits, pBit), colorBits, pBit); + byte b = UnquantizeComponent(QuantizeComponent(color.B, colorBits, pBit), colorBits, pBit); + byte a = alphaBits == 0 ? color.A : UnquantizeComponent(QuantizeComponent(color.A, alphaBits, pBit), alphaBits, pBit); + return new RgbaColor8(r, g, b, a); + } + + public static byte QuantizeComponent(byte component, int bits, int pBit = -1) + { + return pBit >= 0 ? _quantizationLut[bits - 4][component | (pBit << 8)] : _quantizationLutNoPBit[bits - 4][component]; + } + + private static byte QuantizeComponentForLut(byte component, int bits, int pBit = -1) + { + int shift = 8 - bits; + int fill = component >> bits; + + if (pBit >= 0) + { + Debug.Assert(pBit <= 1); + fill >>= 1; + fill |= pBit << (shift - 1); + } + + int q1 = component >> shift; + int q2 = Math.Max(q1 - 1, 0); + int q3 = Math.Min(q1 + 1, (1 << bits) - 1); + + int delta1 = FastAbs(((q1 << shift) | fill) - component); + int delta2 = component - ((q2 << shift) | fill); + int delta3 = ((q3 << shift) | fill) - component; + + if (delta1 < delta2 && delta1 < delta3) + { + return (byte)q1; + } + else if (delta2 < delta3) + { + return (byte)q2; + } + else + { + return (byte)q3; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int FastAbs(int x) + { + int sign = x >> 31; + return (x + sign) ^ sign; + } + + private static byte UnquantizeComponent(byte component, int bits, int pBit) + { + int shift = 8 - bits; + int value = component << shift; + + if (pBit >= 0) + { + Debug.Assert(pBit <= 1); + value |= value >> (bits + 1); + value |= pBit << (shift - 1); + } + else + { + value |= value >> bits; + } + + return (byte)value; + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs b/src/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs new file mode 100644 index 00000000..687df22c --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Utils/BC7ModeInfo.cs @@ -0,0 +1,37 @@ +namespace Ryujinx.Graphics.Texture.Utils +{ + readonly struct BC7ModeInfo + { + public readonly int SubsetCount; + public readonly int PartitionBitCount; + public readonly int PBits; + public readonly int RotationBitCount; + public readonly int IndexModeBitCount; + public readonly int ColorIndexBitCount; + public readonly int AlphaIndexBitCount; + public readonly int ColorDepth; + public readonly int AlphaDepth; + + public BC7ModeInfo( + int subsetCount, + int partitionBitsCount, + int pBits, + int rotationBitCount, + int indexModeBitCount, + int colorIndexBitCount, + int alphaIndexBitCount, + int colorDepth, + int alphaDepth) + { + SubsetCount = subsetCount; + PartitionBitCount = partitionBitsCount; + PBits = pBits; + RotationBitCount = rotationBitCount; + IndexModeBitCount = indexModeBitCount; + ColorIndexBitCount = colorIndexBitCount; + AlphaIndexBitCount = alphaIndexBitCount; + ColorDepth = colorDepth; + AlphaDepth = alphaDepth; + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/Utils/Block.cs b/src/Ryujinx.Graphics.Texture/Utils/Block.cs new file mode 100644 index 00000000..a8bae077 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Utils/Block.cs @@ -0,0 +1,55 @@ +namespace Ryujinx.Graphics.Texture.Utils +{ + struct Block + { + public ulong Low; + public ulong High; + + public void Encode(ulong value, ref int offset, int bits) + { + if (offset >= 64) + { + High |= value << (offset - 64); + } + else + { + Low |= value << offset; + + if (offset + bits > 64) + { + int remainder = 64 - offset; + High |= value >> remainder; + } + } + + offset += bits; + } + + public ulong Decode(ref int offset, int bits) + { + ulong value; + ulong mask = bits == 64 ? ulong.MaxValue : (1UL << bits) - 1; + + if (offset >= 64) + { + value = (High >> (offset - 64)) & mask; + } + else + { + value = Low >> offset; + + if (offset + bits > 64) + { + int remainder = 64 - offset; + value |= High << remainder; + } + + value &= mask; + } + + offset += bits; + + return value; + } + } +}
\ No newline at end of file diff --git a/src/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs b/src/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs new file mode 100644 index 00000000..582044d9 --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Utils/RgbaColor32.cs @@ -0,0 +1,229 @@ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Texture.Utils +{ + struct RgbaColor32 : IEquatable<RgbaColor32> + { + private Vector128<int> _color; + + public int R + { + get => _color.GetElement(0); + set => _color = _color.WithElement(0, value); + } + + public int G + { + get => _color.GetElement(1); + set => _color = _color.WithElement(1, value); + } + + public int B + { + get => _color.GetElement(2); + set => _color = _color.WithElement(2, value); + } + + public int A + { + get => _color.GetElement(3); + set => _color = _color.WithElement(3, value); + } + + public RgbaColor32(Vector128<int> color) + { + _color = color; + } + + public RgbaColor32(int r, int g, int b, int a) + { + _color = Vector128.Create(r, g, b, a); + } + + public RgbaColor32(int scalar) + { + _color = Vector128.Create(scalar); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator +(RgbaColor32 x, RgbaColor32 y) + { + if (Sse2.IsSupported) + { + return new RgbaColor32(Sse2.Add(x._color, y._color)); + } + else + { + return new RgbaColor32(x.R + y.R, x.G + y.G, x.B + y.B, x.A + y.A); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator -(RgbaColor32 x, RgbaColor32 y) + { + if (Sse2.IsSupported) + { + return new RgbaColor32(Sse2.Subtract(x._color, y._color)); + } + else + { + return new RgbaColor32(x.R - y.R, x.G - y.G, x.B - y.B, x.A - y.A); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator *(RgbaColor32 x, RgbaColor32 y) + { + if (Sse41.IsSupported) + { + return new RgbaColor32(Sse41.MultiplyLow(x._color, y._color)); + } + else + { + return new RgbaColor32(x.R * y.R, x.G * y.G, x.B * y.B, x.A * y.A); + } + } + + public static RgbaColor32 operator /(RgbaColor32 x, RgbaColor32 y) + { + return new RgbaColor32(x.R / y.R, x.G / y.G, x.B / y.B, x.A / y.A); + } + + public static RgbaColor32 DivideGuarded(RgbaColor32 x, RgbaColor32 y, int resultIfZero) + { + return new RgbaColor32( + DivideGuarded(x.R, y.R, resultIfZero), + DivideGuarded(x.G, y.G, resultIfZero), + DivideGuarded(x.B, y.B, resultIfZero), + DivideGuarded(x.A, y.A, resultIfZero)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator <<(RgbaColor32 x, int shift) + { + if (Sse2.IsSupported) + { + return new RgbaColor32(Sse2.ShiftLeftLogical(x._color, (byte)shift)); + } + else + { + return new RgbaColor32(x.R << shift, x.G << shift, x.B << shift, x.A << shift); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator >>(RgbaColor32 x, int shift) + { + if (Sse2.IsSupported) + { + return new RgbaColor32(Sse2.ShiftRightLogical(x._color, (byte)shift)); + } + else + { + return new RgbaColor32(x.R >> shift, x.G >> shift, x.B >> shift, x.A >> shift); + } + } + + public static bool operator ==(RgbaColor32 x, RgbaColor32 y) + { + return x.Equals(y); + } + + public static bool operator !=(RgbaColor32 x, RgbaColor32 y) + { + return !x.Equals(y); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Dot(RgbaColor32 x, RgbaColor32 y) + { + if (Sse41.IsSupported) + { + Vector128<int> product = Sse41.MultiplyLow(x._color, y._color); + Vector128<int> sum = Ssse3.HorizontalAdd(product, product); + sum = Ssse3.HorizontalAdd(sum, sum); + return sum.GetElement(0); + } + else + { + return x.R * y.R + x.G * y.G + x.B * y.B + x.A * y.A; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 Max(RgbaColor32 x, RgbaColor32 y) + { + if (Sse41.IsSupported) + { + return new RgbaColor32(Sse41.Max(x._color, y._color)); + } + else + { + return new RgbaColor32(Math.Max(x.R, y.R), Math.Max(x.G, y.G), Math.Max(x.B, y.B), Math.Max(x.A, y.A)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 Min(RgbaColor32 x, RgbaColor32 y) + { + if (Sse41.IsSupported) + { + return new RgbaColor32(Sse41.Min(x._color, y._color)); + } + else + { + return new RgbaColor32(Math.Min(x.R, y.R), Math.Min(x.G, y.G), Math.Min(x.B, y.B), Math.Min(x.A, y.A)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public RgbaColor8 GetColor8() + { + if (Sse41.IsSupported) + { + Vector128<int> temp = _color; + Vector128<ushort> color16 = Sse41.PackUnsignedSaturate(temp, temp); + Vector128<byte> color8 = Sse2.PackUnsignedSaturate(color16.AsInt16(), color16.AsInt16()); + uint color = color8.AsUInt32().GetElement(0); + return Unsafe.As<uint, RgbaColor8>(ref color); + } + else + { + return new RgbaColor8(ClampByte(R), ClampByte(G), ClampByte(B), ClampByte(A)); + } + } + + private static int DivideGuarded(int dividend, int divisor, int resultIfZero) + { + if (divisor == 0) + { + return resultIfZero; + } + + return dividend / divisor; + } + + private static byte ClampByte(int value) + { + return (byte)Math.Clamp(value, 0, 255); + } + + public override int GetHashCode() + { + return HashCode.Combine(R, G, B, A); + } + + public override bool Equals(object obj) + { + return obj is RgbaColor32 other && Equals(other); + } + + public bool Equals(RgbaColor32 other) + { + return _color.Equals(other._color); + } + } +} diff --git a/src/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs b/src/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs new file mode 100644 index 00000000..0edf1cce --- /dev/null +++ b/src/Ryujinx.Graphics.Texture/Utils/RgbaColor8.cs @@ -0,0 +1,84 @@ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Texture.Utils +{ + struct RgbaColor8 : IEquatable<RgbaColor8> + { + public byte R; + public byte G; + public byte B; + public byte A; + + public RgbaColor8(byte r, byte g, byte b, byte a) + { + R = r; + G = g; + B = b; + A = a; + } + + public static RgbaColor8 FromUInt32(uint color) + { + return Unsafe.As<uint, RgbaColor8>(ref color); + } + + public static bool operator ==(RgbaColor8 x, RgbaColor8 y) + { + return x.Equals(y); + } + + public static bool operator !=(RgbaColor8 x, RgbaColor8 y) + { + return !x.Equals(y); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public RgbaColor32 GetColor32() + { + if (Sse41.IsSupported) + { + Vector128<byte> color = Vector128.CreateScalarUnsafe(Unsafe.As<RgbaColor8, uint>(ref this)).AsByte(); + return new RgbaColor32(Sse41.ConvertToVector128Int32(color)); + } + else + { + return new RgbaColor32(R, G, B, A); + } + } + + public uint ToUInt32() + { + return Unsafe.As<RgbaColor8, uint>(ref this); + } + + public override int GetHashCode() + { + return HashCode.Combine(R, G, B, A); + } + + public override bool Equals(object obj) + { + return obj is RgbaColor8 other && Equals(other); + } + + public bool Equals(RgbaColor8 other) + { + return R == other.R && G == other.G && B == other.B && A == other.A; + } + + public byte GetComponent(int index) + { + return index switch + { + 0 => R, + 1 => G, + 2 => B, + 3 => A, + _ => throw new ArgumentOutOfRangeException(nameof(index)) + }; + } + } +} |
