aboutsummaryrefslogtreecommitdiff
path: root/Ryujinx.Graphics.Nvdec.Vp9/Dsp
diff options
context:
space:
mode:
author gdkchan <gab.dark.100@gmail.com> 2020-07-12 00:07:01 -0300
committer GitHub <noreply@github.com> 2020-07-12 05:07:01 +0200
commit 4d02a2d2c0451b4de1f6de3bbce54c457cacebe2 (patch)
tree 120fe4fb8cfa1ac1c6ef4c97d92be47e955e8c0f /Ryujinx.Graphics.Nvdec.Vp9/Dsp
parent 38b26cf4242999fa7d8c550993ac0940cd03d55f (diff)
New NVDEC and VIC implementation (#1384)
* Initial NVDEC and VIC implementation * Update FFmpeg.AutoGen to 4.3.0 * Add nvdec dependencies for Windows * Unify some VP9 structures * Rename VP9 structure fields * Improvements to Video API * XML docs for Common.Memory * Remove now unused or redundant overloads from MemoryAccessor * NVDEC UV surface read/write scalar paths * Add FIXME comments about hacky things/stuff that will need to be fixed in the future * Cleaned up VP9 memory allocation * Remove some debug logs * Rename some VP9 structs * Remove unused struct * No need to compile Ryujinx.Graphics.Host1x with unsafe anymore * Name AsyncWorkQueue threads to make debugging easier * Make Vp9PictureInfo a ref struct * LayoutConverter no longer needs the depth argument (broken by rebase) * Pooling of VP9 buffers, plus fix a memory leak on VP9 * Really wish VS could rename projects properly... * Address feedback * Remove using * Catch OperationCanceledException * Add licensing informations * Add THIRDPARTY.md to release too Co-authored-by: Thog <me@thog.eu>
Diffstat (limited to 'Ryujinx.Graphics.Nvdec.Vp9/Dsp')
-rw-r--r-- Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs 949
-rw-r--r-- Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs 12
-rw-r--r-- Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs 1379
-rw-r--r-- Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs 2868
-rw-r--r-- Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs 73
-rw-r--r-- Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs 237
-rw-r--r-- Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs 54
7 files changed, 5572 insertions, 0 deletions
diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs
new file mode 100644
index 00000000..b74c33dc
--- /dev/null
+++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs
@@ -0,0 +1,949 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+ internal static class Convolve
+ {
+ private const bool UseIntrinsics = true;
+
+ /// <summary>
+ /// Computes four 8-element dot products against one shared filter vector and
+ /// packs the four 32-bit sums into a single vector.
+ /// </summary>
+ /// <param name="vsrc0">Eight 16-bit samples whose sum ends up in element 0.</param>
+ /// <param name="vsrc1">Eight 16-bit samples whose sum ends up in element 1.</param>
+ /// <param name="vsrc2">Eight 16-bit samples whose sum ends up in element 2.</param>
+ /// <param name="vsrc3">Eight 16-bit samples whose sum ends up in element 3.</param>
+ /// <param name="vfilter">Eight 16-bit coefficients applied to every source vector.</param>
+ /// <param name="zero">All-zero vector supplied by the caller.</param>
+ /// <returns>The four dot products, in source order starting at element 0.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<int> MultiplyAddAdjacent(
+     Vector128<short> vsrc0,
+     Vector128<short> vsrc1,
+     Vector128<short> vsrc2,
+     Vector128<short> vsrc3,
+     Vector128<short> vfilter,
+     Vector128<int> zero)
+ {
+     // The pairwise multiply-add leaves four partial sums per source vector;
+     // two horizontal adds against zero fold them into element 0.
+     Vector128<int> dot0 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc0, vfilter), zero), zero);
+     Vector128<int> dot1 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc1, vfilter), zero), zero);
+     Vector128<int> dot2 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc2, vfilter), zero), zero);
+     Vector128<int> dot3 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc3, vfilter), zero), zero);
+
+     // Interleave the low elements: < 0, 0, dot1, dot0 > and < 0, 0, dot3, dot2 >.
+     Vector128<float> lowPair = Sse2.UnpackLow(dot0, dot1).AsSingle();
+     Vector128<float> highPair = Sse2.UnpackLow(dot2, dot3).AsSingle();
+
+     // Low half of highPair becomes the high half: < dot3, dot2, dot1, dot0 >.
+     return Sse.MoveLowToHigh(lowPair, highPair).AsInt32();
+ }
+
+ /// <summary>
+ /// Adds <paramref name="const64"/> to every element of <paramref name="value"/> and then
+ /// arithmetic-shifts each element right by FilterBits.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<int> RoundShift(Vector128<int> value, Vector128<int> const64)
+ {
+     Vector128<int> biased = Sse2.Add(value, const64);
+
+     return Sse2.ShiftRightArithmetic(biased, FilterBits);
+ }
+
+ /// <summary>
+ /// Narrows four 32-bit values to bytes with unsigned saturation (32 -> 16 -> 8 bits).
+ /// The four results occupy the lowest four bytes; the remaining bytes come from <paramref name="zero"/>.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
+ {
+     Vector128<short> words = Sse41.PackUnsignedSaturate(value, zero).AsInt16();
+
+     return Sse2.PackUnsignedSaturate(words, zero.AsInt16());
+ }
+
+ /// <summary>
+ /// SSE4.1 horizontal 8-tap filter for the unscaled case: a single kernel, selected by the
+ /// fractional bits of <paramref name="x0Q4"/>, is applied to the whole block.
+ /// </summary>
+ /// <remarks>
+ /// Four output pixels are produced and stored per inner iteration, so every row is written
+ /// in groups of four bytes starting at column 0.
+ /// </remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConvolveHorizSse41(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] xFilters,
+     int x0Q4,
+     int w,
+     int h)
+ {
+     Vector128<int> vzero = Vector128<int>.Zero;
+     Vector128<int> vround = Vector128.Create(64);
+
+     ulong width = (uint)w;
+     ulong height = (uint)h;
+
+     // Whole-pixel part of the start position; it does not change from row to row.
+     ulong firstTap = (uint)x0Q4 >> SubpelBits;
+
+     // Step back so that the taps straddle the sample position.
+     src -= SubpelTaps / 2 - 1;
+
+     fixed (Array8<short>* filterTable = xFilters)
+     {
+         // Each table entry holds eight 16-bit coefficients.
+         Vector128<short> vcoeffs = Sse2.LoadVector128((short*)filterTable + (uint)(x0Q4 & SubpelMask) * 8);
+
+         for (ulong row = 0; row < height; ++row)
+         {
+             for (ulong col = 0; col < width; col += 4)
+             {
+                 // Zero-extend eight source bytes for each of four neighbouring outputs.
+                 Vector128<short> taps0 = Sse41.ConvertToVector128Int16(&src[firstTap + col]);
+                 Vector128<short> taps1 = Sse41.ConvertToVector128Int16(&src[firstTap + col + 1]);
+                 Vector128<short> taps2 = Sse41.ConvertToVector128Int16(&src[firstTap + col + 2]);
+                 Vector128<short> taps3 = Sse41.ConvertToVector128Int16(&src[firstTap + col + 3]);
+
+                 Vector128<int> sums = MultiplyAddAdjacent(taps0, taps1, taps2, taps3, vcoeffs, vzero);
+                 Vector128<byte> pixels = PackUnsignedSaturate(RoundShift(sums, vround), vzero);
+
+                 // A scalar float store writes exactly the four packed pixels.
+                 Sse.StoreScalar((float*)&dst[col], pixels.AsSingle());
+             }
+
+             src += srcStride;
+             dst += dstStride;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Horizontal 8-tap sub-pixel filter. Writes a <paramref name="w"/> x <paramref name="h"/> block to <paramref name="dst"/>.
+ /// </summary>
+ /// <param name="src">Pointer to the source sample aligned with the first output pixel.</param>
+ /// <param name="srcStride">Distance in bytes between source rows.</param>
+ /// <param name="dst">Pointer to the first output pixel.</param>
+ /// <param name="dstStride">Distance in bytes between output rows.</param>
+ /// <param name="xFilters">Kernel table indexed by the fractional position (position &amp; SubpelMask).</param>
+ /// <param name="x0Q4">Start position; the low SubpelBits bits are the fraction.</param>
+ /// <param name="xStepQ4">Position increment per output pixel, in the same units as <paramref name="x0Q4"/>.</param>
+ /// <param name="w">Output width in pixels.</param>
+ /// <param name="h">Output height in pixels.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConvolveHoriz(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] xFilters,
+     int x0Q4,
+     int xStepQ4,
+     int w,
+     int h)
+ {
+     // The SIMD path handles only a step of exactly one pixel and stores four
+     // pixels at a time. Any width that is not a multiple of four uses the scalar
+     // loop, so no bytes past column w - 1 are ever written.
+     if (Sse41.IsSupported && UseIntrinsics && xStepQ4 == 1 << SubpelBits && (w & 3) == 0)
+     {
+         ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);
+         return;
+     }
+
+     // Step back so that the taps straddle the sample position.
+     src -= SubpelTaps / 2 - 1;
+
+     for (int y = 0; y < h; ++y)
+     {
+         int xQ4 = x0Q4;
+
+         for (int x = 0; x < w; ++x)
+         {
+             // Whole-pixel part picks the first tap, fractional part picks the kernel.
+             byte* srcX = &src[xQ4 >> SubpelBits];
+             ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
+
+             int sum = 0;
+             for (int k = 0; k < SubpelTaps; ++k)
+             {
+                 sum += srcX[k] * xFilter[k];
+             }
+
+             dst[x] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
+             xQ4 += xStepQ4;
+         }
+
+         src += srcStride;
+         dst += dstStride;
+     }
+ }
+
+ /// <summary>
+ /// Horizontal 8-tap filter whose clipped result is combined with the value already stored in
+ /// <paramref name="dst"/> via BitUtils.RoundPowerOfTwo(sum of both, 1).
+ /// </summary>
+ private static unsafe void ConvolveAvgHoriz(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] xFilters,
+     int x0Q4,
+     int xStepQ4,
+     int w,
+     int h)
+ {
+     // Step back so that the taps straddle the sample position.
+     byte* srcRow = src - (SubpelTaps / 2 - 1);
+     byte* dstRow = dst;
+
+     for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
+     {
+         int posQ4 = x0Q4;
+
+         for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
+         {
+             // Whole-pixel part picks the first tap, fractional part picks the kernel.
+             byte* taps = &srcRow[posQ4 >> SubpelBits];
+             ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
+
+             int acc = 0;
+             for (int tap = 0; tap < SubpelTaps; ++tap)
+             {
+                 acc += taps[tap] * kernel[tap];
+             }
+
+             byte filtered = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
+             dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(dstRow[col] + filtered, 1);
+         }
+     }
+ }
+
+ /// <summary>
+ /// AVX2 vertical 8-tap filter for the unscaled case: a single kernel, selected by the
+ /// fractional bits of <paramref name="y0Q4"/>, is applied to the whole block.
+ /// </summary>
+ /// <remarks>
+ /// Each inner iteration gathers a 4-byte-wide, 8-row window, transposes it into four
+ /// 8-sample columns and stores four output pixels.
+ /// </remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConvolveVertAvx2(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] yFilters,
+     int y0Q4,
+     int w,
+     int h)
+ {
+     Vector128<int> vzero = Vector128<int>.Zero;
+     Vector128<int> vround = Vector128.Create(64);
+
+     // Byte offsets of the eight consecutive rows covered by one filter window.
+     Vector256<int> rowOffsets = Vector256.Create(
+         0,
+         srcStride,
+         srcStride * 2,
+         srcStride * 3,
+         srcStride * 4,
+         srcStride * 5,
+         srcStride * 6,
+         srcStride * 7);
+
+     ulong width = (uint)w;
+     ulong height = (uint)h;
+
+     // Step up so that the taps straddle the sample position.
+     src -= srcStride * (SubpelTaps / 2 - 1);
+
+     fixed (Array8<short>* filterTable = yFilters)
+     {
+         // Each table entry holds eight 16-bit coefficients.
+         Vector128<short> vcoeffs = Sse2.LoadVector128((short*)filterTable + (uint)(y0Q4 & SubpelMask) * 8);
+
+         // Whole-pixel part of the start position.
+         ulong firstRow = (uint)y0Q4 >> SubpelBits;
+
+         for (ulong row = 0; row < height; ++row)
+         {
+             ulong rowStart = (firstRow + row) * (uint)srcStride;
+
+             for (ulong col = 0; col < width; col += 4)
+             {
+                 // Eight dwords: four horizontally adjacent bytes from each of eight rows.
+                 Vector256<int> window = Avx2.GatherVector256((uint*)&src[rowStart + col], rowOffsets, 1).AsInt32();
+
+                 Vector128<byte> rows0To3 = window.GetLower().AsByte();
+                 Vector128<byte> rows4To7 = window.GetUpper().AsByte();
+
+                 // Three rounds of byte interleaving transpose the 8x4 window.
+                 Vector128<byte> mixA = Sse2.UnpackLow(rows0To3, rows4To7);
+                 Vector128<byte> mixB = Sse2.UnpackHigh(rows0To3, rows4To7);
+
+                 Vector128<byte> mixC = Sse2.UnpackLow(mixA, mixB);
+                 Vector128<byte> mixD = Sse2.UnpackHigh(mixA, mixB);
+
+                 // Low half = column 0 / 2, high half = column 1 / 3 (eight samples each).
+                 Vector128<byte> cols01 = Sse2.UnpackLow(mixC, mixD);
+                 Vector128<byte> cols23 = Sse2.UnpackHigh(mixC, mixD);
+
+                 // Bring the odd columns down to the low half so they can be widened.
+                 Vector128<byte> cols11 = Sse.MoveHighToLow(cols01.AsSingle(), cols01.AsSingle()).AsByte();
+                 Vector128<byte> cols33 = Sse.MoveHighToLow(cols23.AsSingle(), cols23.AsSingle()).AsByte();
+
+                 Vector128<short> taps0 = Sse41.ConvertToVector128Int16(cols01);
+                 Vector128<short> taps1 = Sse41.ConvertToVector128Int16(cols11);
+                 Vector128<short> taps2 = Sse41.ConvertToVector128Int16(cols23);
+                 Vector128<short> taps3 = Sse41.ConvertToVector128Int16(cols33);
+
+                 Vector128<int> sums = MultiplyAddAdjacent(taps0, taps1, taps2, taps3, vcoeffs, vzero);
+                 Vector128<byte> pixels = PackUnsignedSaturate(RoundShift(sums, vround), vzero);
+
+                 // A scalar float store writes exactly the four packed pixels.
+                 Sse.StoreScalar((float*)&dst[col], pixels.AsSingle());
+             }
+
+             dst += dstStride;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Vertical 8-tap sub-pixel filter. Writes a <paramref name="w"/> x <paramref name="h"/> block to <paramref name="dst"/>.
+ /// </summary>
+ /// <param name="src">Pointer to the source sample aligned with the first output pixel.</param>
+ /// <param name="srcStride">Distance in bytes between source rows.</param>
+ /// <param name="dst">Pointer to the first output pixel.</param>
+ /// <param name="dstStride">Distance in bytes between output rows.</param>
+ /// <param name="yFilters">Kernel table indexed by the fractional position (position &amp; SubpelMask).</param>
+ /// <param name="y0Q4">Start position; the low SubpelBits bits are the fraction.</param>
+ /// <param name="yStepQ4">Position increment per output row, in the same units as <paramref name="y0Q4"/>.</param>
+ /// <param name="w">Output width in pixels.</param>
+ /// <param name="h">Output height in pixels.</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void ConvolveVert(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] yFilters,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+ {
+     // The SIMD path handles only a step of exactly one pixel and stores four
+     // pixels at a time. Any width that is not a multiple of four uses the scalar
+     // loop, so no bytes past column w - 1 are ever written.
+     if (Avx2.IsSupported && UseIntrinsics && yStepQ4 == 1 << SubpelBits && (w & 3) == 0)
+     {
+         ConvolveVertAvx2(src, srcStride, dst, dstStride, yFilters, y0Q4, w, h);
+         return;
+     }
+
+     // Step up so that the taps straddle the sample position.
+     src -= srcStride * (SubpelTaps / 2 - 1);
+
+     for (int x = 0; x < w; ++x)
+     {
+         int yQ4 = y0Q4;
+
+         for (int y = 0; y < h; ++y)
+         {
+             // Whole-pixel part picks the first tap row, fractional part picks the kernel.
+             byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
+             ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
+
+             int sum = 0;
+             for (int k = 0; k < SubpelTaps; ++k)
+             {
+                 sum += srcY[k * srcStride] * yFilter[k];
+             }
+
+             dst[y * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
+             yQ4 += yStepQ4;
+         }
+
+         ++src;
+         ++dst;
+     }
+ }
+
+ /// <summary>
+ /// Vertical 8-tap filter whose clipped result is combined with the value already stored in
+ /// <paramref name="dst"/> via BitUtils.RoundPowerOfTwo(sum of both, 1).
+ /// </summary>
+ private static unsafe void ConvolveAvgVert(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] yFilters,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+ {
+     // Step up so that the taps straddle the sample position.
+     byte* srcCol = src - srcStride * (SubpelTaps / 2 - 1);
+     byte* dstCol = dst;
+
+     // Columns are processed one at a time, top to bottom.
+     for (int col = 0; col < w; ++col, ++srcCol, ++dstCol)
+     {
+         int posQ4 = y0Q4;
+
+         for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
+         {
+             byte* taps = &srcCol[(posQ4 >> SubpelBits) * srcStride];
+             ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
+
+             int acc = 0;
+             for (int tap = 0; tap < SubpelTaps; ++tap)
+             {
+                 acc += taps[tap * srcStride] * kernel[tap];
+             }
+
+             byte filtered = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
+             dstCol[row * dstStride] = (byte)BitUtils.RoundPowerOfTwo(dstCol[row * dstStride] + filtered, 1);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Horizontal-only 8-tap filter. <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are not used.
+ /// </summary>
+ public static unsafe void Convolve8Horiz(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => ConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
+
+ /// <summary>
+ /// Horizontal-only 8-tap filter averaged into <paramref name="dst"/>.
+ /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are not used.
+ /// </summary>
+ public static unsafe void Convolve8AvgHoriz(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => ConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
+
+ /// <summary>
+ /// Vertical-only 8-tap filter. <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are not used.
+ /// </summary>
+ public static unsafe void Convolve8Vert(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => ConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
+
+ /// <summary>
+ /// Vertical-only 8-tap filter averaged into <paramref name="dst"/>.
+ /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are not used.
+ /// </summary>
+ public static unsafe void Convolve8AvgVert(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => ConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
+
+ /// <summary>
+ /// Field-less struct with an explicit size of 64 * 135 bytes. Convolve8 declares a local of
+ /// this type and uses its address as a scratch buffer.
+ /// </summary>
+ [StructLayout(LayoutKind.Sequential, Size = 64 * 135)]
+ private struct Temp
+ {
+ }
+
+ /// <summary>
+ /// Two-dimensional 8-tap filter: a horizontal pass into a stack scratch buffer followed by a
+ /// vertical pass from that buffer into <paramref name="dst"/>.
+ /// </summary>
+ public static unsafe void Convolve8(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+ {
+     // The scratch buffer has a fixed size (64 bytes per row, 135 rows), which
+     // limits the accepted parameters. How the 135 rows are derived:
+     //   - The smallest scaling factor is x1/2, i.e. yStepQ4 = 32 (normative).
+     //   - The largest block is 64x64 pixels.
+     //   - 64 rows of the downscaled frame span (64 - 1) * 32 sixteenths of a pixel
+     //     in the original frame.
+     //   - The span is rounded up because the block may start at a sub-pixel position.
+     //   - SubpelTaps extra rows are needed for the 8-tap filter tails.
+     //   - (((64 - 1) * 32 + 15) >> 4) + 8 = 135.
+     // The frame scaling function may use x1/4 (yStepQ4 = 64), but then w and h are
+     // at most 16, so the buffer is still large enough.
+     Debug.Assert(w <= 64);
+     Debug.Assert(h <= 64);
+     Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
+     Debug.Assert(xStepQ4 <= 64);
+
+     const int ScratchStride = 64;
+     const int LeadRows = SubpelTaps / 2 - 1;
+
+     Temp scratch;
+     byte* temp = (byte*)Unsafe.AsPointer(ref scratch); // Avoid zero initialization.
+
+     // Rows needed by the vertical pass, including the filter tails.
+     int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;
+
+     // Pass 1 starts LeadRows above the block; pass 2 starts LeadRows into the scratch buffer.
+     ConvolveHoriz(src - srcStride * LeadRows, srcStride, temp, ScratchStride, filter, x0Q4, xStepQ4, w, intermediateHeight);
+     ConvolveVert(temp + ScratchStride * LeadRows, ScratchStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
+ }
+
+ /// <summary>
+ /// Two-dimensional 8-tap filter whose result is averaged into <paramref name="dst"/>:
+ /// Convolve8 writes to a 64x64 stack buffer, then ConvolveAvg merges it with the destination.
+ /// </summary>
+ public static unsafe void Convolve8Avg(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+ {
+     // The intermediate buffer has a fixed size, so the block may not exceed 64x64.
+     Debug.Assert(w <= 64);
+     Debug.Assert(h <= 64);
+
+     const int ScratchStride = 64;
+     byte* filtered = stackalloc byte[64 * 64];
+
+     Convolve8(src, srcStride, filtered, ScratchStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+     // The averaging step takes no filter or position arguments.
+     ConvolveAvg(filtered, ScratchStride, dst, dstStride, null, 0, 0, 0, 0, w, h);
+ }
+
+ /// <summary>
+ /// Copies a <paramref name="w"/> x <paramref name="h"/> block from <paramref name="src"/> to
+ /// <paramref name="dst"/> row by row. The filter and position arguments are not used.
+ /// </summary>
+ public static unsafe void ConvolveCopy(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+ {
+     for (int row = 0; row < h; ++row)
+     {
+         MemoryUtil.Copy(dst, src, w);
+
+         src += srcStride;
+         dst += dstStride;
+     }
+ }
+
+ /// <summary>
+ /// Replaces every destination pixel of a <paramref name="w"/> x <paramref name="h"/> block with
+ /// BitUtils.RoundPowerOfTwo(dst + src, 1). The filter and position arguments are not used.
+ /// </summary>
+ public static unsafe void ConvolveAvg(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+ {
+     byte* srcRow = src;
+     byte* dstRow = dst;
+
+     for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
+     {
+         for (int col = 0; col < w; ++col)
+         {
+             int pairSum = dstRow[col] + srcRow[col];
+             dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(pairSum, 1);
+         }
+     }
+ }
+
+ /// <summary>Scaled horizontal filter; forwards every argument unchanged to Convolve8Horiz.</summary>
+ public static unsafe void ScaledHoriz(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => Convolve8Horiz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+ /// <summary>Scaled vertical filter; forwards every argument unchanged to Convolve8Vert.</summary>
+ public static unsafe void ScaledVert(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => Convolve8Vert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+ /// <summary>Scaled two-dimensional filter; forwards every argument unchanged to Convolve8.</summary>
+ public static unsafe void Scaled2D(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => Convolve8(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+ /// <summary>Scaled, averaging horizontal filter; forwards every argument unchanged to Convolve8AvgHoriz.</summary>
+ public static unsafe void ScaledAvgHoriz(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => Convolve8AvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+ /// <summary>Scaled, averaging vertical filter; forwards every argument unchanged to Convolve8AvgVert.</summary>
+ public static unsafe void ScaledAvgVert(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => Convolve8AvgVert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+ /// <summary>Scaled, averaging two-dimensional filter; forwards every argument unchanged to Convolve8Avg.</summary>
+ public static unsafe void ScaledAvg2D(
+     byte* src,
+     int srcStride,
+     byte* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h)
+     => Convolve8Avg(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+ /// <summary>
+ /// Horizontal 8-tap sub-pixel filter for 16-bit samples. Results are clipped with
+ /// BitUtils.ClipPixelHighbd using bit depth <paramref name="bd"/>.
+ /// </summary>
+ private static unsafe void HighbdConvolveHoriz(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] xFilters,
+     int x0Q4,
+     int xStepQ4,
+     int w,
+     int h,
+     int bd)
+ {
+     // Step back so that the taps straddle the sample position.
+     ushort* srcRow = src - (SubpelTaps / 2 - 1);
+     ushort* dstRow = dst;
+
+     for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
+     {
+         int posQ4 = x0Q4;
+
+         for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
+         {
+             // Whole-pixel part picks the first tap, fractional part picks the kernel.
+             ushort* taps = &srcRow[posQ4 >> SubpelBits];
+             ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
+
+             int acc = 0;
+             for (int tap = 0; tap < SubpelTaps; ++tap)
+             {
+                 acc += taps[tap] * kernel[tap];
+             }
+
+             dstRow[col] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Horizontal 8-tap filter for 16-bit samples whose clipped result is combined with the value
+ /// already stored in <paramref name="dst"/> via BitUtils.RoundPowerOfTwo(sum of both, 1).
+ /// </summary>
+ private static unsafe void HighbdConvolveAvgHoriz(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] xFilters,
+     int x0Q4,
+     int xStepQ4,
+     int w,
+     int h,
+     int bd)
+ {
+     // Step back so that the taps straddle the sample position.
+     ushort* srcRow = src - (SubpelTaps / 2 - 1);
+     ushort* dstRow = dst;
+
+     for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
+     {
+         int posQ4 = x0Q4;
+
+         for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
+         {
+             ushort* taps = &srcRow[posQ4 >> SubpelBits];
+             ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
+
+             int acc = 0;
+             for (int tap = 0; tap < SubpelTaps; ++tap)
+             {
+                 acc += taps[tap] * kernel[tap];
+             }
+
+             ushort filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
+             dstRow[col] = (ushort)BitUtils.RoundPowerOfTwo(dstRow[col] + filtered, 1);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Vertical 8-tap sub-pixel filter for 16-bit samples. Results are clipped with
+ /// BitUtils.ClipPixelHighbd using bit depth <paramref name="bd"/>.
+ /// </summary>
+ private static unsafe void HighbdConvolveVert(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] yFilters,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+ {
+     // Step up so that the taps straddle the sample position.
+     ushort* srcCol = src - srcStride * (SubpelTaps / 2 - 1);
+     ushort* dstCol = dst;
+
+     // Columns are processed one at a time, top to bottom.
+     for (int col = 0; col < w; ++col, ++srcCol, ++dstCol)
+     {
+         int posQ4 = y0Q4;
+
+         for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
+         {
+             ushort* taps = &srcCol[(posQ4 >> SubpelBits) * srcStride];
+             ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
+
+             int acc = 0;
+             for (int tap = 0; tap < SubpelTaps; ++tap)
+             {
+                 acc += taps[tap * srcStride] * kernel[tap];
+             }
+
+             dstCol[row * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Vertical 8-tap filter for 16-bit samples whose clipped result is combined with the value
+ /// already stored in <paramref name="dst"/> via BitUtils.RoundPowerOfTwo(sum of both, 1).
+ /// </summary>
+ private static unsafe void HighConvolveAvgVert(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] yFilters,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+ {
+     // Step up so that the taps straddle the sample position.
+     ushort* srcCol = src - srcStride * (SubpelTaps / 2 - 1);
+     ushort* dstCol = dst;
+
+     // Columns are processed one at a time, top to bottom.
+     for (int col = 0; col < w; ++col, ++srcCol, ++dstCol)
+     {
+         int posQ4 = y0Q4;
+
+         for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
+         {
+             ushort* taps = &srcCol[(posQ4 >> SubpelBits) * srcStride];
+             ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
+
+             int acc = 0;
+             for (int tap = 0; tap < SubpelTaps; ++tap)
+             {
+                 acc += taps[tap * srcStride] * kernel[tap];
+             }
+
+             ushort filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
+             dstCol[row * dstStride] = (ushort)BitUtils.RoundPowerOfTwo(dstCol[row * dstStride] + filtered, 1);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Two-dimensional 8-tap filter for 16-bit samples: a horizontal pass into a stack scratch
+ /// buffer followed by a vertical pass from that buffer into <paramref name="dst"/>.
+ /// </summary>
+ private static unsafe void HighbdConvolve(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+ {
+     // The scratch buffer has a fixed size (64 samples per row, 135 rows), which
+     // limits the accepted parameters. How the 135 rows are derived:
+     //   - The smallest scaling factor is x1/2, i.e. yStepQ4 = 32 (normative).
+     //   - The largest block is 64x64 pixels.
+     //   - 64 rows of the downscaled frame span (64 - 1) * 32 sixteenths of a pixel
+     //     in the original frame.
+     //   - The span is rounded up because the block may start at a sub-pixel position.
+     //   - SubpelTaps extra rows are needed for the 8-tap filter tails.
+     //   - (((64 - 1) * 32 + 15) >> 4) + 8 = 135.
+     Debug.Assert(w <= 64);
+     Debug.Assert(h <= 64);
+     Debug.Assert(yStepQ4 <= 32);
+     Debug.Assert(xStepQ4 <= 32);
+
+     const int ScratchStride = 64;
+     const int LeadRows = SubpelTaps / 2 - 1;
+
+     ushort* temp = stackalloc ushort[64 * 135];
+
+     // Rows needed by the vertical pass, including the filter tails.
+     int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;
+
+     // Pass 1 starts LeadRows above the block; pass 2 starts LeadRows into the scratch buffer.
+     HighbdConvolveHoriz(src - srcStride * LeadRows, srcStride, temp, ScratchStride, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
+     HighbdConvolveVert(temp + ScratchStride * LeadRows, ScratchStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
+ }
+
+ /// <summary>
+ /// Horizontal-only 8-tap filter for 16-bit samples.
+ /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are not used.
+ /// </summary>
+ public static unsafe void HighbdConvolve8Horiz(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+     => HighbdConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
+
+ /// <summary>
+ /// Horizontal-only 8-tap filter for 16-bit samples, averaged into <paramref name="dst"/>.
+ /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are not used.
+ /// </summary>
+ public static unsafe void HighbdConvolve8AvgHoriz(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+     => HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
+
+ /// <summary>
+ /// Vertical-only 8-tap filter for 16-bit samples.
+ /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are not used.
+ /// </summary>
+ public static unsafe void HighbdConvolve8Vert(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+     => HighbdConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
+
+ /// <summary>
+ /// Vertical-only 8-tap filter for 16-bit samples, averaged into <paramref name="dst"/>.
+ /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are not used.
+ /// </summary>
+ public static unsafe void HighbdConvolve8AvgVert(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+     => HighConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
+
+ /// <summary>Two-dimensional 8-tap filter for 16-bit samples; forwards every argument unchanged to HighbdConvolve.</summary>
+ public static unsafe void HighbdConvolve8(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+     => HighbdConvolve(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
+
+ /// <summary>
+ /// Two-dimensional 8-tap filter for 16-bit samples whose result is averaged into
+ /// <paramref name="dst"/>: HighbdConvolve8 writes to a 64x64 stack buffer, then
+ /// HighbdConvolveAvg merges it with the destination.
+ /// </summary>
+ public static unsafe void HighbdConvolve8Avg(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+ {
+     // The intermediate buffer has a fixed size, so the block may not exceed 64x64.
+     Debug.Assert(w <= 64);
+     Debug.Assert(h <= 64);
+
+     const int ScratchStride = 64;
+     ushort* filtered = stackalloc ushort[64 * 64];
+
+     HighbdConvolve8(src, srcStride, filtered, ScratchStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
+
+     // The averaging step takes no filter or position arguments.
+     HighbdConvolveAvg(filtered, ScratchStride, dst, dstStride, null, 0, 0, 0, 0, w, h, bd);
+ }
+
+ /// <summary>
+ /// Copies a <paramref name="w"/> x <paramref name="h"/> block of 16-bit samples from
+ /// <paramref name="src"/> to <paramref name="dst"/> row by row. The filter, position and
+ /// bit-depth arguments are not used.
+ /// </summary>
+ public static unsafe void HighbdConvolveCopy(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+ {
+     for (int row = 0; row < h; ++row)
+     {
+         MemoryUtil.Copy(dst, src, w);
+
+         src += srcStride;
+         dst += dstStride;
+     }
+ }
+
+ /// <summary>
+ /// Replaces every 16-bit destination sample of a <paramref name="w"/> x <paramref name="h"/>
+ /// block with BitUtils.RoundPowerOfTwo(dst + src, 1). The filter, position and bit-depth
+ /// arguments are not used.
+ /// </summary>
+ public static unsafe void HighbdConvolveAvg(
+     ushort* src,
+     int srcStride,
+     ushort* dst,
+     int dstStride,
+     Array8<short>[] filter,
+     int x0Q4,
+     int xStepQ4,
+     int y0Q4,
+     int yStepQ4,
+     int w,
+     int h,
+     int bd)
+ {
+     ushort* srcRow = src;
+     ushort* dstRow = dst;
+
+     for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
+     {
+         for (int col = 0; col < w; ++col)
+         {
+             int pairSum = dstRow[col] + srcRow[col];
+             dstRow[col] = (ushort)BitUtils.RoundPowerOfTwo(pairSum, 1);
+         }
+     }
+ }
+ }
+}
diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs
new file mode 100644
index 00000000..16962897
--- /dev/null
+++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs
@@ -0,0 +1,12 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+ /// <summary>
+ /// Constants shared by the sub-pixel interpolation filters.
+ /// </summary>
+ internal static class Filter
+ {
+     /// <summary>Right shift applied to a filter sum after rounding.</summary>
+     public const int FilterBits = 7;
+
+     /// <summary>Number of fractional bits in a sub-pixel position.</summary>
+     public const int SubpelBits = 4;
+
+     /// <summary>Mask that extracts the fractional bits of a sub-pixel position.</summary>
+     public const int SubpelMask = (1 << SubpelBits) - 1;
+
+     /// <summary>Number of distinct fractional positions.</summary>
+     public const int SubpelShifts = 1 << SubpelBits;
+
+     /// <summary>Number of taps in an interpolation kernel.</summary>
+     public const int SubpelTaps = 8;
+ }
+}
diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs
new file mode 100644
index 00000000..62b3a9b1
--- /dev/null
+++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs
@@ -0,0 +1,1379 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+ internal static class IntraPred
+ {
+ /// <summary>Returns a reference to the byte at column <paramref name="x"/>, row <paramref name="y"/> of a strided buffer.</summary>
+ private static unsafe ref byte Dst(byte* dst, int stride, int x, int y)
+ {
+     int offset = x + y * stride;
+
+     return ref dst[offset];
+ }
+
+ /// <summary>Returns a reference to the 16-bit sample at column <paramref name="x"/>, row <paramref name="y"/> of a strided buffer.</summary>
+ private static unsafe ref ushort Dst(ushort* dst, int stride, int x, int y)
+ {
+     int offset = x + y * stride;
+
+     return ref dst[offset];
+ }
+
+ /// <summary>Rounded 1-2-1 weighted average of three bytes: (a + 2b + c + 2) >> 2.</summary>
+ private static byte Avg3(byte a, byte b, byte c)
+ {
+     int weighted = a + (b << 1) + c;
+
+     return (byte)((weighted + 2) >> 2);
+ }
+
+ /// <summary>Rounded 1-2-1 weighted average of three 16-bit samples: (a + 2b + c + 2) >> 2.</summary>
+ private static ushort Avg3(ushort a, ushort b, ushort c)
+ {
+     int weighted = a + (b << 1) + c;
+
+     return (ushort)((weighted + 2) >> 2);
+ }
+
+ /// <summary>Average of two bytes, rounded up: (a + b + 1) >> 1.</summary>
+ private static byte Avg2(byte a, byte b)
+ {
+     int total = a + b;
+
+     return (byte)((total + 1) >> 1);
+ }
+
+ /// <summary>Average of two 16-bit samples, rounded up: (a + b + 1) >> 1.</summary>
+ private static ushort Avg2(ushort a, ushort b)
+ {
+     int total = a + b;
+
+     return (ushort)((total + 1) >> 1);
+ }
+
+ /// <summary>Runs D207Predictor with a block size of 8.</summary>
+ public static unsafe void D207Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => D207Predictor(dst, stride, 8, above, left);
+
+ /// <summary>Runs D207Predictor with a block size of 16.</summary>
+ public static unsafe void D207Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => D207Predictor(dst, stride, 16, above, left);
+
+ /// <summary>Runs D207Predictor with a block size of 32.</summary>
+ public static unsafe void D207Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => D207Predictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block using only the <paramref name="left"/>
+ /// samples: the first two columns are averages of neighbouring left samples and every other
+ /// pixel copies the pixel one row down and two columns to the left.
+ /// <paramref name="above"/> is not read.
+ /// </summary>
+ private static unsafe void D207Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     int last = bs - 1;
+
+     // Column 0: 2-tap averages down the left edge; the bottom pixel copies the last left sample.
+     byte* col0 = dst;
+     for (int row = 0; row < last; ++row)
+     {
+         col0[row * stride] = Avg2(left[row], left[row + 1]);
+     }
+     col0[last * stride] = left[last];
+
+     // Column 1: 3-tap averages, with the last left sample repeated near the bottom.
+     byte* col1 = dst + 1;
+     for (int row = 0; row < last - 1; ++row)
+     {
+         col1[row * stride] = Avg3(left[row], left[row + 1], left[row + 2]);
+     }
+     col1[(last - 1) * stride] = Avg3(left[last - 1], left[last], left[last]);
+     col1[last * stride] = left[last];
+
+     // Columns 2 and up: the bottom row is flat ...
+     byte* rest = dst + 2;
+     int restWidth = bs - 2;
+     for (int col = 0; col < restWidth; ++col)
+     {
+         rest[last * stride + col] = left[last];
+     }
+
+     // ... and the rows above are built bottom-up from the row below, shifted by two columns.
+     for (int row = last - 1; row >= 0; --row)
+     {
+         for (int col = 0; col < restWidth; ++col)
+         {
+             rest[row * stride + col] = rest[(row + 1) * stride + col - 2];
+         }
+     }
+ }
+
+ /// <summary>Runs D63Predictor with a block size of 8.</summary>
+ public static unsafe void D63Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => D63Predictor(dst, stride, 8, above, left);
+
+ /// <summary>Runs D63Predictor with a block size of 16.</summary>
+ public static unsafe void D63Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => D63Predictor(dst, stride, 16, above, left);
+
+ /// <summary>Runs D63Predictor with a block size of 32.</summary>
+ public static unsafe void D63Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => D63Predictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block using only the <paramref name="above"/>
+ /// samples. Rows 0 and 1 are 2-tap and 3-tap averages; each later pair of rows is the first
+ /// pair shifted left by one more pixel and padded on the right with above[bs - 1].
+ /// Reads above[0 .. bs + 1]; <paramref name="left"/> is not read.
+ /// </summary>
+ private static unsafe void D63Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     byte* row0 = dst;
+     byte* row1 = dst + stride;
+
+     for (int col = 0; col < bs; ++col)
+     {
+         row0[col] = Avg2(above[col], above[col + 1]);
+         row1[col] = Avg3(above[col], above[col + 1], above[col + 2]);
+     }
+
+     // 'kept' is how many pixels are still taken from rows 0/1; the remainder is padding.
+     int kept = bs - 2;
+
+     for (int row = 2; row < bs; row += 2)
+     {
+         int shift = row >> 1;
+         byte* evenRow = dst + (row + 0) * stride;
+         byte* oddRow = dst + (row + 1) * stride;
+
+         MemoryUtil.Copy(evenRow, dst + shift, kept);
+         MemoryUtil.Fill(evenRow + kept, above[bs - 1], bs - kept);
+         MemoryUtil.Copy(oddRow, dst + stride + shift, kept);
+         MemoryUtil.Fill(oddRow + kept, above[bs - 1], bs - kept);
+
+         --kept;
+     }
+ }
+
+ /// <summary>Runs D45Predictor with a block size of 8.</summary>
+ public static unsafe void D45Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => D45Predictor(dst, stride, 8, above, left);
+
+ /// <summary>Runs D45Predictor with a block size of 16.</summary>
+ public static unsafe void D45Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => D45Predictor(dst, stride, 16, above, left);
+
+ /// <summary>Runs D45Predictor with a block size of 32.</summary>
+ public static unsafe void D45Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => D45Predictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block using only the <paramref name="above"/>
+ /// samples. Row 0 holds 3-tap averages ending in above[bs - 1]; each later row is row 0
+ /// shifted left by one more pixel and padded on the right with above[bs - 1].
+ /// <paramref name="left"/> is not read.
+ /// </summary>
+ private static unsafe void D45Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     byte padding = above[bs - 1];
+     byte* firstRow = dst;
+
+     for (int col = 0; col < bs - 1; ++col)
+     {
+         firstRow[col] = Avg3(above[col], above[col + 1], above[col + 2]);
+     }
+     firstRow[bs - 1] = padding;
+
+     byte* rowPtr = dst + stride;
+
+     for (int row = 1; row < bs; ++row, rowPtr += stride)
+     {
+         // Pixels still taken from row 0; the padding covers the remaining row + 1 pixels.
+         int kept = bs - 1 - row;
+
+         MemoryUtil.Copy(rowPtr, firstRow + row, kept);
+         MemoryUtil.Fill(rowPtr + kept, padding, row + 1);
+     }
+ }
+
+ /// <summary>Runs D117Predictor with a block size of 8.</summary>
+ public static unsafe void D117Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => D117Predictor(dst, stride, 8, above, left);
+
+ /// <summary>Runs D117Predictor with a block size of 16.</summary>
+ public static unsafe void D117Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => D117Predictor(dst, stride, 16, above, left);
+
+ /// <summary>Runs D117Predictor with a block size of 32.</summary>
+ public static unsafe void D117Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => D117Predictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block from <paramref name="above"/>
+ /// (including above[-1]) and <paramref name="left"/>. Rows 0 and 1 and the first column are
+ /// averages of the edge samples; every other pixel copies the pixel two rows up and one
+ /// column to the left.
+ /// </summary>
+ private static unsafe void D117Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     byte* row0 = dst;
+     byte* row1 = row0 + stride;
+     byte* row2 = row1 + stride;
+
+     // Row 0: 2-tap averages along the top edge.
+     for (int col = 0; col < bs; col++)
+     {
+         row0[col] = Avg2(above[col - 1], above[col]);
+     }
+
+     // Row 1: 3-tap averages; the first pixel also uses the top-most left sample.
+     row1[0] = Avg3(left[0], above[-1], above[0]);
+     for (int col = 1; col < bs; col++)
+     {
+         row1[col] = Avg3(above[col - 2], above[col - 1], above[col]);
+     }
+
+     // Column 0 from row 2 downwards: 3-tap averages along the left edge.
+     row2[0] = Avg3(above[-1], left[0], left[1]);
+     for (int down = 1; down < bs - 2; ++down)
+     {
+         row2[down * stride] = Avg3(left[down - 1], left[down], left[down + 1]);
+     }
+
+     // Everything else repeats the pixel two rows up, one column left.
+     byte* current = row2;
+     for (int row = 2; row < bs; ++row, current += stride)
+     {
+         for (int col = 1; col < bs; col++)
+         {
+             current[col] = current[-2 * stride + col - 1];
+         }
+     }
+ }
+
+ /// <summary>Runs D135Predictor with a block size of 8.</summary>
+ public static unsafe void D135Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => D135Predictor(dst, stride, 8, above, left);
+
+ /// <summary>Runs D135Predictor with a block size of 16.</summary>
+ public static unsafe void D135Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => D135Predictor(dst, stride, 16, above, left);
+
+ /// <summary>Runs D135Predictor with a block size of 32.</summary>
+ public static unsafe void D135Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => D135Predictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block from <paramref name="above"/>
+ /// (including above[-1]) and <paramref name="left"/>. A 2 * bs - 1 entry border of 3-tap
+ /// averages is built from bottom-left to top-right, and each output row is a
+ /// <paramref name="bs"/>-byte slice of it that starts one entry earlier than the row above.
+ /// </summary>
+ private static unsafe void D135Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     // Sized for the largest block (32): 32 + 32 - 1 entries.
+     byte* edge = stackalloc byte[32 + 32 - 1];
+
+     // Left part, stored bottom-up: edge[bs - 3 - j] is centred on left[j + 1].
+     for (int j = 0; j < bs - 2; ++j)
+     {
+         edge[bs - 3 - j] = Avg3(left[j], left[j + 1], left[j + 2]);
+     }
+
+     // The three entries around the top-left corner.
+     edge[bs - 2] = Avg3(above[-1], left[0], left[1]);
+     edge[bs - 1] = Avg3(left[0], above[-1], above[0]);
+     edge[bs] = Avg3(above[-1], above[0], above[1]);
+
+     // Top part, left to right.
+     byte* topPart = edge + bs + 1;
+     for (int k = 0; k < bs - 2; ++k)
+     {
+         topPart[k] = Avg3(above[k], above[k + 1], above[k + 2]);
+     }
+
+     // Row 0 starts at edge[bs - 1]; each following row starts one entry earlier.
+     for (int row = 0; row < bs; ++row)
+     {
+         MemoryUtil.Copy(dst + row * stride, edge + (bs - 1 - row), bs);
+     }
+ }
+
+ /// <summary>Runs D153Predictor with a block size of 8.</summary>
+ public static unsafe void D153Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => D153Predictor(dst, stride, 8, above, left);
+
+ /// <summary>Runs D153Predictor with a block size of 16.</summary>
+ public static unsafe void D153Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => D153Predictor(dst, stride, 16, above, left);
+
+ /// <summary>Runs D153Predictor with a block size of 32.</summary>
+ public static unsafe void D153Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => D153Predictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block from <paramref name="above"/>
+ /// (including above[-1]) and <paramref name="left"/>. Columns 0 and 1 and the rest of row 0 are
+ /// averages of the edge samples; every other pixel copies the pixel one row up and two
+ /// columns to the left.
+ /// </summary>
+ private static unsafe void D153Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     // Column 0: 2-tap averages down the left edge, starting from the corner sample.
+     byte* col0 = dst;
+     col0[0] = Avg2(above[-1], left[0]);
+     for (int row = 1; row < bs; row++)
+     {
+         col0[row * stride] = Avg2(left[row - 1], left[row]);
+     }
+
+     // Column 1: 3-tap averages; the first two rows involve the corner sample.
+     byte* col1 = dst + 1;
+     col1[0] = Avg3(left[0], above[-1], above[0]);
+     col1[stride] = Avg3(above[-1], left[0], left[1]);
+     for (int row = 2; row < bs; row++)
+     {
+         col1[row * stride] = Avg3(left[row - 2], left[row - 1], left[row]);
+     }
+
+     // Rest of row 0: 3-tap averages along the top edge.
+     byte* rest = dst + 2;
+     int restWidth = bs - 2;
+     for (int col = 0; col < restWidth; col++)
+     {
+         rest[col] = Avg3(above[col - 1], above[col], above[col + 1]);
+     }
+
+     // Rest of the block: repeat the pixel one row up, two columns left.
+     byte* current = rest + stride;
+     for (int row = 1; row < bs; ++row, current += stride)
+     {
+         for (int col = 0; col < restWidth; col++)
+         {
+             current[col] = current[-stride + col - 2];
+         }
+     }
+ }
+
+ /// <summary>Calls <c>VPredictor</c> with a block size of 4.</summary>
+ public static unsafe void VPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+     => VPredictor(dst, stride, 4, above, left);
+
+ /// <summary>Calls <c>VPredictor</c> with a block size of 8.</summary>
+ public static unsafe void VPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => VPredictor(dst, stride, 8, above, left);
+
+ /// <summary>Calls <c>VPredictor</c> with a block size of 16.</summary>
+ public static unsafe void VPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => VPredictor(dst, stride, 16, above, left);
+
+ /// <summary>Calls <c>VPredictor</c> with a block size of 32.</summary>
+ public static unsafe void VPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => VPredictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Copies the first <paramref name="bs"/> entries of <paramref name="above"/> into each of
+ /// <paramref name="bs"/> rows of <paramref name="dst"/>. <paramref name="left"/> is not read.
+ /// </summary>
+ private static unsafe void VPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     byte* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Copy(row, above, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>Calls <c>HPredictor</c> with a block size of 4.</summary>
+ public static unsafe void HPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+     => HPredictor(dst, stride, 4, above, left);
+
+ /// <summary>Calls <c>HPredictor</c> with a block size of 8.</summary>
+ public static unsafe void HPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => HPredictor(dst, stride, 8, above, left);
+
+ /// <summary>Calls <c>HPredictor</c> with a block size of 16.</summary>
+ public static unsafe void HPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => HPredictor(dst, stride, 16, above, left);
+
+ /// <summary>Calls <c>HPredictor</c> with a block size of 32.</summary>
+ public static unsafe void HPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => HPredictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills row <c>r</c> of a <paramref name="bs"/> x <paramref name="bs"/> block with
+ /// <c>left[r]</c>. <paramref name="above"/> is not read.
+ /// </summary>
+ private static unsafe void HPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     byte* row = dst;
+
+     for (int index = 0; index < bs; index++, row += stride)
+     {
+         MemoryUtil.Fill(row, left[index], bs);
+     }
+ }
+
+ /// <summary>Calls <c>TMPredictor</c> with a block size of 4.</summary>
+ public static unsafe void TMPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+     => TMPredictor(dst, stride, 4, above, left);
+
+ /// <summary>Calls <c>TMPredictor</c> with a block size of 8.</summary>
+ public static unsafe void TMPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => TMPredictor(dst, stride, 8, above, left);
+
+ /// <summary>Calls <c>TMPredictor</c> with a block size of 16.</summary>
+ public static unsafe void TMPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => TMPredictor(dst, stride, 16, above, left);
+
+ /// <summary>Calls <c>TMPredictor</c> with a block size of 32.</summary>
+ public static unsafe void TMPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => TMPredictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Writes <c>ClipPixel(left[r] + above[c] - above[-1])</c> to every entry (r, c) of a
+ /// <paramref name="bs"/> x <paramref name="bs"/> block.
+ /// </summary>
+ private static unsafe void TMPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     int corner = above[-1];
+
+     for (int row = 0; row < bs; row++)
+     {
+         byte* line = dst + row * stride;
+
+         // The row-dependent part of the sum is the same for every column.
+         int rowOffset = left[row] - corner;
+
+         for (int col = 0; col < bs; col++)
+         {
+             line[col] = BitUtils.ClipPixel(rowOffset + above[col]);
+         }
+     }
+ }
+
+ /// <summary>Calls <c>Dc128Predictor</c> with a block size of 4.</summary>
+ public static unsafe void Dc128Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+     => Dc128Predictor(dst, stride, 4, above, left);
+
+ /// <summary>Calls <c>Dc128Predictor</c> with a block size of 8.</summary>
+ public static unsafe void Dc128Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => Dc128Predictor(dst, stride, 8, above, left);
+
+ /// <summary>Calls <c>Dc128Predictor</c> with a block size of 16.</summary>
+ public static unsafe void Dc128Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => Dc128Predictor(dst, stride, 16, above, left);
+
+ /// <summary>Calls <c>Dc128Predictor</c> with a block size of 32.</summary>
+ public static unsafe void Dc128Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => Dc128Predictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block with the constant 128.
+ /// Neither <paramref name="above"/> nor <paramref name="left"/> is read.
+ /// </summary>
+ private static unsafe void Dc128Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     byte* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Fill(row, (byte)128, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>Calls <c>DcLeftPredictor</c> with a block size of 4.</summary>
+ public static unsafe void DcLeftPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+     => DcLeftPredictor(dst, stride, 4, above, left);
+
+ /// <summary>Calls <c>DcLeftPredictor</c> with a block size of 8.</summary>
+ public static unsafe void DcLeftPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => DcLeftPredictor(dst, stride, 8, above, left);
+
+ /// <summary>Calls <c>DcLeftPredictor</c> with a block size of 16.</summary>
+ public static unsafe void DcLeftPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => DcLeftPredictor(dst, stride, 16, above, left);
+
+ /// <summary>Calls <c>DcLeftPredictor</c> with a block size of 32.</summary>
+ public static unsafe void DcLeftPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => DcLeftPredictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block with the rounded mean of the
+ /// first <paramref name="bs"/> entries of <paramref name="left"/>.
+ /// <paramref name="above"/> is not read.
+ /// </summary>
+ private static unsafe void DcLeftPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     // Start from half the divisor so the integer division rounds to nearest.
+     int total = bs >> 1;
+
+     for (int n = 0; n < bs; n++)
+     {
+         total += left[n];
+     }
+
+     byte dc = (byte)(total / bs);
+     byte* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Fill(row, dc, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>Calls <c>DcTopPredictor</c> with a block size of 4.</summary>
+ public static unsafe void DcTopPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+     => DcTopPredictor(dst, stride, 4, above, left);
+
+ /// <summary>Calls <c>DcTopPredictor</c> with a block size of 8.</summary>
+ public static unsafe void DcTopPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => DcTopPredictor(dst, stride, 8, above, left);
+
+ /// <summary>Calls <c>DcTopPredictor</c> with a block size of 16.</summary>
+ public static unsafe void DcTopPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => DcTopPredictor(dst, stride, 16, above, left);
+
+ /// <summary>Calls <c>DcTopPredictor</c> with a block size of 32.</summary>
+ public static unsafe void DcTopPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => DcTopPredictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block with the rounded mean of the
+ /// first <paramref name="bs"/> entries of <paramref name="above"/>.
+ /// <paramref name="left"/> is not read.
+ /// </summary>
+ private static unsafe void DcTopPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     // Start from half the divisor so the integer division rounds to nearest.
+     int total = bs >> 1;
+
+     for (int n = 0; n < bs; n++)
+     {
+         total += above[n];
+     }
+
+     byte dc = (byte)(total / bs);
+     byte* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Fill(row, dc, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>Calls <c>DcPredictor</c> with a block size of 4.</summary>
+ public static unsafe void DcPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+     => DcPredictor(dst, stride, 4, above, left);
+
+ /// <summary>Calls <c>DcPredictor</c> with a block size of 8.</summary>
+ public static unsafe void DcPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+     => DcPredictor(dst, stride, 8, above, left);
+
+ /// <summary>Calls <c>DcPredictor</c> with a block size of 16.</summary>
+ public static unsafe void DcPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+     => DcPredictor(dst, stride, 16, above, left);
+
+ /// <summary>Calls <c>DcPredictor</c> with a block size of 32.</summary>
+ public static unsafe void DcPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+     => DcPredictor(dst, stride, 32, above, left);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block with the rounded mean of the
+ /// first <paramref name="bs"/> entries of <paramref name="above"/> and of
+ /// <paramref name="left"/> combined.
+ /// </summary>
+ private static unsafe void DcPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+ {
+     int sampleCount = 2 * bs;
+     int total = 0;
+
+     for (int n = 0; n < bs; n++)
+     {
+         total += above[n];
+         total += left[n];
+     }
+
+     // Adding half the divisor makes the integer division round to nearest.
+     byte dc = (byte)((total + (sampleCount >> 1)) / sampleCount);
+     byte* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Fill(row, dc, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>
+ /// Fills each of four 4-entry rows with one Avg3 value computed from <c>above[-1]</c> and
+ /// <c>left[0..3]</c>; the last row repeats <c>left[3]</c> as its third input.
+ /// </summary>
+ public static unsafe void HePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte corner = above[-1];
+     byte left0 = left[0];
+     byte left1 = left[1];
+     byte left2 = left[2];
+     byte left3 = left[3];
+
+     byte* row = dst;
+     MemoryUtil.Fill(row, Avg3(corner, left0, left1), 4);
+     row += stride;
+     MemoryUtil.Fill(row, Avg3(left0, left1, left2), 4);
+     row += stride;
+     MemoryUtil.Fill(row, Avg3(left1, left2, left3), 4);
+     row += stride;
+     MemoryUtil.Fill(row, Avg3(left2, left3, left3), 4);
+ }
+
+ /// <summary>
+ /// Writes four Avg3 values computed from <c>above[-1..4]</c> into the first row, then copies
+ /// that row into the three rows below it. <paramref name="left"/> is not read.
+ /// </summary>
+ public static unsafe void VePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte corner = above[-1];
+     byte top0 = above[0];
+     byte top1 = above[1];
+     byte top2 = above[2];
+     byte top3 = above[3];
+     byte top4 = above[4];
+
+     dst[0] = Avg3(corner, top0, top1);
+     dst[1] = Avg3(top0, top1, top2);
+     dst[2] = Avg3(top1, top2, top3);
+     dst[3] = Avg3(top2, top3, top4);
+
+     for (int row = 1; row < 4; row++)
+     {
+         MemoryUtil.Copy(dst + stride * row, dst, 4);
+     }
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>left[0..3]</c>;
+ /// six entries receive <c>left[3]</c> unchanged. <paramref name="above"/> is not read.
+ /// </summary>
+ public static unsafe void D207Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte left0 = left[0];
+     byte left1 = left[1];
+     byte left2 = left[2];
+     byte left3 = left[3];
+
+     // Two-tap averages.
+     Dst(dst, stride, 0, 0) = Avg2(left0, left1);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 0, 1) = Avg2(left1, left2);
+     Dst(dst, stride, 2, 1) = Dst(dst, stride, 0, 2) = Avg2(left2, left3);
+
+     // Three-tap averages.
+     Dst(dst, stride, 1, 0) = Avg3(left0, left1, left2);
+     Dst(dst, stride, 3, 0) = Dst(dst, stride, 1, 1) = Avg3(left1, left2, left3);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 1, 2) = Avg3(left2, left3, left3);
+
+     // Everything else repeats the last left sample.
+     Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = left3;
+     Dst(dst, stride, 0, 3) = Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = left3;
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>above[0..6]</c>.
+ /// <paramref name="left"/> is not read.
+ /// </summary>
+ public static unsafe void D63Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte top0 = above[0];
+     byte top1 = above[1];
+     byte top2 = above[2];
+     byte top3 = above[3];
+     byte top4 = above[4];
+     byte top5 = above[5];
+     byte top6 = above[6];
+
+     // Two-tap averages.
+     Dst(dst, stride, 0, 0) = Avg2(top0, top1);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 2) = Avg2(top1, top2);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 2) = Avg2(top2, top3);
+     Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 2) = Avg2(top3, top4);
+     Dst(dst, stride, 3, 2) = Avg2(top4, top5); // Differs from vp8
+
+     // Three-tap averages.
+     Dst(dst, stride, 0, 1) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 3) = Avg3(top1, top2, top3);
+     Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 3) = Avg3(top2, top3, top4);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 3) = Avg3(top3, top4, top5);
+     Dst(dst, stride, 3, 3) = Avg3(top4, top5, top6); // Differs from vp8
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>above[0..7]</c>.
+ /// <paramref name="left"/> is not read.
+ /// </summary>
+ public static unsafe void D63ePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte top0 = above[0];
+     byte top1 = above[1];
+     byte top2 = above[2];
+     byte top3 = above[3];
+     byte top4 = above[4];
+     byte top5 = above[5];
+     byte top6 = above[6];
+     byte top7 = above[7];
+
+     Dst(dst, stride, 0, 0) = Avg2(top0, top1);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 2) = Avg2(top1, top2);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 2) = Avg2(top2, top3);
+     Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 2) = Avg2(top3, top4);
+     // This entry uses a three-tap average, unlike the other (n, 0)/(n, 2) entries.
+     Dst(dst, stride, 3, 2) = Avg3(top4, top5, top6);
+
+     Dst(dst, stride, 0, 1) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 3) = Avg3(top1, top2, top3);
+     Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 3) = Avg3(top2, top3, top4);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 3) = Avg3(top3, top4, top5);
+     Dst(dst, stride, 3, 3) = Avg3(top5, top6, top7);
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> with Avg3 values of consecutive <c>above[0..7]</c>
+ /// samples; entry (3, 3) receives <c>above[7]</c> unfiltered.
+ /// <paramref name="left"/> is not read.
+ /// </summary>
+ public static unsafe void D45Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte top0 = above[0];
+     byte top1 = above[1];
+     byte top2 = above[2];
+     byte top3 = above[3];
+     byte top4 = above[4];
+     byte top5 = above[5];
+     byte top6 = above[6];
+     byte top7 = above[7];
+
+     // Entries whose two coordinates have the same sum share one value.
+     Dst(dst, stride, 0, 0) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(top1, top2, top3);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(top2, top3, top4);
+     Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(top3, top4, top5);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(top4, top5, top6);
+     Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(top5, top6, top7);
+     Dst(dst, stride, 3, 3) = top7; // differs from vp8
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> with Avg3 values of consecutive <c>above[0..7]</c>
+ /// samples; entry (3, 3) uses <c>above[7]</c> twice as its last two inputs.
+ /// <paramref name="left"/> is not read.
+ /// </summary>
+ public static unsafe void D45ePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte top0 = above[0];
+     byte top1 = above[1];
+     byte top2 = above[2];
+     byte top3 = above[3];
+     byte top4 = above[4];
+     byte top5 = above[5];
+     byte top6 = above[6];
+     byte top7 = above[7];
+
+     // Entries whose two coordinates have the same sum share one value.
+     Dst(dst, stride, 0, 0) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(top1, top2, top3);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(top2, top3, top4);
+     Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(top3, top4, top5);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(top4, top5, top6);
+     Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(top5, top6, top7);
+     Dst(dst, stride, 3, 3) = Avg3(top6, top7, top7);
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>left[0..2]</c>,
+ /// <c>above[-1]</c> and <c>above[0..3]</c>.
+ /// </summary>
+ public static unsafe void D117Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte left0 = left[0];
+     byte left1 = left[1];
+     byte left2 = left[2];
+     byte corner = above[-1];
+     byte top0 = above[0];
+     byte top1 = above[1];
+     byte top2 = above[2];
+     byte top3 = above[3];
+
+     // Two-tap averages.
+     Dst(dst, stride, 0, 0) = Dst(dst, stride, 1, 2) = Avg2(corner, top0);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 2, 2) = Avg2(top0, top1);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 3, 2) = Avg2(top1, top2);
+     Dst(dst, stride, 3, 0) = Avg2(top2, top3);
+
+     // Three-tap averages.
+     Dst(dst, stride, 0, 3) = Avg3(left2, left1, left0);
+     Dst(dst, stride, 0, 2) = Avg3(left1, left0, corner);
+     Dst(dst, stride, 0, 1) = Dst(dst, stride, 1, 3) = Avg3(left0, corner, top0);
+     Dst(dst, stride, 1, 1) = Dst(dst, stride, 2, 3) = Avg3(corner, top0, top1);
+     Dst(dst, stride, 2, 1) = Dst(dst, stride, 3, 3) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 3, 1) = Avg3(top1, top2, top3);
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> with Avg3 values taken along the run
+ /// <c>left[3..0]</c>, <c>above[-1]</c>, <c>above[0..3]</c>.
+ /// </summary>
+ public static unsafe void D135Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte left0 = left[0];
+     byte left1 = left[1];
+     byte left2 = left[2];
+     byte left3 = left[3];
+     byte corner = above[-1];
+     byte top0 = above[0];
+     byte top1 = above[1];
+     byte top2 = above[2];
+     byte top3 = above[3];
+
+     // Entries whose two coordinates differ by the same amount share one value.
+     Dst(dst, stride, 0, 3) = Avg3(left1, left2, left3);
+     Dst(dst, stride, 1, 3) = Dst(dst, stride, 0, 2) = Avg3(left0, left1, left2);
+     Dst(dst, stride, 2, 3) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 1) = Avg3(corner, left0, left1);
+     Dst(dst, stride, 3, 3) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(top0, corner, left0);
+     Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 0) = Avg3(top1, top0, corner);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 0) = Avg3(top2, top1, top0);
+     Dst(dst, stride, 3, 0) = Avg3(top3, top2, top1);
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>left[0..3]</c>,
+ /// <c>above[-1]</c> and <c>above[0..2]</c>.
+ /// </summary>
+ public static unsafe void D153Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+ {
+     byte left0 = left[0];
+     byte left1 = left[1];
+     byte left2 = left[2];
+     byte left3 = left[3];
+     byte corner = above[-1];
+     byte top0 = above[0];
+     byte top1 = above[1];
+     byte top2 = above[2];
+
+     // Two-tap averages.
+     Dst(dst, stride, 0, 0) = Dst(dst, stride, 2, 1) = Avg2(left0, corner);
+     Dst(dst, stride, 0, 1) = Dst(dst, stride, 2, 2) = Avg2(left1, left0);
+     Dst(dst, stride, 0, 2) = Dst(dst, stride, 2, 3) = Avg2(left2, left1);
+     Dst(dst, stride, 0, 3) = Avg2(left3, left2);
+
+     // Three-tap averages.
+     Dst(dst, stride, 3, 0) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 2, 0) = Avg3(corner, top0, top1);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 3, 1) = Avg3(left0, corner, top0);
+     Dst(dst, stride, 1, 1) = Dst(dst, stride, 3, 2) = Avg3(left1, left0, corner);
+     Dst(dst, stride, 1, 2) = Dst(dst, stride, 3, 3) = Avg3(left2, left1, left0);
+     Dst(dst, stride, 1, 3) = Avg3(left3, left2, left1);
+ }
+
+ /// <summary>Calls <c>HighbdD207Predictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD207Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD207Predictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD207Predictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD207Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD207Predictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD207Predictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD207Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD207Predictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block from <paramref name="left"/> only:
+ /// column 0 holds Avg2 values, column 1 holds Avg3 values, the last row repeats
+ /// <c>left[bs - 1]</c>, and every other entry copies the entry one row down and two columns
+ /// to the left. <paramref name="above"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ private static unsafe void HighbdD207Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     ushort* col0 = dst;
+     ushort* col1 = dst + 1;
+     ushort* rest = dst + 2;
+     int lastRow = bs - 1;
+
+     // First column.
+     for (int row = 0; row < lastRow; row++)
+     {
+         col0[row * stride] = Avg2(left[row], left[row + 1]);
+     }
+     col0[lastRow * stride] = left[lastRow];
+
+     // Second column.
+     for (int row = 0; row < bs - 2; row++)
+     {
+         col1[row * stride] = Avg3(left[row], left[row + 1], left[row + 2]);
+     }
+     col1[(bs - 2) * stride] = Avg3(left[bs - 2], left[lastRow], left[lastRow]);
+     col1[lastRow * stride] = left[lastRow];
+
+     // Rest of last row.
+     ushort* bottom = rest + lastRow * stride;
+     for (int col = 0; col < bs - 2; col++)
+     {
+         bottom[col] = left[lastRow];
+     }
+
+     // Remaining rows, filled bottom-up because each one reads the row below it.
+     for (int row = bs - 2; row >= 0; row--)
+     {
+         ushort* current = rest + row * stride;
+         ushort* source = current + stride - 2;
+         for (int col = 0; col < bs - 2; col++)
+         {
+             current[col] = source[col];
+         }
+     }
+ }
+
+ /// <summary>Calls <c>HighbdD63Predictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD63Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD63Predictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD63Predictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD63Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD63Predictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD63Predictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD63Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD63Predictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block from <paramref name="above"/> only.
+ /// Row 0 holds Avg2 values and row 1 holds Avg3 values; each later even/odd row pair is the
+ /// matching one of those two rows shifted left by one more entry, with the freed tail set to
+ /// <c>above[bs - 1]</c>. <paramref name="left"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ private static unsafe void HighbdD63Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     ushort* evenSource = dst;
+     ushort* oddSource = dst + stride;
+
+     for (int col = 0; col < bs; col++)
+     {
+         evenSource[col] = Avg2(above[col], above[col + 1]);
+         oddSource[col] = Avg3(above[col], above[col + 1], above[col + 2]);
+     }
+
+     // shift is the pair index: rows 2 * shift and 2 * shift + 1 are shifted by shift entries.
+     for (int shift = 1; 2 * shift < bs; shift++)
+     {
+         int kept = bs - 1 - shift;
+         int padded = bs - kept;
+         ushort* even = dst + (2 * shift) * stride;
+         ushort* odd = even + stride;
+
+         MemoryUtil.Copy(even, evenSource + shift, kept);
+         MemoryUtil.Fill(even + kept, above[bs - 1], padded);
+         MemoryUtil.Copy(odd, oddSource + shift, kept);
+         MemoryUtil.Fill(odd + kept, above[bs - 1], padded);
+     }
+ }
+
+ /// <summary>Calls <c>HighbdD45Predictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD45Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD45Predictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD45Predictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD45Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD45Predictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD45Predictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD45Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD45Predictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block from <paramref name="above"/> only.
+ /// Row 0 holds Avg3 values followed by <c>above[bs - 1]</c>; row <c>r</c> is row 0 shifted left
+ /// by <c>r</c> entries with the tail set to <c>above[bs - 1]</c>.
+ /// <paramref name="left"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ private static unsafe void HighbdD45Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     ushort padding = above[bs - 1];
+     ushort* firstRow = dst;
+
+     for (int col = 0; col < bs - 1; col++)
+     {
+         firstRow[col] = Avg3(above[col], above[col + 1], above[col + 2]);
+     }
+     firstRow[bs - 1] = padding;
+
+     for (int row = 1; row < bs; row++)
+     {
+         ushort* current = firstRow + row * stride;
+
+         // Entries taken from row 0; the remaining row + 1 entries are padding.
+         int kept = bs - 1 - row;
+
+         MemoryUtil.Copy(current, firstRow + row, kept);
+         MemoryUtil.Fill(current + kept, padding, row + 1);
+     }
+ }
+
+ /// <summary>Calls <c>HighbdD117Predictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD117Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD117Predictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD117Predictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD117Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD117Predictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD117Predictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD117Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD117Predictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block: row 0 from Avg2 of
+ /// <paramref name="above"/>, row 1 from Avg3 of <paramref name="above"/>, column 0 of the
+ /// remaining rows from Avg3 of <paramref name="left"/>, and every other entry from the entry
+ /// two rows up and one column to the left. <paramref name="bd"/> is not read.
+ /// </summary>
+ private static unsafe void HighbdD117Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     ushort* row0 = dst;
+     ushort* row1 = row0 + stride;
+     ushort* row2 = row1 + stride;
+
+     // First row.
+     for (int col = 0; col < bs; col++)
+     {
+         row0[col] = Avg2(above[col - 1], above[col]);
+     }
+
+     // Second row.
+     row1[0] = Avg3(left[0], above[-1], above[0]);
+     for (int col = 1; col < bs; col++)
+     {
+         row1[col] = Avg3(above[col - 2], above[col - 1], above[col]);
+     }
+
+     // The rest of the first column.
+     row2[0] = Avg3(above[-1], left[0], left[1]);
+     for (int row = 3; row < bs; row++)
+     {
+         row2[(row - 2) * stride] = Avg3(left[row - 3], left[row - 2], left[row - 1]);
+     }
+
+     // The rest of the block.
+     for (int row = 2; row < bs; row++)
+     {
+         ushort* current = dst + row * stride;
+         ushort* source = current - 2 * stride - 1;
+         for (int col = 1; col < bs; col++)
+         {
+             current[col] = source[col];
+         }
+     }
+ }
+
+ /// <summary>Calls <c>HighbdD135Predictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD135Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD135Predictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD135Predictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD135Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD135Predictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD135Predictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD135Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD135Predictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block by building one Avg3-filtered
+ /// edge buffer from <paramref name="left"/>, <c>above[-1]</c> and <paramref name="above"/>, then
+ /// copying a <paramref name="bs"/>-wide window of it into every row. The window moves one
+ /// entry toward the start of the buffer for each successive row.
+ /// <paramref name="bd"/> is not read.
+ /// </summary>
+ private static unsafe void HighbdD135Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     // Sized for the largest caller (bs = 32): 2 * 32 - 1 entries.
+     ushort* edge = stackalloc ushort[32 + 32 - 1];
+
+     // Entries [0, bs - 2): left column, walked from its last sample toward the first.
+     for (int n = 0; n < bs - 2; n++)
+     {
+         int last = bs - 1 - n;
+         edge[n] = Avg3(left[last - 2], left[last - 1], left[last]);
+     }
+
+     // Three entries that mix left, above[-1] and above.
+     edge[bs - 2] = Avg3(above[-1], left[0], left[1]);
+     edge[bs - 1] = Avg3(left[0], above[-1], above[0]);
+     edge[bs] = Avg3(above[-1], above[0], above[1]);
+
+     // Entries [bs + 1, 2 * bs - 1): above row in ascending order.
+     for (int n = 0; n < bs - 2; n++)
+     {
+         edge[bs + 1 + n] = Avg3(above[n], above[n + 1], above[n + 2]);
+     }
+
+     for (int row = 0; row < bs; row++)
+     {
+         MemoryUtil.Copy(dst + row * stride, edge + (bs - 1 - row), bs);
+     }
+ }
+
+ /// <summary>Calls <c>HighbdD153Predictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD153Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD153Predictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD153Predictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD153Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD153Predictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdD153Predictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdD153Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdD153Predictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block: column 0 from Avg2 of
+ /// neighbouring <paramref name="left"/> samples, column 1 from Avg3 of them, the rest of row 0
+ /// from Avg3 of <paramref name="above"/>, and every other entry from the entry one row up and
+ /// two columns to the left. <paramref name="bd"/> is not read.
+ /// </summary>
+ private static unsafe void HighbdD153Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     ushort* col0 = dst;
+     ushort* col1 = dst + 1;
+     ushort* rest = dst + 2;
+
+     // Column 0.
+     col0[0] = Avg2(above[-1], left[0]);
+     for (int row = 1; row < bs; row++)
+     {
+         col0[row * stride] = Avg2(left[row - 1], left[row]);
+     }
+
+     // Column 1.
+     col1[0] = Avg3(left[0], above[-1], above[0]);
+     col1[stride] = Avg3(above[-1], left[0], left[1]);
+     for (int row = 2; row < bs; row++)
+     {
+         col1[row * stride] = Avg3(left[row - 2], left[row - 1], left[row]);
+     }
+
+     // Row 0, columns 2 and up.
+     for (int col = 0; col < bs - 2; col++)
+     {
+         rest[col] = Avg3(above[col - 1], above[col], above[col + 1]);
+     }
+
+     // Remaining rows reuse the previous row shifted right by two columns.
+     for (int row = 1; row < bs; row++)
+     {
+         ushort* current = rest + row * stride;
+         ushort* source = current - stride - 2;
+         for (int col = 0; col < bs - 2; col++)
+         {
+             current[col] = source[col];
+         }
+     }
+ }
+
+ /// <summary>Calls <c>HighbdVPredictor</c> with a block size of 4, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdVPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdVPredictor(dst, stride, 4, above, left, bd);
+
+ /// <summary>Calls <c>HighbdVPredictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdVPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdVPredictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdVPredictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdVPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdVPredictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdVPredictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdVPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdVPredictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Copies the first <paramref name="bs"/> entries of <paramref name="above"/> into each of
+ /// <paramref name="bs"/> rows of <paramref name="dst"/>.
+ /// <paramref name="left"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ private static unsafe void HighbdVPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     ushort* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Copy(row, above, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>Calls <c>HighbdHPredictor</c> with a block size of 4, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdHPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdHPredictor(dst, stride, 4, above, left, bd);
+
+ /// <summary>Calls <c>HighbdHPredictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdHPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdHPredictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdHPredictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdHPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdHPredictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdHPredictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdHPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdHPredictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills row <c>r</c> of a <paramref name="bs"/> x <paramref name="bs"/> block with
+ /// <c>left[r]</c>. <paramref name="above"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ private static unsafe void HighbdHPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     ushort* row = dst;
+
+     for (int index = 0; index < bs; index++, row += stride)
+     {
+         MemoryUtil.Fill(row, left[index], bs);
+     }
+ }
+
+ /// <summary>Calls <c>HighbdTMPredictor</c> with a block size of 4, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdTMPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdTMPredictor(dst, stride, 4, above, left, bd);
+
+ /// <summary>Calls <c>HighbdTMPredictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdTMPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdTMPredictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdTMPredictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdTMPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdTMPredictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdTMPredictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdTMPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdTMPredictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Writes <c>ClipPixelHighbd(left[r] + above[c] - above[-1], bd)</c> to every entry (r, c) of
+ /// a <paramref name="bs"/> x <paramref name="bs"/> block.
+ /// </summary>
+ private static unsafe void HighbdTMPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     int corner = above[-1];
+
+     for (int row = 0; row < bs; row++)
+     {
+         ushort* line = dst + row * stride;
+
+         // The row-dependent part of the sum is the same for every column.
+         int rowOffset = left[row] - corner;
+
+         for (int col = 0; col < bs; col++)
+         {
+             line[col] = BitUtils.ClipPixelHighbd(rowOffset + above[col], bd);
+         }
+     }
+ }
+
+ /// <summary>Calls <c>HighbdDc128Predictor</c> with a block size of 4, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDc128Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDc128Predictor(dst, stride, 4, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDc128Predictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDc128Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDc128Predictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDc128Predictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDc128Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDc128Predictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDc128Predictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDc128Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDc128Predictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block with <c>128 &lt;&lt; (bd - 8)</c>.
+ /// Neither <paramref name="above"/> nor <paramref name="left"/> is read.
+ /// </summary>
+ private static unsafe void HighbdDc128Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     ushort* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Fill(row, (ushort)(128 << (bd - 8)), bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>Calls <c>HighbdDcLeftPredictor</c> with a block size of 4, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcLeftPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcLeftPredictor(dst, stride, 4, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcLeftPredictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcLeftPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcLeftPredictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcLeftPredictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcLeftPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcLeftPredictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcLeftPredictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcLeftPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcLeftPredictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block with the rounded mean of the
+ /// first <paramref name="bs"/> entries of <paramref name="left"/>.
+ /// <paramref name="above"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ private static unsafe void HighbdDcLeftPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     // Start from half the divisor so the integer division rounds to nearest.
+     int total = bs >> 1;
+
+     for (int n = 0; n < bs; n++)
+     {
+         total += left[n];
+     }
+
+     ushort dc = (ushort)(total / bs);
+     ushort* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Fill(row, dc, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>Calls <c>HighbdDcTopPredictor</c> with a block size of 4, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcTopPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcTopPredictor(dst, stride, 4, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcTopPredictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcTopPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcTopPredictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcTopPredictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcTopPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcTopPredictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcTopPredictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcTopPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcTopPredictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block with the rounded mean of the
+ /// first <paramref name="bs"/> entries of <paramref name="above"/>.
+ /// <paramref name="left"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ private static unsafe void HighbdDcTopPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     // Start from half the divisor so the integer division rounds to nearest.
+     int total = bs >> 1;
+
+     for (int n = 0; n < bs; n++)
+     {
+         total += above[n];
+     }
+
+     ushort dc = (ushort)(total / bs);
+     ushort* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Fill(row, dc, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>Calls <c>HighbdDcPredictor</c> with a block size of 4, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcPredictor(dst, stride, 4, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcPredictor</c> with a block size of 8, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcPredictor(dst, stride, 8, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcPredictor</c> with a block size of 16, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcPredictor(dst, stride, 16, above, left, bd);
+
+ /// <summary>Calls <c>HighbdDcPredictor</c> with a block size of 32, passing <paramref name="bd"/> through.</summary>
+ public static unsafe void HighbdDcPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+     => HighbdDcPredictor(dst, stride, 32, above, left, bd);
+
+ /// <summary>
+ /// Fills a <paramref name="bs"/> x <paramref name="bs"/> block with the rounded mean of the
+ /// first <paramref name="bs"/> entries of <paramref name="above"/> and of
+ /// <paramref name="left"/> combined. <paramref name="bd"/> is not read.
+ /// </summary>
+ private static unsafe void HighbdDcPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+ {
+     int sampleCount = 2 * bs;
+     int total = 0;
+
+     for (int n = 0; n < bs; n++)
+     {
+         total += above[n];
+         total += left[n];
+     }
+
+     // Adding half the divisor makes the integer division round to nearest.
+     ushort dc = (ushort)((total + (sampleCount >> 1)) / sampleCount);
+     ushort* row = dst;
+
+     for (int remaining = bs; remaining > 0; remaining--)
+     {
+         MemoryUtil.Fill(row, dc, bs);
+         row += stride;
+     }
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>left[0..3]</c>;
+ /// six entries receive <c>left[3]</c> unchanged.
+ /// <paramref name="above"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ public static unsafe void HighbdD207Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+ {
+     ushort left0 = left[0];
+     ushort left1 = left[1];
+     ushort left2 = left[2];
+     ushort left3 = left[3];
+
+     // Two-tap averages.
+     Dst(dst, stride, 0, 0) = Avg2(left0, left1);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 0, 1) = Avg2(left1, left2);
+     Dst(dst, stride, 2, 1) = Dst(dst, stride, 0, 2) = Avg2(left2, left3);
+
+     // Three-tap averages.
+     Dst(dst, stride, 1, 0) = Avg3(left0, left1, left2);
+     Dst(dst, stride, 3, 0) = Dst(dst, stride, 1, 1) = Avg3(left1, left2, left3);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 1, 2) = Avg3(left2, left3, left3);
+
+     // Everything else repeats the last left sample.
+     Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = left3;
+     Dst(dst, stride, 0, 3) = Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = left3;
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>above[0..6]</c>.
+ /// <paramref name="left"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ public static unsafe void HighbdD63Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+ {
+     ushort top0 = above[0];
+     ushort top1 = above[1];
+     ushort top2 = above[2];
+     ushort top3 = above[3];
+     ushort top4 = above[4];
+     ushort top5 = above[5];
+     ushort top6 = above[6];
+
+     // Two-tap averages.
+     Dst(dst, stride, 0, 0) = Avg2(top0, top1);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 2) = Avg2(top1, top2);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 2) = Avg2(top2, top3);
+     Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 2) = Avg2(top3, top4);
+     Dst(dst, stride, 3, 2) = Avg2(top4, top5); // Differs from vp8
+
+     // Three-tap averages.
+     Dst(dst, stride, 0, 1) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 3) = Avg3(top1, top2, top3);
+     Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 3) = Avg3(top2, top3, top4);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 3) = Avg3(top3, top4, top5);
+     Dst(dst, stride, 3, 3) = Avg3(top4, top5, top6); // Differs from vp8
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> with Avg3 values of consecutive <c>above[0..7]</c>
+ /// samples; entry (3, 3) receives <c>above[7]</c> unfiltered.
+ /// <paramref name="left"/> and <paramref name="bd"/> are not read.
+ /// </summary>
+ public static unsafe void HighbdD45Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+ {
+     ushort top0 = above[0];
+     ushort top1 = above[1];
+     ushort top2 = above[2];
+     ushort top3 = above[3];
+     ushort top4 = above[4];
+     ushort top5 = above[5];
+     ushort top6 = above[6];
+     ushort top7 = above[7];
+
+     // Entries whose two coordinates have the same sum share one value.
+     Dst(dst, stride, 0, 0) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(top1, top2, top3);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(top2, top3, top4);
+     Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(top3, top4, top5);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(top4, top5, top6);
+     Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(top5, top6, top7);
+     Dst(dst, stride, 3, 3) = top7; // Differs from vp8
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>left[0..2]</c>,
+ /// <c>above[-1]</c> and <c>above[0..3]</c>. <paramref name="bd"/> is not read.
+ /// </summary>
+ public static unsafe void HighbdD117Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+ {
+     ushort left0 = left[0];
+     ushort left1 = left[1];
+     ushort left2 = left[2];
+     ushort corner = above[-1];
+     ushort top0 = above[0];
+     ushort top1 = above[1];
+     ushort top2 = above[2];
+     ushort top3 = above[3];
+
+     // Two-tap averages.
+     Dst(dst, stride, 0, 0) = Dst(dst, stride, 1, 2) = Avg2(corner, top0);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 2, 2) = Avg2(top0, top1);
+     Dst(dst, stride, 2, 0) = Dst(dst, stride, 3, 2) = Avg2(top1, top2);
+     Dst(dst, stride, 3, 0) = Avg2(top2, top3);
+
+     // Three-tap averages.
+     Dst(dst, stride, 0, 3) = Avg3(left2, left1, left0);
+     Dst(dst, stride, 0, 2) = Avg3(left1, left0, corner);
+     Dst(dst, stride, 0, 1) = Dst(dst, stride, 1, 3) = Avg3(left0, corner, top0);
+     Dst(dst, stride, 1, 1) = Dst(dst, stride, 2, 3) = Avg3(corner, top0, top1);
+     Dst(dst, stride, 2, 1) = Dst(dst, stride, 3, 3) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 3, 1) = Avg3(top1, top2, top3);
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> with Avg3 values taken along the run
+ /// <c>left[3..0]</c>, <c>above[-1]</c>, <c>above[0..3]</c>. <paramref name="bd"/> is not read.
+ /// </summary>
+ public static unsafe void HighbdD135Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+ {
+     ushort left0 = left[0];
+     ushort left1 = left[1];
+     ushort left2 = left[2];
+     ushort left3 = left[3];
+     ushort corner = above[-1];
+     ushort top0 = above[0];
+     ushort top1 = above[1];
+     ushort top2 = above[2];
+     ushort top3 = above[3];
+
+     // Entries whose two coordinates differ by the same amount share one value.
+     Dst(dst, stride, 0, 3) = Avg3(left1, left2, left3);
+     Dst(dst, stride, 1, 3) = Dst(dst, stride, 0, 2) = Avg3(left0, left1, left2);
+     Dst(dst, stride, 2, 3) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 1) = Avg3(corner, left0, left1);
+     Dst(dst, stride, 3, 3) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(top0, corner, left0);
+     Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 0) = Avg3(top1, top0, corner);
+     Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 0) = Avg3(top2, top1, top0);
+     Dst(dst, stride, 3, 0) = Avg3(top3, top2, top1);
+ }
+
+ /// <summary>
+ /// Fills a 4x4 block through <c>Dst</c> using Avg2/Avg3 combinations of <c>left[0..3]</c>,
+ /// <c>above[-1]</c> and <c>above[0..2]</c>. <paramref name="bd"/> is not read.
+ /// </summary>
+ public static unsafe void HighbdD153Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+ {
+     ushort left0 = left[0];
+     ushort left1 = left[1];
+     ushort left2 = left[2];
+     ushort left3 = left[3];
+     ushort corner = above[-1];
+     ushort top0 = above[0];
+     ushort top1 = above[1];
+     ushort top2 = above[2];
+
+     // Two-tap averages.
+     Dst(dst, stride, 0, 0) = Dst(dst, stride, 2, 1) = Avg2(left0, corner);
+     Dst(dst, stride, 0, 1) = Dst(dst, stride, 2, 2) = Avg2(left1, left0);
+     Dst(dst, stride, 0, 2) = Dst(dst, stride, 2, 3) = Avg2(left2, left1);
+     Dst(dst, stride, 0, 3) = Avg2(left3, left2);
+
+     // Three-tap averages.
+     Dst(dst, stride, 3, 0) = Avg3(top0, top1, top2);
+     Dst(dst, stride, 2, 0) = Avg3(corner, top0, top1);
+     Dst(dst, stride, 1, 0) = Dst(dst, stride, 3, 1) = Avg3(left0, corner, top0);
+     Dst(dst, stride, 1, 1) = Dst(dst, stride, 3, 2) = Avg3(left1, left0, corner);
+     Dst(dst, stride, 1, 2) = Dst(dst, stride, 3, 3) = Avg3(left2, left1, left0);
+     Dst(dst, stride, 1, 3) = Avg3(left3, left2, left1);
+ }
+ }
+}
diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs
new file mode 100644
index 00000000..b4ad4344
--- /dev/null
+++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs
@@ -0,0 +1,2868 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+ internal static class InvTxfm
+ {
+ // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
+ // transform amplify bits + 1 bit for contingency in rounding and quantizing (12 + 7 + 5 + 1 = 25 bits)
+ private const int HighbdValidTxfmMagnitudeRange = (1 << 25); // Exclusive magnitude bound: DetectInvalidHighbdInput flags any |value| >= this
+
+ /// <summary>
+ /// Checks whether any of the first <paramref name="size"/> coefficients has a magnitude
+ /// greater than or equal to <see cref="HighbdValidTxfmMagnitudeRange"/>.
+ /// </summary>
+ /// <param name="input">Coefficients to inspect</param>
+ /// <param name="size">Number of coefficients to inspect, starting at index 0</param>
+ /// <returns>1 if an out-of-range coefficient was found, 0 otherwise</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static int DetectInvalidHighbdInput(ReadOnlySpan<int> input, int size)
+ {
+     for (int i = 0; i < size; ++i)
+     {
+         int coeff = input[i];
+
+         // Compare against both bounds rather than calling Math.Abs: Math.Abs(int.MinValue)
+         // throws OverflowException, which would turn the most extreme invalid coefficient
+         // into a crash instead of reporting it as invalid.
+         if (coeff >= HighbdValidTxfmMagnitudeRange || coeff <= -HighbdValidTxfmMagnitudeRange)
+         {
+             return 1;
+         }
+     }
+
+     return 0;
+ }
+
+ /// <summary>
+ /// Debug-only check that an intermediate coefficient fits in a signed 16-bit integer.
+ /// </summary>
+ /// <param name="input">Value to check</param>
+ /// <returns><paramref name="input"/>, unchanged</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static long CheckRange(long input)
+ {
+     // Valid VP9 streams keep every intermediate stage coefficient inside the signed
+     // 16-bit range; only invalid/corrupt streams can produce values outside of it.
+     Debug.Assert(input >= short.MinValue && input <= short.MaxValue);
+
+     return input;
+ }
+
+ /// <summary>
+ /// Debug-only check that an intermediate coefficient fits in a signed (8 + bd)-bit integer.
+ /// </summary>
+ /// <param name="input">Value to check</param>
+ /// <param name="bd">Bit depth used to derive the allowed range</param>
+ /// <returns><paramref name="input"/>, unchanged</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static long HighbdCheckRange(long input, int bd)
+ {
+     // Valid high bit depth VP9 streams keep intermediate stage coefficients within:
+     //  -  8 bit: signed 16 bit integer
+     //  - 10 bit: signed 18 bit integer
+     //  - 12 bit: signed 20 bit integer
+     int upperBound = (1 << (7 + bd)) - 1;
+     int lowerBound = -upperBound - 1;
+
+     Debug.Assert(input >= lowerBound && input <= upperBound);
+
+     return input;
+ }
+
+ /// <summary>
+ /// Range-checks the value (debug builds only) and keeps its low 16 bits, sign-extended to int.
+ /// </summary>
+ /// <param name="x">Value to wrap</param>
+ /// <returns>The value truncated to a signed 16-bit integer</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static int WrapLow(long x) => (short)CheckRange(x);
+
+ /// <summary>
+ /// Range-checks the value (debug builds only) and keeps its low (8 + bd) bits, sign-extended to int.
+ /// </summary>
+ /// <param name="x">Value to wrap</param>
+ /// <param name="bd">Bit depth that selects how many low bits are kept</param>
+ /// <returns>The wrapped value</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static int HighbdWrapLow(long x, int bd)
+ {
+     int discardedBits = 24 - bd;
+     int truncated = (int)HighbdCheckRange(x, bd);
+
+     // Shift the unwanted high bits out, then shift back arithmetically to sign-extend
+     return (truncated << discardedBits) >> discardedBits;
+ }
+
+ /// <summary>
+ /// Adds a residual, wrapped to 16 bits, to a pixel and clips the sum with BitUtils.ClipPixel.
+ /// </summary>
+ /// <param name="dest">Current pixel value</param>
+ /// <param name="trans">Residual to add</param>
+ /// <returns>The clipped pixel value</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static byte ClipPixelAdd(byte dest, long trans)
+ {
+     int residual = WrapLow(trans);
+
+     return BitUtils.ClipPixel(dest + residual);
+ }
+
+ /// <summary>
+ /// Adds a residual, wrapped for the given bit depth, to a pixel and clips the sum with
+ /// BitUtils.ClipPixelHighbd.
+ /// </summary>
+ /// <param name="dest">Current pixel value</param>
+ /// <param name="trans">Residual to add</param>
+ /// <param name="bd">Bit depth forwarded to the wrap and clip helpers</param>
+ /// <returns>The clipped pixel value</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static ushort HighbdClipPixelAdd(ushort dest, long trans, int bd)
+ {
+     int residual = HighbdWrapLow(trans, bd);
+
+     return BitUtils.ClipPixelHighbd(dest + residual, bd);
+ }
+
+ /// <summary>
+ /// Rounds a product of transform constants back down by DctConstBits bits.
+ /// </summary>
+ /// <param name="input">Value to round and shift</param>
+ /// <returns>Result of BitUtils.RoundPowerOfTwo for DctConstBits</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static long DctConstRoundShift(long input) => BitUtils.RoundPowerOfTwo(input, DctConstBits);
+
+ /// <summary>
+ /// Applies the 4x4 inverse Walsh-Hadamard transform to 16 coefficients and adds the
+ /// result to the destination block.
+ /// </summary>
+ /// <param name="input">Coefficients, 4 per row; the first 16 are read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     // 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel.
+     Span<int> rowPass = stackalloc int[16];
+
+     // Horizontal pass: one group of 4 coefficients at a time
+     for (int row = 0; row < 4; row++)
+     {
+         int offset = row * 4;
+
+         long a = input[offset] >> UnitQuantShift;
+         long c = input[offset + 1] >> UnitQuantShift;
+         long d = input[offset + 2] >> UnitQuantShift;
+         long b = input[offset + 3] >> UnitQuantShift;
+
+         a += c;
+         d -= b;
+         long e = (a - d) >> 1;
+         b = e - b;
+         c = e - c;
+         a -= b;
+         d += c;
+
+         rowPass[offset] = WrapLow(a);
+         rowPass[offset + 1] = WrapLow(b);
+         rowPass[offset + 2] = WrapLow(c);
+         rowPass[offset + 3] = WrapLow(d);
+     }
+
+     // Vertical pass: same butterfly down each column, accumulated into the destination
+     for (int col = 0; col < 4; col++)
+     {
+         Span<byte> column = dest.Slice(col);
+
+         long a = rowPass[col];
+         long c = rowPass[col + 4];
+         long d = rowPass[col + 8];
+         long b = rowPass[col + 12];
+
+         a += c;
+         d -= b;
+         long e = (a - d) >> 1;
+         b = e - b;
+         c = e - c;
+         a -= b;
+         d += c;
+
+         column[0] = ClipPixelAdd(column[0], WrapLow(a));
+         column[stride] = ClipPixelAdd(column[stride], WrapLow(b));
+         column[stride * 2] = ClipPixelAdd(column[stride * 2], WrapLow(c));
+         column[stride * 3] = ClipPixelAdd(column[stride * 3], WrapLow(d));
+     }
+ }
+
+ /// <summary>
+ /// Inverse Walsh-Hadamard 4x4 variant that only reads the first coefficient and adds
+ /// the resulting block to the destination.
+ /// </summary>
+ /// <param name="input">Coefficients; only element 0 is read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     Span<int> firstPass = stackalloc int[4];
+
+     // Horizontal pass: split the single coefficient into a first value and a shared remainder
+     long dc = input[0] >> UnitQuantShift;
+     long half = dc >> 1;
+     dc -= half;
+
+     firstPass[0] = WrapLow(dc);
+     int shared = WrapLow(half);
+     firstPass[1] = shared;
+     firstPass[2] = shared;
+     firstPass[3] = shared;
+
+     // Vertical pass: repeat the same split for each column and accumulate
+     for (int col = 0; col < 4; col++)
+     {
+         Span<byte> column = dest.Slice(col);
+         int value = firstPass[col];
+
+         long lower = value >> 1;
+         long upper = value - lower;
+
+         column[0] = ClipPixelAdd(column[0], upper);
+
+         for (int row = 1; row < 4; row++)
+         {
+             column[stride * row] = ClipPixelAdd(column[stride * row], lower);
+         }
+     }
+ }
+
+ /// <summary>
+ /// 4-point inverse ADST: reads 4 values from <paramref name="input"/> and writes 4 values
+ /// to <paramref name="output"/>.
+ /// </summary>
+ /// <param name="input">Input coefficients; elements 0 to 3 are read</param>
+ /// <param name="output">Output buffer; elements 0 to 3 are written</param>
+ public static void Iadst4(ReadOnlySpan<int> input, Span<int> output)
+ {
+     int in0 = input[0];
+     int in1 = input[1];
+     int in2 = input[2];
+     int in3 = input[3];
+
+     // Nothing to compute when every input is zero
+     if (in0 == 0 && in1 == 0 && in2 == 0 && in3 == 0)
+     {
+         output.Slice(0, 4).Clear();
+
+         return;
+     }
+
+     // A 32-bit result is enough for each individual product below.
+     long prod0 = SinPi1_9 * in0;
+     long prod1 = SinPi2_9 * in0;
+     long prod2 = SinPi3_9 * in1;
+     long prod3 = SinPi4_9 * in2;
+     long prod4 = SinPi1_9 * in2;
+     long prod5 = SinPi2_9 * in3;
+     long prod6 = SinPi4_9 * in3;
+     long mixed = WrapLow(in0 - in2 + in3);
+
+     long sumA = prod0 + prod3 + prod5;
+     long sumB = prod1 - prod4 - prod6;
+     long middle = prod2;
+     long scaled = SinPi3_9 * mixed;
+
+     // The 1-D transform scaling factor is sqrt(2). The overall dynamic range is
+     // 14b (input) + 14b (multiplication scaling) + 1b (addition) = 29b,
+     // so the output bit depth is 15b.
+     output[0] = WrapLow(DctConstRoundShift(sumA + middle));
+     output[1] = WrapLow(DctConstRoundShift(sumB + middle));
+     output[2] = WrapLow(DctConstRoundShift(scaled));
+     output[3] = WrapLow(DctConstRoundShift(sumA + sumB - middle));
+ }
+
+ /// <summary>
+ /// 4-point inverse DCT: reads 4 values from <paramref name="input"/> and writes 4 values
+ /// to <paramref name="output"/>.
+ /// </summary>
+ /// <param name="input">Input coefficients; elements 0 to 3 are read and truncated to 16 bits</param>
+ /// <param name="output">Output buffer; elements 0 to 3 are written</param>
+ public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
+ {
+     short in0 = (short)input[0];
+     short in2 = (short)input[2];
+
+     // Stage 1, even half: sum and difference of inputs 0 and 2
+     long evenSum = (in0 + in2) * CosPi16_64;
+     long evenDiff = (in0 - in2) * CosPi16_64;
+     short even0 = (short)WrapLow(DctConstRoundShift(evenSum));
+     short even1 = (short)WrapLow(DctConstRoundShift(evenDiff));
+
+     short in1 = (short)input[1];
+     short in3 = (short)input[3];
+
+     // Stage 1, odd half: inputs 1 and 3 combined through the CosPi24_64/CosPi8_64 pair
+     long oddLow = in1 * CosPi24_64 - in3 * CosPi8_64;
+     long oddHigh = in1 * CosPi8_64 + in3 * CosPi24_64;
+     short odd0 = (short)WrapLow(DctConstRoundShift(oddLow));
+     short odd1 = (short)WrapLow(DctConstRoundShift(oddHigh));
+
+     // Stage 2: final butterfly
+     output[0] = WrapLow(even0 + odd1);
+     output[1] = WrapLow(even1 + odd0);
+     output[2] = WrapLow(even1 - odd0);
+     output[3] = WrapLow(even0 - odd1);
+ }
+
+ /// <summary>
+ /// Full 4x4 inverse DCT: transforms all 4 rows, then all 4 columns, and adds the rounded
+ /// result to the destination block.
+ /// </summary>
+ /// <param name="input">Coefficients, 4 per row; the first 16 are read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     const int Size = 4;
+
+     Span<int> rowPass = stackalloc int[Size * Size];
+     Span<int> column = stackalloc int[Size];
+     Span<int> columnOut = stackalloc int[Size];
+
+     // Row pass
+     for (int row = 0; row < Size; ++row)
+     {
+         Idct4(input.Slice(row * Size), rowPass.Slice(row * Size));
+     }
+
+     // Column pass: gather a column, transform it, then accumulate into the destination
+     for (int col = 0; col < Size; ++col)
+     {
+         for (int row = 0; row < Size; ++row)
+         {
+             column[row] = rowPass[row * Size + col];
+         }
+
+         Idct4(column, columnOut);
+
+         for (int row = 0; row < Size; ++row)
+         {
+             int position = row * stride + col;
+
+             dest[position] = ClipPixelAdd(dest[position], BitUtils.RoundPowerOfTwo(columnOut[row], 4));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 4x4 inverse DCT variant that only reads the first coefficient: the same rounded value
+ /// is added to every pixel of the 4x4 destination block.
+ /// </summary>
+ /// <param name="input">Coefficients; only element 0 is read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     // The first coefficient goes through the CosPi16_64 scaling twice (once per dimension)
+     int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
+
+     output = WrapLow(DctConstRoundShift(output * CosPi16_64));
+
+     long a1 = BitUtils.RoundPowerOfTwo(output, 4);
+
+     for (int row = 0; row < 4; row++)
+     {
+         // Slice each row from the original span instead of advancing the span after every
+         // row: advancing past the last row throws ArgumentOutOfRangeException whenever the
+         // span ends inside that row, even though no pixel beyond it is ever touched.
+         Span<byte> line = dest.Slice(row * stride);
+
+         line[0] = ClipPixelAdd(line[0], a1);
+         line[1] = ClipPixelAdd(line[1], a1);
+         line[2] = ClipPixelAdd(line[2], a1);
+         line[3] = ClipPixelAdd(line[3], a1);
+     }
+ }
+
+ public static void Iadst8(ReadOnlySpan<int> input, Span<int> output) // 8-point inverse ADST (per the method name): reads input[0..7] and writes output[0..7].
+ {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ long x0 = input[7]; // The eight inputs are loaded in the permuted order 7, 0, 5, 2, 3, 4, 1, 6.
+ long x1 = input[0];
+ long x2 = input[5];
+ long x3 = input[2];
+ long x4 = input[3];
+ long x5 = input[4];
+ long x6 = input[1];
+ long x7 = input[6];
+
+ if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0) // Every input is zero: write eight zeros and skip the arithmetic.
+ {
+ output.Slice(0, 8).Fill(0);
+ return;
+ }
+
+ // stage 1: combine the pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7) with CosPi constants; each 64-bit sum is truncated to int
+ s0 = (int)(CosPi2_64 * x0 + CosPi30_64 * x1);
+ s1 = (int)(CosPi30_64 * x0 - CosPi2_64 * x1);
+ s2 = (int)(CosPi10_64 * x2 + CosPi22_64 * x3);
+ s3 = (int)(CosPi22_64 * x2 - CosPi10_64 * x3);
+ s4 = (int)(CosPi18_64 * x4 + CosPi14_64 * x5);
+ s5 = (int)(CosPi14_64 * x4 - CosPi18_64 * x5);
+ s6 = (int)(CosPi26_64 * x6 + CosPi6_64 * x7);
+ s7 = (int)(CosPi6_64 * x6 - CosPi26_64 * x7);
+
+ x0 = WrapLow(DctConstRoundShift(s0 + s4)); // sK +/- s(K+4), rounded by DctConstBits and wrapped to 16 bits
+ x1 = WrapLow(DctConstRoundShift(s1 + s5));
+ x2 = WrapLow(DctConstRoundShift(s2 + s6));
+ x3 = WrapLow(DctConstRoundShift(s3 + s7));
+ x4 = WrapLow(DctConstRoundShift(s0 - s4));
+ x5 = WrapLow(DctConstRoundShift(s1 - s5));
+ x6 = WrapLow(DctConstRoundShift(s2 - s6));
+ x7 = WrapLow(DctConstRoundShift(s3 - s7));
+
+ // stage 2: x0-x3 pass through unscaled; x4-x7 are combined with the CosPi8_64/CosPi24_64 pair
+ s0 = (int)x0;
+ s1 = (int)x1;
+ s2 = (int)x2;
+ s3 = (int)x3;
+ s4 = (int)(CosPi8_64 * x4 + CosPi24_64 * x5);
+ s5 = (int)(CosPi24_64 * x4 - CosPi8_64 * x5);
+ s6 = (int)(-CosPi24_64 * x6 + CosPi8_64 * x7);
+ s7 = (int)(CosPi8_64 * x6 + CosPi24_64 * x7);
+
+ x0 = WrapLow(s0 + s2); // unscaled terms only need wrapping, no rounding shift
+ x1 = WrapLow(s1 + s3);
+ x2 = WrapLow(s0 - s2);
+ x3 = WrapLow(s1 - s3);
+ x4 = WrapLow(DctConstRoundShift(s4 + s6));
+ x5 = WrapLow(DctConstRoundShift(s5 + s7));
+ x6 = WrapLow(DctConstRoundShift(s4 - s6));
+ x7 = WrapLow(DctConstRoundShift(s5 - s7));
+
+ // stage 3: sum and difference of (x2,x3) and of (x6,x7), each scaled by CosPi16_64
+ s2 = (int)(CosPi16_64 * (x2 + x3));
+ s3 = (int)(CosPi16_64 * (x2 - x3));
+ s6 = (int)(CosPi16_64 * (x6 + x7));
+ s7 = (int)(CosPi16_64 * (x6 - x7));
+
+ x2 = WrapLow(DctConstRoundShift(s2));
+ x3 = WrapLow(DctConstRoundShift(s3));
+ x6 = WrapLow(DctConstRoundShift(s6));
+ x7 = WrapLow(DctConstRoundShift(s7));
+
+ output[0] = WrapLow(x0); // final permutation; outputs 1, 3, 5 and 7 are negated
+ output[1] = WrapLow(-x4);
+ output[2] = WrapLow(x6);
+ output[3] = WrapLow(-x2);
+ output[4] = WrapLow(x3);
+ output[5] = WrapLow(-x7);
+ output[6] = WrapLow(x5);
+ output[7] = WrapLow(-x1);
+ }
+
+ public static void Idct8(ReadOnlySpan<int> input, Span<int> output) // 8-point inverse DCT (per the method name): reads input[0..7] and writes output[0..7].
+ {
+ Span<short> step1 = stackalloc short[8]; // intermediate values are kept as 16-bit and alternate between step1 and step2
+ Span<short> step2 = stackalloc short[8];
+ long temp1, temp2;
+
+ // stage 1: inputs 0, 4, 2, 6 are copied (truncated to short); the pairs (1,7) and (5,3) are combined with CosPi constants
+ step1[0] = (short)input[0];
+ step1[2] = (short)input[4];
+ step1[1] = (short)input[2];
+ step1[3] = (short)input[6];
+ temp1 = (short)input[1] * CosPi28_64 - (short)input[7] * CosPi4_64;
+ temp2 = (short)input[1] * CosPi4_64 + (short)input[7] * CosPi28_64;
+ step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = (short)input[5] * CosPi12_64 - (short)input[3] * CosPi20_64;
+ temp2 = (short)input[5] * CosPi20_64 + (short)input[3] * CosPi12_64;
+ step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ // stage 2: entries 0-3 are processed with the same operations as Idct4's stage 1; entries 4-7 are combined by add/subtract
+ temp1 = (step1[0] + step1[2]) * CosPi16_64;
+ temp2 = (step1[0] - step1[2]) * CosPi16_64;
+ step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = step1[1] * CosPi24_64 - step1[3] * CosPi8_64;
+ temp2 = step1[1] * CosPi8_64 + step1[3] * CosPi24_64;
+ step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
+ step2[4] = (short)WrapLow(step1[4] + step1[5]);
+ step2[5] = (short)WrapLow(step1[4] - step1[5]);
+ step2[6] = (short)WrapLow(-step1[6] + step1[7]);
+ step2[7] = (short)WrapLow(step1[6] + step1[7]);
+
+ // stage 3: entries 0-3 are combined by add/subtract; 5 and 6 are mixed through CosPi16_64; 4 and 7 are copied
+ step1[0] = (short)WrapLow(step2[0] + step2[3]);
+ step1[1] = (short)WrapLow(step2[1] + step2[2]);
+ step1[2] = (short)WrapLow(step2[1] - step2[2]);
+ step1[3] = (short)WrapLow(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * CosPi16_64;
+ temp2 = (step2[5] + step2[6]) * CosPi16_64;
+ step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+ step1[7] = step2[7];
+
+ // stage 4: output[k] = step1[k] + step1[7 - k] and output[7 - k] = step1[k] - step1[7 - k], for k = 0..3
+ output[0] = WrapLow(step1[0] + step1[7]);
+ output[1] = WrapLow(step1[1] + step1[6]);
+ output[2] = WrapLow(step1[2] + step1[5]);
+ output[3] = WrapLow(step1[3] + step1[4]);
+ output[4] = WrapLow(step1[3] - step1[4]);
+ output[5] = WrapLow(step1[2] - step1[5]);
+ output[6] = WrapLow(step1[1] - step1[6]);
+ output[7] = WrapLow(step1[0] - step1[7]);
+ }
+
+ /// <summary>
+ /// Full 8x8 inverse DCT: transforms all 8 rows, then all 8 columns, and adds the rounded
+ /// result to the destination block.
+ /// </summary>
+ /// <param name="input">Coefficients, 8 per row; the first 64 are read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     const int Size = 8;
+
+     Span<int> rowPass = stackalloc int[Size * Size];
+     Span<int> column = stackalloc int[Size];
+     Span<int> columnOut = stackalloc int[Size];
+
+     // Row pass
+     for (int row = 0; row < Size; ++row)
+     {
+         Idct8(input.Slice(row * Size), rowPass.Slice(row * Size));
+     }
+
+     // Column pass: gather a column, transform it, then accumulate into the destination
+     for (int col = 0; col < Size; ++col)
+     {
+         for (int row = 0; row < Size; ++row)
+         {
+             column[row] = rowPass[row * Size + col];
+         }
+
+         Idct8(column, columnOut);
+
+         for (int row = 0; row < Size; ++row)
+         {
+             int position = row * stride + col;
+
+             dest[position] = ClipPixelAdd(dest[position], BitUtils.RoundPowerOfTwo(columnOut[row], 5));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 8x8 inverse DCT variant for blocks where only the first 4 rows hold non-zero
+ /// coefficients: transforms those 4 rows, then all 8 columns, and adds the rounded result
+ /// to the destination block.
+ /// </summary>
+ /// <param name="input">Coefficients, 8 per row; the first 32 are read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     const int Size = 8;
+     const int NonZeroRows = 4;
+
+     Span<int> output = stackalloc int[Size * Size];
+     Span<int> tempIn = stackalloc int[Size];
+     Span<int> tempOut = stackalloc int[Size];
+
+     // The row pass below only fills the first 4 rows, but the column pass reads all 8.
+     // The content of stackalloc memory is not guaranteed to be zero, so the remaining
+     // rows are cleared explicitly rather than relying on the runtime to zero them.
+     output.Slice(NonZeroRows * Size).Clear();
+
+     // First transform rows. Only the first 4 rows have non-zero coefficients.
+     for (int row = 0; row < NonZeroRows; ++row)
+     {
+         Idct8(input.Slice(row * Size), output.Slice(row * Size));
+     }
+
+     // Then transform columns
+     for (int i = 0; i < Size; ++i)
+     {
+         for (int j = 0; j < Size; ++j)
+         {
+             tempIn[j] = output[j * Size + i];
+         }
+
+         Idct8(tempIn, tempOut);
+
+         for (int j = 0; j < Size; ++j)
+         {
+             dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 8x8 inverse DCT variant that only reads the first coefficient: the same rounded value
+ /// is added to every pixel of the 8x8 destination block.
+ /// </summary>
+ /// <param name="input">Coefficients; only element 0 is read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct8x81Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     // The first coefficient goes through the CosPi16_64 scaling twice (once per dimension)
+     int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
+
+     output = WrapLow(DctConstRoundShift(output * CosPi16_64));
+
+     long a1 = BitUtils.RoundPowerOfTwo(output, 5);
+
+     for (int j = 0; j < 8; ++j)
+     {
+         // Slice each row from the original span instead of advancing the span after every
+         // row: advancing past the last row throws ArgumentOutOfRangeException whenever the
+         // span ends inside that row, even though no pixel beyond it is ever touched.
+         Span<byte> line = dest.Slice(j * stride);
+
+         for (int i = 0; i < 8; ++i)
+         {
+             line[i] = ClipPixelAdd(line[i], a1);
+         }
+     }
+ }
+
+ public static void Iadst16(ReadOnlySpan<int> input, Span<int> output) // 16-point inverse ADST (per the method name): reads input[0..15] and writes output[0..15].
+ {
+ long s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ long s9, s10, s11, s12, s13, s14, s15;
+ long x0 = input[15]; // The sixteen inputs are loaded in a permuted order: even-numbered x take input 15, 13, ..., 1 and odd-numbered x take input 0, 2, ..., 14.
+ long x1 = input[0];
+ long x2 = input[13];
+ long x3 = input[2];
+ long x4 = input[11];
+ long x5 = input[4];
+ long x6 = input[9];
+ long x7 = input[6];
+ long x8 = input[7];
+ long x9 = input[8];
+ long x10 = input[5];
+ long x11 = input[10];
+ long x12 = input[3];
+ long x13 = input[12];
+ long x14 = input[1];
+ long x15 = input[14];
+
+ if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0) // Every input is zero: write sixteen zeros and skip the arithmetic.
+ {
+ output.Slice(0, 16).Fill(0);
+ return;
+ }
+
+ // stage 1: combine each pair (x0,x1), (x2,x3), ..., (x14,x15) with a pair of CosPi constants
+ s0 = x0 * CosPi1_64 + x1 * CosPi31_64;
+ s1 = x0 * CosPi31_64 - x1 * CosPi1_64;
+ s2 = x2 * CosPi5_64 + x3 * CosPi27_64;
+ s3 = x2 * CosPi27_64 - x3 * CosPi5_64;
+ s4 = x4 * CosPi9_64 + x5 * CosPi23_64;
+ s5 = x4 * CosPi23_64 - x5 * CosPi9_64;
+ s6 = x6 * CosPi13_64 + x7 * CosPi19_64;
+ s7 = x6 * CosPi19_64 - x7 * CosPi13_64;
+ s8 = x8 * CosPi17_64 + x9 * CosPi15_64;
+ s9 = x8 * CosPi15_64 - x9 * CosPi17_64;
+ s10 = x10 * CosPi21_64 + x11 * CosPi11_64;
+ s11 = x10 * CosPi11_64 - x11 * CosPi21_64;
+ s12 = x12 * CosPi25_64 + x13 * CosPi7_64;
+ s13 = x12 * CosPi7_64 - x13 * CosPi25_64;
+ s14 = x14 * CosPi29_64 + x15 * CosPi3_64;
+ s15 = x14 * CosPi3_64 - x15 * CosPi29_64;
+
+ x0 = WrapLow(DctConstRoundShift(s0 + s8)); // sK +/- s(K+8), rounded by DctConstBits and wrapped to 16 bits
+ x1 = WrapLow(DctConstRoundShift(s1 + s9));
+ x2 = WrapLow(DctConstRoundShift(s2 + s10));
+ x3 = WrapLow(DctConstRoundShift(s3 + s11));
+ x4 = WrapLow(DctConstRoundShift(s4 + s12));
+ x5 = WrapLow(DctConstRoundShift(s5 + s13));
+ x6 = WrapLow(DctConstRoundShift(s6 + s14));
+ x7 = WrapLow(DctConstRoundShift(s7 + s15));
+ x8 = WrapLow(DctConstRoundShift(s0 - s8));
+ x9 = WrapLow(DctConstRoundShift(s1 - s9));
+ x10 = WrapLow(DctConstRoundShift(s2 - s10));
+ x11 = WrapLow(DctConstRoundShift(s3 - s11));
+ x12 = WrapLow(DctConstRoundShift(s4 - s12));
+ x13 = WrapLow(DctConstRoundShift(s5 - s13));
+ x14 = WrapLow(DctConstRoundShift(s6 - s14));
+ x15 = WrapLow(DctConstRoundShift(s7 - s15));
+
+ // stage 2: x0-x7 pass through unscaled; x8-x15 are combined in pairs with CosPi constants
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * CosPi4_64 + x9 * CosPi28_64;
+ s9 = x8 * CosPi28_64 - x9 * CosPi4_64;
+ s10 = x10 * CosPi20_64 + x11 * CosPi12_64;
+ s11 = x10 * CosPi12_64 - x11 * CosPi20_64;
+ s12 = -x12 * CosPi28_64 + x13 * CosPi4_64;
+ s13 = x12 * CosPi4_64 + x13 * CosPi28_64;
+ s14 = -x14 * CosPi12_64 + x15 * CosPi20_64;
+ s15 = x14 * CosPi20_64 + x15 * CosPi12_64;
+
+ x0 = WrapLow(s0 + s4); // unscaled terms only need wrapping, no rounding shift
+ x1 = WrapLow(s1 + s5);
+ x2 = WrapLow(s2 + s6);
+ x3 = WrapLow(s3 + s7);
+ x4 = WrapLow(s0 - s4);
+ x5 = WrapLow(s1 - s5);
+ x6 = WrapLow(s2 - s6);
+ x7 = WrapLow(s3 - s7);
+ x8 = WrapLow(DctConstRoundShift(s8 + s12));
+ x9 = WrapLow(DctConstRoundShift(s9 + s13));
+ x10 = WrapLow(DctConstRoundShift(s10 + s14));
+ x11 = WrapLow(DctConstRoundShift(s11 + s15));
+ x12 = WrapLow(DctConstRoundShift(s8 - s12));
+ x13 = WrapLow(DctConstRoundShift(s9 - s13));
+ x14 = WrapLow(DctConstRoundShift(s10 - s14));
+ x15 = WrapLow(DctConstRoundShift(s11 - s15));
+
+ // stage 3: x0-x3 and x8-x11 pass through unscaled; x4-x7 and x12-x15 are combined with the CosPi8_64/CosPi24_64 pair
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * CosPi8_64 + x5 * CosPi24_64;
+ s5 = x4 * CosPi24_64 - x5 * CosPi8_64;
+ s6 = -x6 * CosPi24_64 + x7 * CosPi8_64;
+ s7 = x6 * CosPi8_64 + x7 * CosPi24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * CosPi8_64 + x13 * CosPi24_64;
+ s13 = x12 * CosPi24_64 - x13 * CosPi8_64;
+ s14 = -x14 * CosPi24_64 + x15 * CosPi8_64;
+ s15 = x14 * CosPi8_64 + x15 * CosPi24_64;
+
+ x0 = WrapLow(s0 + s2);
+ x1 = WrapLow(s1 + s3);
+ x2 = WrapLow(s0 - s2);
+ x3 = WrapLow(s1 - s3);
+ x4 = WrapLow(DctConstRoundShift(s4 + s6));
+ x5 = WrapLow(DctConstRoundShift(s5 + s7));
+ x6 = WrapLow(DctConstRoundShift(s4 - s6));
+ x7 = WrapLow(DctConstRoundShift(s5 - s7));
+ x8 = WrapLow(s8 + s10);
+ x9 = WrapLow(s9 + s11);
+ x10 = WrapLow(s8 - s10);
+ x11 = WrapLow(s9 - s11);
+ x12 = WrapLow(DctConstRoundShift(s12 + s14));
+ x13 = WrapLow(DctConstRoundShift(s13 + s15));
+ x14 = WrapLow(DctConstRoundShift(s12 - s14));
+ x15 = WrapLow(DctConstRoundShift(s13 - s15));
+
+ // stage 4: sum and difference of (x2,x3), (x6,x7), (x10,x11), (x14,x15), scaled by +/-CosPi16_64; the other x values are left unchanged
+ s2 = (-CosPi16_64) * (x2 + x3);
+ s3 = CosPi16_64 * (x2 - x3);
+ s6 = CosPi16_64 * (x6 + x7);
+ s7 = CosPi16_64 * (-x6 + x7);
+ s10 = CosPi16_64 * (x10 + x11);
+ s11 = CosPi16_64 * (-x10 + x11);
+ s14 = (-CosPi16_64) * (x14 + x15);
+ s15 = CosPi16_64 * (x14 - x15);
+
+ x2 = WrapLow(DctConstRoundShift(s2));
+ x3 = WrapLow(DctConstRoundShift(s3));
+ x6 = WrapLow(DctConstRoundShift(s6));
+ x7 = WrapLow(DctConstRoundShift(s7));
+ x10 = WrapLow(DctConstRoundShift(s10));
+ x11 = WrapLow(DctConstRoundShift(s11));
+ x14 = WrapLow(DctConstRoundShift(s14));
+ x15 = WrapLow(DctConstRoundShift(s15));
+
+ output[0] = WrapLow(x0); // final permutation; outputs 1, 3, 13 and 15 are negated
+ output[1] = WrapLow(-x8);
+ output[2] = WrapLow(x12);
+ output[3] = WrapLow(-x4);
+ output[4] = WrapLow(x6);
+ output[5] = WrapLow(x14);
+ output[6] = WrapLow(x10);
+ output[7] = WrapLow(x2);
+ output[8] = WrapLow(x3);
+ output[9] = WrapLow(x11);
+ output[10] = WrapLow(x15);
+ output[11] = WrapLow(x7);
+ output[12] = WrapLow(x5);
+ output[13] = WrapLow(-x13);
+ output[14] = WrapLow(x9);
+ output[15] = WrapLow(-x1);
+ }
+
+ public static void Idct16(ReadOnlySpan<int> input, Span<int> output) // 16-point inverse DCT (per the method name): reads input[0..15] and writes output[0..15].
+ {
+ Span<short> step1 = stackalloc short[16]; // intermediate values are kept as 16-bit and alternate between step1 and step2
+ Span<short> step2 = stackalloc short[16];
+ long temp1, temp2;
+
+ // stage 1: load the 16 inputs, truncated to short, in permuted order (the indices are constant integer divisions, e.g. 16 / 2 == 8)
+ step1[0] = (short)input[0 / 2];
+ step1[1] = (short)input[16 / 2];
+ step1[2] = (short)input[8 / 2];
+ step1[3] = (short)input[24 / 2];
+ step1[4] = (short)input[4 / 2];
+ step1[5] = (short)input[20 / 2];
+ step1[6] = (short)input[12 / 2];
+ step1[7] = (short)input[28 / 2];
+ step1[8] = (short)input[2 / 2];
+ step1[9] = (short)input[18 / 2];
+ step1[10] = (short)input[10 / 2];
+ step1[11] = (short)input[26 / 2];
+ step1[12] = (short)input[6 / 2];
+ step1[13] = (short)input[22 / 2];
+ step1[14] = (short)input[14 / 2];
+ step1[15] = (short)input[30 / 2];
+
+ // stage 2: entries 0-7 are copied; the pairs (8,15), (9,14), (10,13), (11,12) are combined with CosPi constants
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64;
+ temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64;
+ step2[8] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[15] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64;
+ temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64;
+ step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64;
+ temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64;
+ step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64;
+ temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64;
+ step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ // stage 3: entries 0-3 are copied; the pairs (4,7) and (5,6) are combined with CosPi constants; entries 8-15 are combined by add/subtract
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64;
+ temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64;
+ step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64;
+ temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64;
+ step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ step1[8] = (short)WrapLow(step2[8] + step2[9]);
+ step1[9] = (short)WrapLow(step2[8] - step2[9]);
+ step1[10] = (short)WrapLow(-step2[10] + step2[11]);
+ step1[11] = (short)WrapLow(step2[10] + step2[11]);
+ step1[12] = (short)WrapLow(step2[12] + step2[13]);
+ step1[13] = (short)WrapLow(step2[12] - step2[13]);
+ step1[14] = (short)WrapLow(-step2[14] + step2[15]);
+ step1[15] = (short)WrapLow(step2[14] + step2[15]);
+
+ // stage 4: (0,1) and (2,3) are combined with CosPi constants; 4-7 are combined by add/subtract; (9,14) and (10,13) are combined with CosPi constants; 8, 11, 12 and 15 are copied
+ temp1 = (step1[0] + step1[1]) * CosPi16_64;
+ temp2 = (step1[0] - step1[1]) * CosPi16_64;
+ step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64;
+ temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64;
+ step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
+ step2[4] = (short)WrapLow(step1[4] + step1[5]);
+ step2[5] = (short)WrapLow(step1[4] - step1[5]);
+ step2[6] = (short)WrapLow(-step1[6] + step1[7]);
+ step2[7] = (short)WrapLow(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64;
+ temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64;
+ step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64;
+ temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64;
+ step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5: entries 0-3 and 8-15 are combined by add/subtract; 5 and 6 are mixed through CosPi16_64; 4 and 7 are copied
+ step1[0] = (short)WrapLow(step2[0] + step2[3]);
+ step1[1] = (short)WrapLow(step2[1] + step2[2]);
+ step1[2] = (short)WrapLow(step2[1] - step2[2]);
+ step1[3] = (short)WrapLow(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * CosPi16_64;
+ temp2 = (step2[5] + step2[6]) * CosPi16_64;
+ step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = (short)WrapLow(step2[8] + step2[11]);
+ step1[9] = (short)WrapLow(step2[9] + step2[10]);
+ step1[10] = (short)WrapLow(step2[9] - step2[10]);
+ step1[11] = (short)WrapLow(step2[8] - step2[11]);
+ step1[12] = (short)WrapLow(-step2[12] + step2[15]);
+ step1[13] = (short)WrapLow(-step2[13] + step2[14]);
+ step1[14] = (short)WrapLow(step2[13] + step2[14]);
+ step1[15] = (short)WrapLow(step2[12] + step2[15]);
+
+ // stage 6: entries 0-7 are combined by add/subtract; (10,13) and (11,12) are mixed through CosPi16_64; 8, 9, 14 and 15 are copied
+ step2[0] = (short)WrapLow(step1[0] + step1[7]);
+ step2[1] = (short)WrapLow(step1[1] + step1[6]);
+ step2[2] = (short)WrapLow(step1[2] + step1[5]);
+ step2[3] = (short)WrapLow(step1[3] + step1[4]);
+ step2[4] = (short)WrapLow(step1[3] - step1[4]);
+ step2[5] = (short)WrapLow(step1[2] - step1[5]);
+ step2[6] = (short)WrapLow(step1[1] - step1[6]);
+ step2[7] = (short)WrapLow(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * CosPi16_64;
+ temp2 = (step1[10] + step1[13]) * CosPi16_64;
+ step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = (-step1[11] + step1[12]) * CosPi16_64;
+ temp2 = (step1[11] + step1[12]) * CosPi16_64;
+ step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7: output[k] = step2[k] + step2[15 - k] and output[15 - k] = step2[k] - step2[15 - k], for k = 0..7
+ output[0] = WrapLow(step2[0] + step2[15]);
+ output[1] = WrapLow(step2[1] + step2[14]);
+ output[2] = WrapLow(step2[2] + step2[13]);
+ output[3] = WrapLow(step2[3] + step2[12]);
+ output[4] = WrapLow(step2[4] + step2[11]);
+ output[5] = WrapLow(step2[5] + step2[10]);
+ output[6] = WrapLow(step2[6] + step2[9]);
+ output[7] = WrapLow(step2[7] + step2[8]);
+ output[8] = WrapLow(step2[7] - step2[8]);
+ output[9] = WrapLow(step2[6] - step2[9]);
+ output[10] = WrapLow(step2[5] - step2[10]);
+ output[11] = WrapLow(step2[4] - step2[11]);
+ output[12] = WrapLow(step2[3] - step2[12]);
+ output[13] = WrapLow(step2[2] - step2[13]);
+ output[14] = WrapLow(step2[1] - step2[14]);
+ output[15] = WrapLow(step2[0] - step2[15]);
+ }
+
+ /// <summary>
+ /// Full 16x16 inverse DCT: transforms all 16 rows, then all 16 columns, and adds the
+ /// rounded result to the destination block.
+ /// </summary>
+ /// <param name="input">Coefficients, 16 per row; the first 256 are read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     const int Size = 16;
+
+     Span<int> rowPass = stackalloc int[Size * Size];
+     Span<int> column = stackalloc int[Size];
+     Span<int> columnOut = stackalloc int[Size];
+
+     // Row pass
+     for (int row = 0; row < Size; ++row)
+     {
+         Idct16(input.Slice(row * Size), rowPass.Slice(row * Size));
+     }
+
+     // Column pass: gather a column, transform it, then accumulate into the destination
+     for (int col = 0; col < Size; ++col)
+     {
+         for (int row = 0; row < Size; ++row)
+         {
+             column[row] = rowPass[row * Size + col];
+         }
+
+         Idct16(column, columnOut);
+
+         for (int row = 0; row < Size; ++row)
+         {
+             int position = row * stride + col;
+
+             dest[position] = ClipPixelAdd(dest[position], BitUtils.RoundPowerOfTwo(columnOut[row], 6));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 16x16 inverse DCT variant for blocks whose non-zero coefficients all lie in the
+ /// upper-left 8x8 area: transforms the first 8 rows, then all 16 columns, and adds the
+ /// rounded result to the destination block.
+ /// </summary>
+ /// <param name="input">Coefficients, 16 per row; the first 128 are read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     const int Size = 16;
+     const int NonZeroRows = 8;
+
+     Span<int> output = stackalloc int[Size * Size];
+     Span<int> tempIn = stackalloc int[Size];
+     Span<int> tempOut = stackalloc int[Size];
+
+     // The row pass below only fills the first 8 rows, but the column pass reads all 16.
+     // The content of stackalloc memory is not guaranteed to be zero, so the remaining
+     // rows are cleared explicitly rather than relying on the runtime to zero them.
+     output.Slice(NonZeroRows * Size).Clear();
+
+     // First transform rows. Since all non-zero dct coefficients are in the
+     // upper-left 8x8 area, only the first 8 rows need to be calculated here.
+     for (int row = 0; row < NonZeroRows; ++row)
+     {
+         Idct16(input.Slice(row * Size), output.Slice(row * Size));
+     }
+
+     // Then transform columns
+     for (int i = 0; i < Size; ++i)
+     {
+         for (int j = 0; j < Size; ++j)
+         {
+             tempIn[j] = output[j * Size + i];
+         }
+
+         Idct16(tempIn, tempOut);
+
+         for (int j = 0; j < Size; ++j)
+         {
+             dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 16x16 inverse DCT variant for blocks whose non-zero coefficients all lie in the
+ /// upper-left 4x4 area: transforms the first 4 rows, then all 16 columns, and adds the
+ /// rounded result to the destination block.
+ /// </summary>
+ /// <param name="input">Coefficients, 16 per row; the first 64 are read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     const int Size = 16;
+     const int NonZeroRows = 4;
+
+     Span<int> output = stackalloc int[Size * Size];
+     Span<int> tempIn = stackalloc int[Size];
+     Span<int> tempOut = stackalloc int[Size];
+
+     // The row pass below only fills the first 4 rows, but the column pass reads all 16.
+     // The content of stackalloc memory is not guaranteed to be zero, so the remaining
+     // rows are cleared explicitly rather than relying on the runtime to zero them.
+     output.Slice(NonZeroRows * Size).Clear();
+
+     // First transform rows. Since all non-zero dct coefficients are in the
+     // upper-left 4x4 area, only the first 4 rows need to be calculated here.
+     for (int row = 0; row < NonZeroRows; ++row)
+     {
+         Idct16(input.Slice(row * Size), output.Slice(row * Size));
+     }
+
+     // Then transform columns
+     for (int i = 0; i < Size; ++i)
+     {
+         for (int j = 0; j < Size; ++j)
+         {
+             tempIn[j] = output[j * Size + i];
+         }
+
+         Idct16(tempIn, tempOut);
+
+         for (int j = 0; j < Size; ++j)
+         {
+             dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 16x16 inverse DCT variant that only reads the first coefficient: the same rounded
+ /// value is added to every pixel of the 16x16 destination block.
+ /// </summary>
+ /// <param name="input">Coefficients; only element 0 is read</param>
+ /// <param name="dest">Destination pixels, updated in place</param>
+ /// <param name="stride">Distance in elements between two destination rows</param>
+ public static void Idct16x161Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     // The first coefficient goes through the CosPi16_64 scaling twice (once per dimension)
+     int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
+
+     output = WrapLow(DctConstRoundShift(output * CosPi16_64));
+
+     long a1 = BitUtils.RoundPowerOfTwo(output, 6);
+
+     for (int j = 0; j < 16; ++j)
+     {
+         // Slice each row from the original span instead of advancing the span after every
+         // row: advancing past the last row throws ArgumentOutOfRangeException whenever the
+         // span ends inside that row, even though no pixel beyond it is ever touched.
+         Span<byte> line = dest.Slice(j * stride);
+
+         for (int i = 0; i < 16; ++i)
+         {
+             line[i] = ClipPixelAdd(line[i], a1);
+         }
+     }
+ }
+
+ public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
+ {
+ Span<short> step1 = stackalloc short[32];
+ Span<short> step2 = stackalloc short[32];
+ long temp1, temp2;
+
+ // stage 1
+ step1[0] = (short)input[0];
+ step1[1] = (short)input[16];
+ step1[2] = (short)input[8];
+ step1[3] = (short)input[24];
+ step1[4] = (short)input[4];
+ step1[5] = (short)input[20];
+ step1[6] = (short)input[12];
+ step1[7] = (short)input[28];
+ step1[8] = (short)input[2];
+ step1[9] = (short)input[18];
+ step1[10] = (short)input[10];
+ step1[11] = (short)input[26];
+ step1[12] = (short)input[6];
+ step1[13] = (short)input[22];
+ step1[14] = (short)input[14];
+ step1[15] = (short)input[30];
+
+ temp1 = (short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64;
+ temp2 = (short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64;
+ step1[16] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[31] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = (short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64;
+ temp2 = (short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64;
+ step1[17] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[30] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = (short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64;
+ temp2 = (short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64;
+ step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = (short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64;
+ temp2 = (short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64;
+ step1[19] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[28] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = (short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64;
+ temp2 = (short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64;
+ step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = (short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64;
+ temp2 = (short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64;
+ step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = (short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64;
+ temp2 = (short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64;
+ step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = (short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64;
+ temp2 = (short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64;
+ step1[23] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[24] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64;
+ temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64;
+ step2[8] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[15] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64;
+ temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64;
+ step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64;
+ temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64;
+ step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64;
+ temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64;
+ step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ step2[16] = (short)WrapLow(step1[16] + step1[17]);
+ step2[17] = (short)WrapLow(step1[16] - step1[17]);
+ step2[18] = (short)WrapLow(-step1[18] + step1[19]);
+ step2[19] = (short)WrapLow(step1[18] + step1[19]);
+ step2[20] = (short)WrapLow(step1[20] + step1[21]);
+ step2[21] = (short)WrapLow(step1[20] - step1[21]);
+ step2[22] = (short)WrapLow(-step1[22] + step1[23]);
+ step2[23] = (short)WrapLow(step1[22] + step1[23]);
+ step2[24] = (short)WrapLow(step1[24] + step1[25]);
+ step2[25] = (short)WrapLow(step1[24] - step1[25]);
+ step2[26] = (short)WrapLow(-step1[26] + step1[27]);
+ step2[27] = (short)WrapLow(step1[26] + step1[27]);
+ step2[28] = (short)WrapLow(step1[28] + step1[29]);
+ step2[29] = (short)WrapLow(step1[28] - step1[29]);
+ step2[30] = (short)WrapLow(-step1[30] + step1[31]);
+ step2[31] = (short)WrapLow(step1[30] + step1[31]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64;
+ temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64;
+ step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64;
+ temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64;
+ step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+
+ step1[8] = (short)WrapLow(step2[8] + step2[9]);
+ step1[9] = (short)WrapLow(step2[8] - step2[9]);
+ step1[10] = (short)WrapLow(-step2[10] + step2[11]);
+ step1[11] = (short)WrapLow(step2[10] + step2[11]);
+ step1[12] = (short)WrapLow(step2[12] + step2[13]);
+ step1[13] = (short)WrapLow(step2[12] - step2[13]);
+ step1[14] = (short)WrapLow(-step2[14] + step2[15]);
+ step1[15] = (short)WrapLow(step2[14] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * CosPi4_64 + step2[30] * CosPi28_64;
+ temp2 = step2[17] * CosPi28_64 + step2[30] * CosPi4_64;
+ step1[17] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[30] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = -step2[18] * CosPi28_64 - step2[29] * CosPi4_64;
+ temp2 = -step2[18] * CosPi4_64 + step2[29] * CosPi28_64;
+ step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * CosPi20_64 + step2[26] * CosPi12_64;
+ temp2 = step2[21] * CosPi12_64 + step2[26] * CosPi20_64;
+ step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = -step2[22] * CosPi12_64 - step2[25] * CosPi20_64;
+ temp2 = -step2[22] * CosPi20_64 + step2[25] * CosPi12_64;
+ step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * CosPi16_64;
+ temp2 = (step1[0] - step1[1]) * CosPi16_64;
+ step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64;
+ temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64;
+ step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
+ step2[4] = (short)WrapLow(step1[4] + step1[5]);
+ step2[5] = (short)WrapLow(step1[4] - step1[5]);
+ step2[6] = (short)WrapLow(-step1[6] + step1[7]);
+ step2[7] = (short)WrapLow(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64;
+ temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64;
+ step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64;
+ temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64;
+ step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = (short)WrapLow(step1[16] + step1[19]);
+ step2[17] = (short)WrapLow(step1[17] + step1[18]);
+ step2[18] = (short)WrapLow(step1[17] - step1[18]);
+ step2[19] = (short)WrapLow(step1[16] - step1[19]);
+ step2[20] = (short)WrapLow(-step1[20] + step1[23]);
+ step2[21] = (short)WrapLow(-step1[21] + step1[22]);
+ step2[22] = (short)WrapLow(step1[21] + step1[22]);
+ step2[23] = (short)WrapLow(step1[20] + step1[23]);
+
+ step2[24] = (short)WrapLow(step1[24] + step1[27]);
+ step2[25] = (short)WrapLow(step1[25] + step1[26]);
+ step2[26] = (short)WrapLow(step1[25] - step1[26]);
+ step2[27] = (short)WrapLow(step1[24] - step1[27]);
+ step2[28] = (short)WrapLow(-step1[28] + step1[31]);
+ step2[29] = (short)WrapLow(-step1[29] + step1[30]);
+ step2[30] = (short)WrapLow(step1[29] + step1[30]);
+ step2[31] = (short)WrapLow(step1[28] + step1[31]);
+
+ // stage 5
+ step1[0] = (short)WrapLow(step2[0] + step2[3]);
+ step1[1] = (short)WrapLow(step2[1] + step2[2]);
+ step1[2] = (short)WrapLow(step2[1] - step2[2]);
+ step1[3] = (short)WrapLow(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * CosPi16_64;
+ temp2 = (step2[5] + step2[6]) * CosPi16_64;
+ step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = (short)WrapLow(step2[8] + step2[11]);
+ step1[9] = (short)WrapLow(step2[9] + step2[10]);
+ step1[10] = (short)WrapLow(step2[9] - step2[10]);
+ step1[11] = (short)WrapLow(step2[8] - step2[11]);
+ step1[12] = (short)WrapLow(-step2[12] + step2[15]);
+ step1[13] = (short)WrapLow(-step2[13] + step2[14]);
+ step1[14] = (short)WrapLow(step2[13] + step2[14]);
+ step1[15] = (short)WrapLow(step2[12] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * CosPi8_64 + step2[29] * CosPi24_64;
+ temp2 = step2[18] * CosPi24_64 + step2[29] * CosPi8_64;
+ step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = -step2[19] * CosPi8_64 + step2[28] * CosPi24_64;
+ temp2 = step2[19] * CosPi24_64 + step2[28] * CosPi8_64;
+ step1[19] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[28] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = -step2[20] * CosPi24_64 - step2[27] * CosPi8_64;
+ temp2 = -step2[20] * CosPi8_64 + step2[27] * CosPi24_64;
+ step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = -step2[21] * CosPi24_64 - step2[26] * CosPi8_64;
+ temp2 = -step2[21] * CosPi8_64 + step2[26] * CosPi24_64;
+ step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = (short)WrapLow(step1[0] + step1[7]);
+ step2[1] = (short)WrapLow(step1[1] + step1[6]);
+ step2[2] = (short)WrapLow(step1[2] + step1[5]);
+ step2[3] = (short)WrapLow(step1[3] + step1[4]);
+ step2[4] = (short)WrapLow(step1[3] - step1[4]);
+ step2[5] = (short)WrapLow(step1[2] - step1[5]);
+ step2[6] = (short)WrapLow(step1[1] - step1[6]);
+ step2[7] = (short)WrapLow(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * CosPi16_64;
+ temp2 = (step1[10] + step1[13]) * CosPi16_64;
+ step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = (-step1[11] + step1[12]) * CosPi16_64;
+ temp2 = (step1[11] + step1[12]) * CosPi16_64;
+ step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
+ step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = (short)WrapLow(step1[16] + step1[23]);
+ step2[17] = (short)WrapLow(step1[17] + step1[22]);
+ step2[18] = (short)WrapLow(step1[18] + step1[21]);
+ step2[19] = (short)WrapLow(step1[19] + step1[20]);
+ step2[20] = (short)WrapLow(step1[19] - step1[20]);
+ step2[21] = (short)WrapLow(step1[18] - step1[21]);
+ step2[22] = (short)WrapLow(step1[17] - step1[22]);
+ step2[23] = (short)WrapLow(step1[16] - step1[23]);
+
+ step2[24] = (short)WrapLow(-step1[24] + step1[31]);
+ step2[25] = (short)WrapLow(-step1[25] + step1[30]);
+ step2[26] = (short)WrapLow(-step1[26] + step1[29]);
+ step2[27] = (short)WrapLow(-step1[27] + step1[28]);
+ step2[28] = (short)WrapLow(step1[27] + step1[28]);
+ step2[29] = (short)WrapLow(step1[26] + step1[29]);
+ step2[30] = (short)WrapLow(step1[25] + step1[30]);
+ step2[31] = (short)WrapLow(step1[24] + step1[31]);
+
+ // stage 7
+ step1[0] = (short)WrapLow(step2[0] + step2[15]);
+ step1[1] = (short)WrapLow(step2[1] + step2[14]);
+ step1[2] = (short)WrapLow(step2[2] + step2[13]);
+ step1[3] = (short)WrapLow(step2[3] + step2[12]);
+ step1[4] = (short)WrapLow(step2[4] + step2[11]);
+ step1[5] = (short)WrapLow(step2[5] + step2[10]);
+ step1[6] = (short)WrapLow(step2[6] + step2[9]);
+ step1[7] = (short)WrapLow(step2[7] + step2[8]);
+ step1[8] = (short)WrapLow(step2[7] - step2[8]);
+ step1[9] = (short)WrapLow(step2[6] - step2[9]);
+ step1[10] = (short)WrapLow(step2[5] - step2[10]);
+ step1[11] = (short)WrapLow(step2[4] - step2[11]);
+ step1[12] = (short)WrapLow(step2[3] - step2[12]);
+ step1[13] = (short)WrapLow(step2[2] - step2[13]);
+ step1[14] = (short)WrapLow(step2[1] - step2[14]);
+ step1[15] = (short)WrapLow(step2[0] - step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * CosPi16_64;
+ temp2 = (step2[20] + step2[27]) * CosPi16_64;
+ step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = (-step2[21] + step2[26]) * CosPi16_64;
+ temp2 = (step2[21] + step2[26]) * CosPi16_64;
+ step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = (-step2[22] + step2[25]) * CosPi16_64;
+ temp2 = (step2[22] + step2[25]) * CosPi16_64;
+ step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
+ temp1 = (-step2[23] + step2[24]) * CosPi16_64;
+ temp2 = (step2[23] + step2[24]) * CosPi16_64;
+ step1[23] = (short)WrapLow(DctConstRoundShift(temp1));
+ step1[24] = (short)WrapLow(DctConstRoundShift(temp2));
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WrapLow(step1[0] + step1[31]);
+ output[1] = WrapLow(step1[1] + step1[30]);
+ output[2] = WrapLow(step1[2] + step1[29]);
+ output[3] = WrapLow(step1[3] + step1[28]);
+ output[4] = WrapLow(step1[4] + step1[27]);
+ output[5] = WrapLow(step1[5] + step1[26]);
+ output[6] = WrapLow(step1[6] + step1[25]);
+ output[7] = WrapLow(step1[7] + step1[24]);
+ output[8] = WrapLow(step1[8] + step1[23]);
+ output[9] = WrapLow(step1[9] + step1[22]);
+ output[10] = WrapLow(step1[10] + step1[21]);
+ output[11] = WrapLow(step1[11] + step1[20]);
+ output[12] = WrapLow(step1[12] + step1[19]);
+ output[13] = WrapLow(step1[13] + step1[18]);
+ output[14] = WrapLow(step1[14] + step1[17]);
+ output[15] = WrapLow(step1[15] + step1[16]);
+ output[16] = WrapLow(step1[15] - step1[16]);
+ output[17] = WrapLow(step1[14] - step1[17]);
+ output[18] = WrapLow(step1[13] - step1[18]);
+ output[19] = WrapLow(step1[12] - step1[19]);
+ output[20] = WrapLow(step1[11] - step1[20]);
+ output[21] = WrapLow(step1[10] - step1[21]);
+ output[22] = WrapLow(step1[9] - step1[22]);
+ output[23] = WrapLow(step1[8] - step1[23]);
+ output[24] = WrapLow(step1[7] - step1[24]);
+ output[25] = WrapLow(step1[6] - step1[25]);
+ output[26] = WrapLow(step1[5] - step1[26]);
+ output[27] = WrapLow(step1[4] - step1[27]);
+ output[28] = WrapLow(step1[3] - step1[28]);
+ output[29] = WrapLow(step1[2] - step1[29]);
+ output[30] = WrapLow(step1[1] - step1[30]);
+ output[31] = WrapLow(step1[0] - step1[31]);
+ }
+
+ /// <summary>
+ /// Applies the 32-point inverse DCT to every row and then every column of a 32x32 coefficient block,
+ /// and adds the rounded result to <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, read as 32 consecutive rows of 32 values</param>
+ /// <param name="dest">Pixels the result is added to, addressed as row * <paramref name="stride"/> + column</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     Span<int> rowResults = stackalloc int[32 * 32];
+     Span<int> column = stackalloc int[32];
+     Span<int> columnResult = stackalloc int[32];
+
+     // Rows
+     for (int row = 0; row < 32; ++row)
+     {
+         ReadOnlySpan<int> rowIn = input.Slice(row * 32);
+         Span<int> rowOut = rowResults.Slice(row * 32);
+
+         // OR the low 16 bits of all coefficients of the row together,
+         // a zero result means the transform of this row can be skipped.
+         short anyCoeff = 0;
+         for (int col = 0; col < 32; ++col)
+         {
+             anyCoeff |= (short)rowIn[col];
+         }
+
+         if (anyCoeff == 0)
+         {
+             rowOut.Slice(0, 32).Fill(0);
+         }
+         else
+         {
+             Idct32(rowIn, rowOut);
+         }
+     }
+
+     // Columns
+     for (int col = 0; col < 32; ++col)
+     {
+         // Gather one column of the row pass result.
+         for (int row = 0; row < 32; ++row)
+         {
+             column[row] = rowResults[row * 32 + col];
+         }
+
+         Idct32(column, columnResult);
+
+         for (int row = 0; row < 32; ++row)
+         {
+             int pos = row * stride + col;
+             dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnResult[row], 6));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 32x32 inverse DCT for blocks where only the upper-left 16x16 area has non-zero coefficients.
+ /// Only the first 16 rows go through the row pass; the rounded result of the column pass
+ /// is added to <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, read as consecutive rows of 32 values (first 16 rows are used)</param>
+ /// <param name="dest">Pixels the result is added to, addressed as row * <paramref name="stride"/> + column</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     int i, j;
+     Span<int> output = stackalloc int[32 * 32];
+     Span<int> outptr = output;
+     Span<int> tempIn = stackalloc int[32];
+     Span<int> tempOut = stackalloc int[32];
+
+     // The row pass below writes rows 0-15 only, but the column pass reads all 32 rows.
+     // The contents of stackalloc memory are formally undefined, so the rows that are never
+     // written are cleared explicitly instead of relying on the runtime zeroing locals.
+     output.Slice(16 * 32).Clear();
+
+     // Rows
+     // Only upper-left 16x16 has non-zero coeff
+     for (i = 0; i < 16; ++i)
+     {
+         Idct32(input, outptr);
+         input = input.Slice(32);
+         outptr = outptr.Slice(32);
+     }
+
+     // Columns
+     for (i = 0; i < 32; ++i)
+     {
+         for (j = 0; j < 32; ++j)
+         {
+             tempIn[j] = output[j * 32 + i];
+         }
+
+         Idct32(tempIn, tempOut);
+         for (j = 0; j < 32; ++j)
+         {
+             dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 32x32 inverse DCT for blocks where only the upper-left 8x8 area has non-zero coefficients.
+ /// Only the first 8 rows go through the row pass; the rounded result of the column pass
+ /// is added to <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, read as consecutive rows of 32 values (first 8 rows are used)</param>
+ /// <param name="dest">Pixels the result is added to, addressed as row * <paramref name="stride"/> + column</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     int i, j;
+     Span<int> output = stackalloc int[32 * 32];
+     Span<int> outptr = output;
+     Span<int> tempIn = stackalloc int[32];
+     Span<int> tempOut = stackalloc int[32];
+
+     // The row pass below writes rows 0-7 only, but the column pass reads all 32 rows.
+     // The contents of stackalloc memory are formally undefined, so the rows that are never
+     // written are cleared explicitly instead of relying on the runtime zeroing locals.
+     output.Slice(8 * 32).Clear();
+
+     // Rows
+     // Only upper-left 8x8 has non-zero coeff
+     for (i = 0; i < 8; ++i)
+     {
+         Idct32(input, outptr);
+         input = input.Slice(32);
+         outptr = outptr.Slice(32);
+     }
+
+     // Columns
+     for (i = 0; i < 32; ++i)
+     {
+         for (j = 0; j < 32; ++j)
+         {
+             tempIn[j] = output[j * 32 + i];
+         }
+
+         Idct32(tempIn, tempOut);
+         for (j = 0; j < 32; ++j)
+         {
+             dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+         }
+     }
+ }
+
+ /// <summary>
+ /// 32x32 inverse transform that uses only the first coefficient (<c>input[0]</c>):
+ /// one value is derived from it and added to all 32x32 pixels of <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, only element 0 is read</param>
+ /// <param name="dest">Pixels the value is added to, 32 rows of 32 elements</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ public static void Idct32x321Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+ {
+     // The CosPi16_64 scaling is applied twice, with a round shift and a wrap after each application.
+     int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
+     dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));
+
+     long residual = BitUtils.RoundPowerOfTwo(dc, 6);
+
+     for (int row = 0; row < 32; ++row)
+     {
+         for (int col = 0; col < 32; ++col)
+         {
+             dest[col] = ClipPixelAdd(dest[col], residual);
+         }
+
+         // Advance to the next destination row.
+         dest = dest.Slice(stride);
+     }
+ }
+
+ /// <summary>
+ /// 4x4 inverse Walsh-Hadamard transform (4-point reversible, orthonormal, 3.5 adds and 0.5 shifts per pixel).
+ /// Rows are transformed first, then columns, and the result is added to <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, read as 4 consecutive rows of 4 values</param>
+ /// <param name="dest">Pixels the result is added to, addressed as row * <paramref name="stride"/> + column</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ /// <param name="bd">Bit depth, forwarded to the wrap and clip helpers</param>
+ public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     Span<int> rows = stackalloc int[16];
+
+     // Row pass, coefficients are scaled down by UnitQuantShift on load.
+     for (int r = 0; r < 4; r++)
+     {
+         ReadOnlySpan<int> src = input.Slice(r * 4);
+         Span<int> dst = rows.Slice(r * 4);
+
+         long a = src[0] >> UnitQuantShift;
+         long c = src[1] >> UnitQuantShift;
+         long d = src[2] >> UnitQuantShift;
+         long b = src[3] >> UnitQuantShift;
+
+         a += c;
+         d -= b;
+         long e = (a - d) >> 1;
+         b = e - b;
+         c = e - c;
+         a -= b;
+         d += c;
+
+         dst[0] = HighbdWrapLow(a, bd);
+         dst[1] = HighbdWrapLow(b, bd);
+         dst[2] = HighbdWrapLow(c, bd);
+         dst[3] = HighbdWrapLow(d, bd);
+     }
+
+     // Column pass over the row pass result.
+     for (int col = 0; col < 4; col++)
+     {
+         long a = rows[col];
+         long c = rows[4 + col];
+         long d = rows[8 + col];
+         long b = rows[12 + col];
+
+         a += c;
+         d -= b;
+         long e = (a - d) >> 1;
+         b = e - b;
+         c = e - c;
+         a -= b;
+         d += c;
+
+         dest[0] = HighbdClipPixelAdd(dest[0], HighbdWrapLow(a, bd), bd);
+         dest[stride] = HighbdClipPixelAdd(dest[stride], HighbdWrapLow(b, bd), bd);
+         dest[stride * 2] = HighbdClipPixelAdd(dest[stride * 2], HighbdWrapLow(c, bd), bd);
+         dest[stride * 3] = HighbdClipPixelAdd(dest[stride * 3], HighbdWrapLow(d, bd), bd);
+
+         // Advance to the next destination column.
+         dest = dest.Slice(1);
+     }
+ }
+
+ /// <summary>
+ /// 4x4 inverse Walsh-Hadamard transform that uses only the first coefficient (<c>input[0]</c>),
+ /// the result is added to <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, only element 0 is read</param>
+ /// <param name="dest">Pixels the result is added to, addressed as row * <paramref name="stride"/> + column</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ /// <param name="bd">Bit depth, forwarded to the wrap and clip helpers</param>
+ public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     Span<int> firstRow = stackalloc int[4];
+
+     // Row pass: element 0 gets the larger share, the other three share the halved value.
+     long whole = input[0] >> UnitQuantShift;
+     long half = whole >> 1;
+     whole -= half;
+
+     int rest = HighbdWrapLow(half, bd);
+     firstRow[0] = HighbdWrapLow(whole, bd);
+     firstRow[1] = rest;
+     firstRow[2] = rest;
+     firstRow[3] = rest;
+
+     // Column pass: the same split is applied to each value of the row.
+     for (int col = 0; col < 4; col++)
+     {
+         int value = firstRow[col];
+         long lower = value >> 1;
+         long upper = value - lower;
+
+         dest[0] = HighbdClipPixelAdd(dest[0], upper, bd);
+         dest[stride] = HighbdClipPixelAdd(dest[stride], lower, bd);
+         dest[stride * 2] = HighbdClipPixelAdd(dest[stride * 2], lower, bd);
+         dest[stride * 3] = HighbdClipPixelAdd(dest[stride * 3], lower, bd);
+
+         // Advance to the next destination column.
+         dest = dest.Slice(1);
+     }
+ }
+
+ /// <summary>
+ /// 4-point inverse ADST. Writes 4 zeros to <paramref name="output"/> when the input is rejected by
+ /// DetectInvalidHighbdInput or when all 4 inputs are zero.
+ /// </summary>
+ /// <param name="input">Input coefficients, elements 0 to 3 are read</param>
+ /// <param name="output">Transform result, elements 0 to 3 are written</param>
+ /// <param name="bd">Bit depth, forwarded to HighbdWrapLow</param>
+ public static void HighbdIadst4(ReadOnlySpan<int> input, Span<int> output, int bd)
+ {
+     int in0 = input[0];
+     int in1 = input[1];
+     int in2 = input[2];
+     int in3 = input[3];
+
+     if (DetectInvalidHighbdInput(input, 4) != 0)
+     {
+         Debug.Assert(false, "invalid highbd txfm input");
+         output.Slice(0, 4).Fill(0);
+         return;
+     }
+
+     // Nothing to transform.
+     if ((in0 | in1 | in2 | in3) == 0)
+     {
+         output.Slice(0, 4).Fill(0);
+         return;
+     }
+
+     long p0 = (long)SinPi1_9 * in0;
+     long p1 = (long)SinPi2_9 * in0;
+     long p2 = (long)SinPi3_9 * in1;
+     long p3 = (long)SinPi4_9 * in2;
+     long p4 = (long)SinPi1_9 * in2;
+     long p5 = (long)SinPi2_9 * in3;
+     long p6 = (long)SinPi4_9 * in3;
+     long p7 = HighbdWrapLow(in0 - in2 + in3, bd);
+
+     long sumA = p0 + p3 + p5;
+     long sumB = p1 - p4 - p6;
+     long shared = p2;
+     long cross = SinPi3_9 * p7;
+
+     // 1-D transform scaling factor is sqrt(2).
+     // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+     // + 1b (addition) = 29b.
+     // Hence the output bit depth is 15b.
+     output[0] = HighbdWrapLow(DctConstRoundShift(sumA + shared), bd);
+     output[1] = HighbdWrapLow(DctConstRoundShift(sumB + shared), bd);
+     output[2] = HighbdWrapLow(DctConstRoundShift(cross), bd);
+     output[3] = HighbdWrapLow(DctConstRoundShift(sumA + sumB - shared), bd);
+ }
+
+ /// <summary>
+ /// 4-point inverse DCT. Writes 4 zeros to <paramref name="output"/> when the input is rejected by
+ /// DetectInvalidHighbdInput. All inputs are read before any output is written.
+ /// </summary>
+ /// <param name="input">Input coefficients, elements 0 to 3 are read</param>
+ /// <param name="output">Transform result, elements 0 to 3 are written</param>
+ /// <param name="bd">Bit depth, forwarded to HighbdWrapLow</param>
+ public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
+ {
+     if (DetectInvalidHighbdInput(input, 4) != 0)
+     {
+         Debug.Assert(false, "invalid highbd txfm input");
+         output.Slice(0, 4).Fill(0);
+         return;
+     }
+
+     int in0 = input[0];
+     int in1 = input[1];
+     int in2 = input[2];
+     int in3 = input[3];
+
+     // stage 1
+     int even0 = HighbdWrapLow(DctConstRoundShift((in0 + in2) * (long)CosPi16_64), bd);
+     int even1 = HighbdWrapLow(DctConstRoundShift((in0 - in2) * (long)CosPi16_64), bd);
+     int odd0 = HighbdWrapLow(DctConstRoundShift(in1 * (long)CosPi24_64 - in3 * (long)CosPi8_64), bd);
+     int odd1 = HighbdWrapLow(DctConstRoundShift(in1 * (long)CosPi8_64 + in3 * (long)CosPi24_64), bd);
+
+     // stage 2
+     output[0] = HighbdWrapLow(even0 + odd1, bd);
+     output[1] = HighbdWrapLow(even1 + odd0, bd);
+     output[2] = HighbdWrapLow(even1 - odd0, bd);
+     output[3] = HighbdWrapLow(even0 - odd1, bd);
+ }
+
+ /// <summary>
+ /// Applies the 4-point inverse DCT to every row and then every column of a 4x4 coefficient block,
+ /// and adds the rounded result to <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, read as 4 consecutive rows of 4 values</param>
+ /// <param name="dest">Pixels the result is added to, addressed as row * <paramref name="stride"/> + column</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ /// <param name="bd">Bit depth, forwarded to the transform and clip helpers</param>
+ public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     Span<int> rowResults = stackalloc int[4 * 4];
+     Span<int> column = stackalloc int[4];
+     Span<int> columnResult = stackalloc int[4];
+
+     // Rows
+     for (int row = 0; row < 4; ++row)
+     {
+         HighbdIdct4(input.Slice(row * 4), rowResults.Slice(row * 4), bd);
+     }
+
+     // Columns
+     for (int col = 0; col < 4; ++col)
+     {
+         // Gather one column of the row pass result.
+         for (int row = 0; row < 4; ++row)
+         {
+             column[row] = rowResults[row * 4 + col];
+         }
+
+         HighbdIdct4(column, columnResult, bd);
+
+         for (int row = 0; row < 4; ++row)
+         {
+             int pos = row * stride + col;
+             dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnResult[row], 4), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// 4x4 inverse transform that uses only the first coefficient (<c>input[0]</c>):
+ /// one value is derived from it and added to all 4x4 pixels of <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, only element 0 is read</param>
+ /// <param name="dest">Pixels the value is added to, 4 rows of 4 elements</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ /// <param name="bd">Bit depth, forwarded to the wrap and clip helpers</param>
+ public static void HighbdIdct4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     // The CosPi16_64 scaling is applied twice, with a round shift and a wrap after each application.
+     int dc = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
+     dc = HighbdWrapLow(DctConstRoundShift(dc * (long)CosPi16_64), bd);
+
+     long residual = BitUtils.RoundPowerOfTwo(dc, 4);
+
+     for (int row = 0; row < 4; row++)
+     {
+         for (int col = 0; col < 4; col++)
+         {
+             dest[col] = HighbdClipPixelAdd(dest[col], residual, bd);
+         }
+
+         // Advance to the next destination row.
+         dest = dest.Slice(stride);
+     }
+ }
+
+ /// <summary>
+ /// 8-point inverse ADST. Writes 8 zeros to <paramref name="output"/> when the input is rejected by
+ /// DetectInvalidHighbdInput or when all 8 inputs are zero.
+ /// </summary>
+ /// <param name="input">Input coefficients, elements 0 to 7 are read</param>
+ /// <param name="output">Transform result, elements 0 to 7 are written</param>
+ /// <param name="bd">Bit depth, forwarded to HighbdWrapLow</param>
+ public static void HighbdIadst8(ReadOnlySpan<int> input, Span<int> output, int bd)
+ {
+     // The inputs are consumed in the permuted order 7, 0, 5, 2, 3, 4, 1, 6.
+     int in0 = input[7];
+     int in1 = input[0];
+     int in2 = input[5];
+     int in3 = input[2];
+     int in4 = input[3];
+     int in5 = input[4];
+     int in6 = input[1];
+     int in7 = input[6];
+
+     if (DetectInvalidHighbdInput(input, 8) != 0)
+     {
+         Debug.Assert(false, "invalid highbd txfm input");
+         output.Slice(0, 8).Fill(0);
+         return;
+     }
+
+     // Nothing to transform.
+     if ((in0 | in1 | in2 | in3 | in4 | in5 | in6 | in7) == 0)
+     {
+         output.Slice(0, 8).Fill(0);
+         return;
+     }
+
+     // stage 1
+     long p0 = (long)CosPi2_64 * in0 + (long)CosPi30_64 * in1;
+     long p1 = (long)CosPi30_64 * in0 - (long)CosPi2_64 * in1;
+     long p2 = (long)CosPi10_64 * in2 + (long)CosPi22_64 * in3;
+     long p3 = (long)CosPi22_64 * in2 - (long)CosPi10_64 * in3;
+     long p4 = (long)CosPi18_64 * in4 + (long)CosPi14_64 * in5;
+     long p5 = (long)CosPi14_64 * in4 - (long)CosPi18_64 * in5;
+     long p6 = (long)CosPi26_64 * in6 + (long)CosPi6_64 * in7;
+     long p7 = (long)CosPi6_64 * in6 - (long)CosPi26_64 * in7;
+
+     int a0 = HighbdWrapLow(DctConstRoundShift(p0 + p4), bd);
+     int a1 = HighbdWrapLow(DctConstRoundShift(p1 + p5), bd);
+     int a2 = HighbdWrapLow(DctConstRoundShift(p2 + p6), bd);
+     int a3 = HighbdWrapLow(DctConstRoundShift(p3 + p7), bd);
+     int a4 = HighbdWrapLow(DctConstRoundShift(p0 - p4), bd);
+     int a5 = HighbdWrapLow(DctConstRoundShift(p1 - p5), bd);
+     int a6 = HighbdWrapLow(DctConstRoundShift(p2 - p6), bd);
+     int a7 = HighbdWrapLow(DctConstRoundShift(p3 - p7), bd);
+
+     // stage 2
+     // The first four values are combined directly, in 64-bit arithmetic.
+     int b0 = HighbdWrapLow((long)a0 + a2, bd);
+     int b1 = HighbdWrapLow((long)a1 + a3, bd);
+     int b2 = HighbdWrapLow((long)a0 - a2, bd);
+     int b3 = HighbdWrapLow((long)a1 - a3, bd);
+
+     long q4 = (long)CosPi8_64 * a4 + (long)CosPi24_64 * a5;
+     long q5 = (long)CosPi24_64 * a4 - (long)CosPi8_64 * a5;
+     long q6 = (long)(-CosPi24_64) * a6 + (long)CosPi8_64 * a7;
+     long q7 = (long)CosPi8_64 * a6 + (long)CosPi24_64 * a7;
+
+     int b4 = HighbdWrapLow(DctConstRoundShift(q4 + q6), bd);
+     int b5 = HighbdWrapLow(DctConstRoundShift(q5 + q7), bd);
+     int b6 = HighbdWrapLow(DctConstRoundShift(q4 - q6), bd);
+     int b7 = HighbdWrapLow(DctConstRoundShift(q5 - q7), bd);
+
+     // stage 3
+     int c2 = HighbdWrapLow(DctConstRoundShift((long)CosPi16_64 * (b2 + b3)), bd);
+     int c3 = HighbdWrapLow(DctConstRoundShift((long)CosPi16_64 * (b2 - b3)), bd);
+     int c6 = HighbdWrapLow(DctConstRoundShift((long)CosPi16_64 * (b6 + b7)), bd);
+     int c7 = HighbdWrapLow(DctConstRoundShift((long)CosPi16_64 * (b6 - b7)), bd);
+
+     // Results are written out permuted, every other one negated.
+     output[0] = HighbdWrapLow(b0, bd);
+     output[1] = HighbdWrapLow(-b4, bd);
+     output[2] = HighbdWrapLow(c6, bd);
+     output[3] = HighbdWrapLow(-c2, bd);
+     output[4] = HighbdWrapLow(c3, bd);
+     output[5] = HighbdWrapLow(-c7, bd);
+     output[6] = HighbdWrapLow(b5, bd);
+     output[7] = HighbdWrapLow(-b1, bd);
+ }
+
+ /// <summary>
+ /// 8-point inverse DCT. Writes 8 zeros to <paramref name="output"/> when the input is rejected by
+ /// DetectInvalidHighbdInput. The even half is computed with <see cref="HighbdIdct4"/>.
+ /// </summary>
+ /// <param name="input">Input coefficients, elements 0 to 7 are read</param>
+ /// <param name="output">Transform result, elements 0 to 7 are written</param>
+ /// <param name="bd">Bit depth, forwarded to HighbdWrapLow and HighbdIdct4</param>
+ public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
+ {
+     if (DetectInvalidHighbdInput(input, 8) != 0)
+     {
+         Debug.Assert(false, "invalid highbd txfm input");
+         output.Slice(0, 8).Fill(0);
+         return;
+     }
+
+     Span<int> work = stackalloc int[8];
+
+     // stage 1
+     // The even inputs go to the first half of the buffer...
+     work[0] = input[0];
+     work[1] = input[2];
+     work[2] = input[4];
+     work[3] = input[6];
+
+     // ... and the rotated odd inputs to the second half.
+     work[4] = HighbdWrapLow(DctConstRoundShift(input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64), bd);
+     work[7] = HighbdWrapLow(DctConstRoundShift(input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64), bd);
+     work[5] = HighbdWrapLow(DctConstRoundShift(input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64), bd);
+     work[6] = HighbdWrapLow(DctConstRoundShift(input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64), bd);
+
+     // stage 2 & stage 3 - even half
+     // Transformed in place, the first 4 elements of the buffer are both source and destination.
+     HighbdIdct4(work, work, bd);
+
+     // stage 2 - odd half
+     int odd4 = HighbdWrapLow(work[4] + work[5], bd);
+     int odd5 = HighbdWrapLow(work[4] - work[5], bd);
+     int odd6 = HighbdWrapLow(-work[6] + work[7], bd);
+     int odd7 = HighbdWrapLow(work[6] + work[7], bd);
+
+     // stage 3 - odd half
+     int mid5 = HighbdWrapLow(DctConstRoundShift((odd6 - odd5) * (long)CosPi16_64), bd);
+     int mid6 = HighbdWrapLow(DctConstRoundShift((odd5 + odd6) * (long)CosPi16_64), bd);
+
+     // stage 4
+     output[0] = HighbdWrapLow(work[0] + odd7, bd);
+     output[1] = HighbdWrapLow(work[1] + mid6, bd);
+     output[2] = HighbdWrapLow(work[2] + mid5, bd);
+     output[3] = HighbdWrapLow(work[3] + odd4, bd);
+     output[4] = HighbdWrapLow(work[3] - odd4, bd);
+     output[5] = HighbdWrapLow(work[2] - mid5, bd);
+     output[6] = HighbdWrapLow(work[1] - mid6, bd);
+     output[7] = HighbdWrapLow(work[0] - odd7, bd);
+ }
+
+ /// <summary>
+ /// Applies the 8-point inverse DCT to every row and then every column of an 8x8 coefficient block,
+ /// and adds the rounded result to <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, read as 8 consecutive rows of 8 values</param>
+ /// <param name="dest">Pixels the result is added to, addressed as row * <paramref name="stride"/> + column</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ /// <param name="bd">Bit depth, forwarded to the transform and clip helpers</param>
+ public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     Span<int> rowResults = stackalloc int[8 * 8];
+     Span<int> column = stackalloc int[8];
+     Span<int> columnResult = stackalloc int[8];
+
+     // First transform rows
+     for (int row = 0; row < 8; ++row)
+     {
+         HighbdIdct8(input.Slice(row * 8), rowResults.Slice(row * 8), bd);
+     }
+
+     // Then transform columns
+     for (int col = 0; col < 8; ++col)
+     {
+         // Gather one column of the row pass result.
+         for (int row = 0; row < 8; ++row)
+         {
+             column[row] = rowResults[row * 8 + col];
+         }
+
+         HighbdIdct8(column, columnResult, bd);
+
+         for (int row = 0; row < 8; ++row)
+         {
+             int pos = row * stride + col;
+             dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnResult[row], 5), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// 8x8 inverse DCT for blocks where only the first 4 rows have non-zero coefficients.
+ /// Only those rows go through the row pass; the rounded result of the column pass
+ /// is added to <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, read as consecutive rows of 8 values (first 4 rows are used)</param>
+ /// <param name="dest">Pixels the result is added to, addressed as row * <paramref name="stride"/> + column</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ /// <param name="bd">Bit depth, forwarded to the transform and clip helpers</param>
+ public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     int i, j;
+     Span<int> output = stackalloc int[8 * 8];
+     Span<int> outptr = output;
+     Span<int> tempIn = stackalloc int[8];
+     Span<int> tempOut = stackalloc int[8];
+
+     // The row pass below writes rows 0-3 only, but the column pass reads all 8 rows.
+     // The contents of stackalloc memory are formally undefined, so the rows that are never
+     // written are cleared explicitly instead of relying on the runtime zeroing locals.
+     output.Slice(4 * 8).Clear();
+
+     // First transform rows
+     // Only the first 4 rows have non-zero coefficients
+     for (i = 0; i < 4; ++i)
+     {
+         HighbdIdct8(input, outptr, bd);
+         input = input.Slice(8);
+         outptr = outptr.Slice(8);
+     }
+
+     // Then transform columns
+     for (i = 0; i < 8; ++i)
+     {
+         for (j = 0; j < 8; ++j)
+         {
+             tempIn[j] = output[j * 8 + i];
+         }
+
+         HighbdIdct8(tempIn, tempOut, bd);
+         for (j = 0; j < 8; ++j)
+         {
+             dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// 8x8 inverse transform that uses only the first coefficient (<c>input[0]</c>):
+ /// one value is derived from it and added to all 8x8 pixels of <paramref name="dest"/>.
+ /// </summary>
+ /// <remarks>
+ /// The name does not follow the PascalCase naming of the sibling methods (such as HighbdIdct4x41Add),
+ /// it is left unchanged so that existing callers keep working.
+ /// </remarks>
+ /// <param name="input">Coefficients, only element 0 is read</param>
+ /// <param name="dest">Pixels the value is added to, 8 rows of 8 elements</param>
+ /// <param name="stride">Distance, in elements, between two rows of <paramref name="dest"/></param>
+ /// <param name="bd">Bit depth, forwarded to the wrap and clip helpers</param>
+ public static void vpx_Highbdidct8x8_1_add_c(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     // The CosPi16_64 scaling is applied twice, with a round shift and a wrap after each application.
+     int dc = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
+     dc = HighbdWrapLow(DctConstRoundShift(dc * (long)CosPi16_64), bd);
+
+     long residual = BitUtils.RoundPowerOfTwo(dc, 5);
+
+     for (int row = 0; row < 8; ++row)
+     {
+         for (int col = 0; col < 8; ++col)
+         {
+             dest[col] = HighbdClipPixelAdd(dest[col], residual, bd);
+         }
+
+         // Advance to the next destination row.
+         dest = dest.Slice(stride);
+     }
+ }
+
+ public static void HighbdIadst16(ReadOnlySpan<int> input, Span<int> output, int bd)
+ {
+ long s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ long s9, s10, s11, s12, s13, s14, s15;
+ int x0 = input[15];
+ int x1 = input[0];
+ int x2 = input[13];
+ int x3 = input[2];
+ int x4 = input[11];
+ int x5 = input[4];
+ int x6 = input[9];
+ int x7 = input[6];
+ int x8 = input[7];
+ int x9 = input[8];
+ int x10 = input[5];
+ int x11 = input[10];
+ int x12 = input[3];
+ int x13 = input[12];
+ int x14 = input[1];
+ int x15 = input[14];
+ if (DetectInvalidHighbdInput(input, 16) != 0)
+ {
+ Debug.Assert(false, "invalid highbd txfm input");
+ output.Slice(0, 16).Fill(0);
+ return;
+ }
+
+ if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
+ {
+ output.Slice(0, 16).Fill(0);
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64;
+ s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64;
+ s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64;
+ s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64;
+ s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64;
+ s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64;
+ s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64;
+ s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64;
+ s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64;
+ s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64;
+ s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64;
+ s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64;
+ s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64;
+ s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64;
+ s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64;
+ s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64;
+
+ x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd);
+ x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd);
+ x2 = HighbdWrapLow(DctConstRoundShift(s2 + s10), bd);
+ x3 = HighbdWrapLow(DctConstRoundShift(s3 + s11), bd);
+ x4 = HighbdWrapLow(DctConstRoundShift(s4 + s12), bd);
+ x5 = HighbdWrapLow(DctConstRoundShift(s5 + s13), bd);
+ x6 = HighbdWrapLow(DctConstRoundShift(s6 + s14), bd);
+ x7 = HighbdWrapLow(DctConstRoundShift(s7 + s15), bd);
+ x8 = HighbdWrapLow(DctConstRoundShift(s0 - s8), bd);
+ x9 = HighbdWrapLow(DctConstRoundShift(s1 - s9), bd);
+ x10 = HighbdWrapLow(DctConstRoundShift(s2 - s10), bd);
+ x11 = HighbdWrapLow(DctConstRoundShift(s3 - s11), bd);
+ x12 = HighbdWrapLow(DctConstRoundShift(s4 - s12), bd);
+ x13 = HighbdWrapLow(DctConstRoundShift(s5 - s13), bd);
+ x14 = HighbdWrapLow(DctConstRoundShift(s6 - s14), bd);
+ x15 = HighbdWrapLow(DctConstRoundShift(s7 - s15), bd);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64;
+ s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64;
+ s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64;
+ s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64;
+ s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64;
+ s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64;
+ s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64;
+ s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64;
+
+ x0 = HighbdWrapLow(s0 + s4, bd);
+ x1 = HighbdWrapLow(s1 + s5, bd);
+ x2 = HighbdWrapLow(s2 + s6, bd);
+ x3 = HighbdWrapLow(s3 + s7, bd);
+ x4 = HighbdWrapLow(s0 - s4, bd);
+ x5 = HighbdWrapLow(s1 - s5, bd);
+ x6 = HighbdWrapLow(s2 - s6, bd);
+ x7 = HighbdWrapLow(s3 - s7, bd);
+ x8 = HighbdWrapLow(DctConstRoundShift(s8 + s12), bd);
+ x9 = HighbdWrapLow(DctConstRoundShift(s9 + s13), bd);
+ x10 = HighbdWrapLow(DctConstRoundShift(s10 + s14), bd);
+ x11 = HighbdWrapLow(DctConstRoundShift(s11 + s15), bd);
+ x12 = HighbdWrapLow(DctConstRoundShift(s8 - s12), bd);
+ x13 = HighbdWrapLow(DctConstRoundShift(s9 - s13), bd);
+ x14 = HighbdWrapLow(DctConstRoundShift(s10 - s14), bd);
+ x15 = HighbdWrapLow(DctConstRoundShift(s11 - s15), bd);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64;
+ s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64;
+ s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64;
+ s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64;
+ s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64;
+ s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64;
+ s15 = x14 * (long)CosPi8_64 + x15 * (long)CosPi24_64;
+
+ x0 = HighbdWrapLow(s0 + s2, bd);
+ x1 = HighbdWrapLow(s1 + s3, bd);
+ x2 = HighbdWrapLow(s0 - s2, bd);
+ x3 = HighbdWrapLow(s1 - s3, bd);
+ x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
+ x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
+ x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
+ x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
+ x8 = HighbdWrapLow(s8 + s10, bd);
+ x9 = HighbdWrapLow(s9 + s11, bd);
+ x10 = HighbdWrapLow(s8 - s10, bd);
+ x11 = HighbdWrapLow(s9 - s11, bd);
+ x12 = HighbdWrapLow(DctConstRoundShift(s12 + s14), bd);
+ x13 = HighbdWrapLow(DctConstRoundShift(s13 + s15), bd);
+ x14 = HighbdWrapLow(DctConstRoundShift(s12 - s14), bd);
+ x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd);
+
+ // stage 4
+ s2 = (long)(-CosPi16_64) * (x2 + x3);
+ s3 = (long)CosPi16_64 * (x2 - x3);
+ s6 = (long)CosPi16_64 * (x6 + x7);
+ s7 = (long)CosPi16_64 * (-x6 + x7);
+ s10 = (long)CosPi16_64 * (x10 + x11);
+ s11 = (long)CosPi16_64 * (-x10 + x11);
+ s14 = (long)(-CosPi16_64) * (x14 + x15);
+ s15 = (long)CosPi16_64 * (x14 - x15);
+
+ x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
+ x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
+ x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
+ x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
+ x10 = HighbdWrapLow(DctConstRoundShift(s10), bd);
+ x11 = HighbdWrapLow(DctConstRoundShift(s11), bd);
+ x14 = HighbdWrapLow(DctConstRoundShift(s14), bd);
+ x15 = HighbdWrapLow(DctConstRoundShift(s15), bd);
+
+ output[0] = HighbdWrapLow(x0, bd);
+ output[1] = HighbdWrapLow(-x8, bd);
+ output[2] = HighbdWrapLow(x12, bd);
+ output[3] = HighbdWrapLow(-x4, bd);
+ output[4] = HighbdWrapLow(x6, bd);
+ output[5] = HighbdWrapLow(x14, bd);
+ output[6] = HighbdWrapLow(x10, bd);
+ output[7] = HighbdWrapLow(x2, bd);
+ output[8] = HighbdWrapLow(x3, bd);
+ output[9] = HighbdWrapLow(x11, bd);
+ output[10] = HighbdWrapLow(x15, bd);
+ output[11] = HighbdWrapLow(x7, bd);
+ output[12] = HighbdWrapLow(x5, bd);
+ output[13] = HighbdWrapLow(-x13, bd);
+ output[14] = HighbdWrapLow(x9, bd);
+ output[15] = HighbdWrapLow(-x1, bd);
+ }
+
+ /// <summary>
+ /// One-dimensional 16-point inverse transform for the high bit depth path.
+ /// Reads <c>input[0..15]</c> and writes <c>output[0..15]</c>; the work is done in seven
+ /// stages that ping-pong between two 16-entry scratch buffers.
+ /// </summary>
+ /// <param name="input">Source coefficients; indices 0 to 15 are read.</param>
+ /// <param name="output">Destination; indices 0 to 15 are written.</param>
+ /// <param name="bd">Forwarded to every HighbdWrapLow call (presumably the bit depth; confirm against HighbdWrapLow).</param>
+ public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
+ {
+ Span<int> step1 = stackalloc int[16];
+ Span<int> step2 = stackalloc int[16];
+ // 64-bit temporaries: every coefficient * constant product is widened to long before use.
+ long temp1, temp2;
+
+ // Input the validity helper rejects: assert in debug builds, otherwise emit an all-zero result.
+ if (DetectInvalidHighbdInput(input, 16) != 0)
+ {
+ Debug.Assert(false, "invalid highbd txfm input");
+ output.Slice(0, 16).Fill(0);
+ return;
+ }
+
+ // stage 1
+ // Loads the inputs in permuted order. The indices are written as "k / 2" with the same
+ // numerators k that HighbdIdct32 uses for its first sixteen loads.
+ step1[0] = input[0 / 2];
+ step1[1] = input[16 / 2];
+ step1[2] = input[8 / 2];
+ step1[3] = input[24 / 2];
+ step1[4] = input[4 / 2];
+ step1[5] = input[20 / 2];
+ step1[6] = input[12 / 2];
+ step1[7] = input[28 / 2];
+ step1[8] = input[2 / 2];
+ step1[9] = input[18 / 2];
+ step1[10] = input[10 / 2];
+ step1[11] = input[26 / 2];
+ step1[12] = input[6 / 2];
+ step1[13] = input[22 / 2];
+ step1[14] = input[14 / 2];
+ step1[15] = input[30 / 2];
+
+ // stage 2
+ // Entries 0..7 pass through; entries 8..15 are rotated in pairs (8,15), (9,14), (10,13), (11,12).
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
+ temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
+ step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
+ temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
+ step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
+ temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
+ step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
+ temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
+ step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ // stage 3
+ // Entries 0..3 pass through, 4..7 are rotated in pairs, 8..15 are combined by add/subtract.
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
+ temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
+ step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
+ temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
+ step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
+ step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
+ step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
+ step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
+ step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
+ step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
+ step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
+ step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
+ temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
+ step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
+ temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
+ step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
+ step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
+ step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
+ step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
+ temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
+ step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
+ temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
+ step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
+ step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
+ step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
+ step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
+ temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
+ step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step1[7] = step2[7];
+
+ step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
+ step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
+ step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
+ step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
+ step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
+ step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
+ step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
+ step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
+
+ // stage 6
+ step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
+ step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
+ step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
+ step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
+ step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
+ step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
+ step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
+ step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
+ temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
+ step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
+ temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
+ step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ // Final butterfly: output[k] and output[15 - k] are the sum and difference of
+ // step2[k] and step2[15 - k] for k = 0..7.
+ output[0] = HighbdWrapLow(step2[0] + step2[15], bd);
+ output[1] = HighbdWrapLow(step2[1] + step2[14], bd);
+ output[2] = HighbdWrapLow(step2[2] + step2[13], bd);
+ output[3] = HighbdWrapLow(step2[3] + step2[12], bd);
+ output[4] = HighbdWrapLow(step2[4] + step2[11], bd);
+ output[5] = HighbdWrapLow(step2[5] + step2[10], bd);
+ output[6] = HighbdWrapLow(step2[6] + step2[9], bd);
+ output[7] = HighbdWrapLow(step2[7] + step2[8], bd);
+ output[8] = HighbdWrapLow(step2[7] - step2[8], bd);
+ output[9] = HighbdWrapLow(step2[6] - step2[9], bd);
+ output[10] = HighbdWrapLow(step2[5] - step2[10], bd);
+ output[11] = HighbdWrapLow(step2[4] - step2[11], bd);
+ output[12] = HighbdWrapLow(step2[3] - step2[12], bd);
+ output[13] = HighbdWrapLow(step2[2] - step2[13], bd);
+ output[14] = HighbdWrapLow(step2[1] - step2[14], bd);
+ output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
+ }
+
+ /// <summary>
+ /// Applies <see cref="HighbdIdct16"/> to all 16 rows of <paramref name="input"/>, then to all 16
+ /// columns of the intermediate result, and adds the rounded (shift by 6) column output into
+ /// the 16x16 area of <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, 16 per row, rows stored back to back.</param>
+ /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
+ /// <param name="stride">Distance in elements between consecutive rows of <paramref name="dest"/>.</param>
+ /// <param name="bd">Forwarded to the transform and to HighbdClipPixelAdd.</param>
+ public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     Span<int> rowResults = stackalloc int[16 * 16];
+     Span<int> column = stackalloc int[16];
+     Span<int> columnResult = stackalloc int[16];
+
+     // Row pass: walk the source and the intermediate buffer one 16-entry row at a time.
+     ReadOnlySpan<int> srcRow = input;
+     Span<int> dstRow = rowResults;
+     for (int row = 0; row < 16; row++)
+     {
+         HighbdIdct16(srcRow, dstRow, bd);
+         srcRow = srcRow.Slice(16);
+         dstRow = dstRow.Slice(16);
+     }
+
+     // Column pass: gather a column, transform it, then accumulate it into the destination.
+     for (int col = 0; col < 16; col++)
+     {
+         for (int row = 0; row < 16; row++)
+         {
+             column[row] = rowResults[row * 16 + col];
+         }
+
+         HighbdIdct16(column, columnResult, bd);
+
+         for (int row = 0; row < 16; row++)
+         {
+             int pixel = row * stride + col;
+             dest[pixel] = HighbdClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Reduced 16x16 inverse transform for blocks whose non-zero coefficients all lie in the
+ /// upper-left 8x8 area: only the first 8 rows are transformed, the remaining rows of the
+ /// intermediate buffer stay at their initial value, and all 16 columns are then transformed
+ /// and added into <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, 16 per row; the first 8 rows are read.</param>
+ /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
+ /// <param name="stride">Distance in elements between consecutive rows of <paramref name="dest"/>.</param>
+ /// <param name="bd">Forwarded to the transform and to HighbdClipPixelAdd.</param>
+ public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     int i, j;
+     // NOTE(review): rows 8..15 rely on stackalloc zero-initialisation; confirm locals-init is not skipped.
+     Span<int> output = stackalloc int[16 * 16];
+     Span<int> outptr = output;
+     Span<int> tempIn = stackalloc int[16];
+     Span<int> tempOut = stackalloc int[16];
+
+     // First transform rows. Since all non-zero dct coefficients are in
+     // upper-left 8x8 area, we only need to calculate first 8 rows here.
+     for (i = 0; i < 8; ++i)
+     {
+         HighbdIdct16(input, outptr, bd);
+         input = input.Slice(16);
+         outptr = outptr.Slice(16);
+     }
+
+     // Then transform columns
+     for (i = 0; i < 16; ++i)
+     {
+         for (j = 0; j < 16; ++j)
+         {
+             tempIn[j] = output[j * 16 + i];
+         }
+
+         HighbdIdct16(tempIn, tempOut, bd);
+         for (j = 0; j < 16; ++j)
+         {
+             // Index the destination directly instead of re-slicing it after every row: the
+             // extra Slice(stride) after the last row could throw when dest ends less than
+             // one stride past the final pixel, even though nothing further is accessed.
+             int pixel = j * stride + i;
+             dest[pixel] = HighbdClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Reduced 16x16 inverse transform for blocks whose non-zero coefficients all lie in the
+ /// upper-left 4x4 area: only the first 4 rows go through <see cref="HighbdIdct16"/>, after
+ /// which all 16 columns are transformed and added (rounded, shift by 6) into <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, 16 per row; the first 4 rows are read.</param>
+ /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
+ /// <param name="stride">Distance in elements between consecutive rows of <paramref name="dest"/>.</param>
+ /// <param name="bd">Forwarded to the transform and to HighbdClipPixelAdd.</param>
+ public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     Span<int> rowResults = stackalloc int[16 * 16];
+     Span<int> column = stackalloc int[16];
+     Span<int> columnResult = stackalloc int[16];
+
+     // Row pass over the four rows that can hold non-zero coefficients.
+     ReadOnlySpan<int> srcRow = input;
+     Span<int> dstRow = rowResults;
+     for (int row = 0; row < 4; row++)
+     {
+         HighbdIdct16(srcRow, dstRow, bd);
+         srcRow = srcRow.Slice(16);
+         dstRow = dstRow.Slice(16);
+     }
+
+     // Column pass over the full width.
+     for (int col = 0; col < 16; col++)
+     {
+         for (int row = 0; row < 16; row++)
+         {
+             column[row] = rowResults[row * 16 + col];
+         }
+
+         HighbdIdct16(column, columnResult, bd);
+
+         for (int row = 0; row < 16; row++)
+         {
+             int pixel = row * stride + col;
+             dest[pixel] = HighbdClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// 16x16 inverse transform shortcut that uses only <c>input[0]</c>: the coefficient is scaled
+ /// by CosPi16_64 twice (with rounding and wrapping after each step), rounded with a shift
+ /// by 6, and the resulting single value is added to every pixel of the 16x16 destination area.
+ /// </summary>
+ /// <param name="input">Coefficients; only index 0 is read.</param>
+ /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
+ /// <param name="stride">Distance in elements between consecutive rows of <paramref name="dest"/>.</param>
+ /// <param name="bd">Forwarded to HighbdWrapLow and HighbdClipPixelAdd.</param>
+ public static void HighbdIdct16x161Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     int i, j;
+     long a1;
+     int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
+
+     output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
+     a1 = BitUtils.RoundPowerOfTwo(output, 6);
+     for (j = 0; j < 16; ++j)
+     {
+         // Address each row by offset rather than re-slicing dest: slicing once more after
+         // the last row could throw when dest ends less than one stride past the final pixel.
+         int rowStart = j * stride;
+         for (i = 0; i < 16; ++i)
+         {
+             dest[rowStart + i] = HighbdClipPixelAdd(dest[rowStart + i], a1, bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// One-dimensional 32-point inverse transform for the high bit depth path.
+ /// Reads <c>input[0..31]</c> and writes <c>output[0..31]</c>; the work is done in stages
+ /// that ping-pong between two 32-entry scratch buffers.
+ /// </summary>
+ /// <param name="input">Source coefficients; indices 0 to 31 are read.</param>
+ /// <param name="output">Destination; indices 0 to 31 are written.</param>
+ /// <param name="bd">Forwarded to every HighbdWrapLow call (presumably the bit depth; confirm against HighbdWrapLow).</param>
+ public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
+ {
+ Span<int> step1 = stackalloc int[32];
+ Span<int> step2 = stackalloc int[32];
+ // 64-bit temporaries: every coefficient * constant product is widened to long before use.
+ long temp1, temp2;
+
+ // Input the validity helper rejects: assert in debug builds, otherwise emit an all-zero result.
+ if (DetectInvalidHighbdInput(input, 32) != 0)
+ {
+ Debug.Assert(false, "invalid highbd txfm input");
+ output.Slice(0, 32).Fill(0);
+ return;
+ }
+
+ // stage 1
+ // Even-indexed inputs are loaded in permuted order into entries 0..15; odd-indexed inputs
+ // are rotated in pairs straight into entries 16..31.
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 = input[1] * (long)CosPi31_64 - input[31] * (long)CosPi1_64;
+ temp2 = input[1] * (long)CosPi1_64 + input[31] * (long)CosPi31_64;
+ step1[16] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[31] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = input[17] * (long)CosPi15_64 - input[15] * (long)CosPi17_64;
+ temp2 = input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64;
+ step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64;
+ temp2 = input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64;
+ step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64;
+ temp2 = input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64;
+ step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64;
+ temp2 = input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64;
+ step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64;
+ temp2 = input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64;
+ step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64;
+ temp2 = input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64;
+ step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64;
+ temp2 = input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64;
+ step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ // stage 2
+ // Entries 0..7 pass through, 8..15 are rotated in pairs, 16..31 are combined by add/subtract.
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
+ temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
+ step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
+ temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
+ step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
+ temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
+ step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
+ temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
+ step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ step2[16] = HighbdWrapLow(step1[16] + step1[17], bd);
+ step2[17] = HighbdWrapLow(step1[16] - step1[17], bd);
+ step2[18] = HighbdWrapLow(-step1[18] + step1[19], bd);
+ step2[19] = HighbdWrapLow(step1[18] + step1[19], bd);
+ step2[20] = HighbdWrapLow(step1[20] + step1[21], bd);
+ step2[21] = HighbdWrapLow(step1[20] - step1[21], bd);
+ step2[22] = HighbdWrapLow(-step1[22] + step1[23], bd);
+ step2[23] = HighbdWrapLow(step1[22] + step1[23], bd);
+ step2[24] = HighbdWrapLow(step1[24] + step1[25], bd);
+ step2[25] = HighbdWrapLow(step1[24] - step1[25], bd);
+ step2[26] = HighbdWrapLow(-step1[26] + step1[27], bd);
+ step2[27] = HighbdWrapLow(step1[26] + step1[27], bd);
+ step2[28] = HighbdWrapLow(step1[28] + step1[29], bd);
+ step2[29] = HighbdWrapLow(step1[28] - step1[29], bd);
+ step2[30] = HighbdWrapLow(-step1[30] + step1[31], bd);
+ step2[31] = HighbdWrapLow(step1[30] + step1[31], bd);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
+ temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
+ step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
+ temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
+ step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+ step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
+ step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
+ step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
+ step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
+ step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
+ step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
+ step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
+ step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * (long)CosPi4_64 + step2[30] * (long)CosPi28_64;
+ temp2 = step2[17] * (long)CosPi28_64 + step2[30] * (long)CosPi4_64;
+ step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = -step2[18] * (long)CosPi28_64 - step2[29] * (long)CosPi4_64;
+ temp2 = -step2[18] * (long)CosPi4_64 + step2[29] * (long)CosPi28_64;
+ step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * (long)CosPi20_64 + step2[26] * (long)CosPi12_64;
+ temp2 = step2[21] * (long)CosPi12_64 + step2[26] * (long)CosPi20_64;
+ step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = -step2[22] * (long)CosPi12_64 - step2[25] * (long)CosPi20_64;
+ temp2 = -step2[22] * (long)CosPi20_64 + step2[25] * (long)CosPi12_64;
+ step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
+ temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
+ step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
+ temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
+ step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
+ step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
+ step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
+ step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
+ temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
+ step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
+ temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
+ step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = HighbdWrapLow(step1[16] + step1[19], bd);
+ step2[17] = HighbdWrapLow(step1[17] + step1[18], bd);
+ step2[18] = HighbdWrapLow(step1[17] - step1[18], bd);
+ step2[19] = HighbdWrapLow(step1[16] - step1[19], bd);
+ step2[20] = HighbdWrapLow(-step1[20] + step1[23], bd);
+ step2[21] = HighbdWrapLow(-step1[21] + step1[22], bd);
+ step2[22] = HighbdWrapLow(step1[21] + step1[22], bd);
+ step2[23] = HighbdWrapLow(step1[20] + step1[23], bd);
+
+ step2[24] = HighbdWrapLow(step1[24] + step1[27], bd);
+ step2[25] = HighbdWrapLow(step1[25] + step1[26], bd);
+ step2[26] = HighbdWrapLow(step1[25] - step1[26], bd);
+ step2[27] = HighbdWrapLow(step1[24] - step1[27], bd);
+ step2[28] = HighbdWrapLow(-step1[28] + step1[31], bd);
+ step2[29] = HighbdWrapLow(-step1[29] + step1[30], bd);
+ step2[30] = HighbdWrapLow(step1[29] + step1[30], bd);
+ step2[31] = HighbdWrapLow(step1[28] + step1[31], bd);
+
+ // stage 5
+ step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
+ step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
+ step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
+ step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
+ temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
+ step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step1[7] = step2[7];
+
+ step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
+ step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
+ step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
+ step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
+ step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
+ step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
+ step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
+ step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * (long)CosPi8_64 + step2[29] * (long)CosPi24_64;
+ temp2 = step2[18] * (long)CosPi24_64 + step2[29] * (long)CosPi8_64;
+ step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = -step2[19] * (long)CosPi8_64 + step2[28] * (long)CosPi24_64;
+ temp2 = step2[19] * (long)CosPi24_64 + step2[28] * (long)CosPi8_64;
+ step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = -step2[20] * (long)CosPi24_64 - step2[27] * (long)CosPi8_64;
+ temp2 = -step2[20] * (long)CosPi8_64 + step2[27] * (long)CosPi24_64;
+ step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = -step2[21] * (long)CosPi24_64 - step2[26] * (long)CosPi8_64;
+ temp2 = -step2[21] * (long)CosPi8_64 + step2[26] * (long)CosPi24_64;
+ step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
+ step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
+ step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
+ step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
+ step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
+ step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
+ step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
+ step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
+ temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
+ step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
+ temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
+ step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ step2[16] = HighbdWrapLow(step1[16] + step1[23], bd);
+ step2[17] = HighbdWrapLow(step1[17] + step1[22], bd);
+ step2[18] = HighbdWrapLow(step1[18] + step1[21], bd);
+ step2[19] = HighbdWrapLow(step1[19] + step1[20], bd);
+ step2[20] = HighbdWrapLow(step1[19] - step1[20], bd);
+ step2[21] = HighbdWrapLow(step1[18] - step1[21], bd);
+ step2[22] = HighbdWrapLow(step1[17] - step1[22], bd);
+ step2[23] = HighbdWrapLow(step1[16] - step1[23], bd);
+
+ step2[24] = HighbdWrapLow(-step1[24] + step1[31], bd);
+ step2[25] = HighbdWrapLow(-step1[25] + step1[30], bd);
+ step2[26] = HighbdWrapLow(-step1[26] + step1[29], bd);
+ step2[27] = HighbdWrapLow(-step1[27] + step1[28], bd);
+ step2[28] = HighbdWrapLow(step1[27] + step1[28], bd);
+ step2[29] = HighbdWrapLow(step1[26] + step1[29], bd);
+ step2[30] = HighbdWrapLow(step1[25] + step1[30], bd);
+ step2[31] = HighbdWrapLow(step1[24] + step1[31], bd);
+
+ // stage 7
+ step1[0] = HighbdWrapLow(step2[0] + step2[15], bd);
+ step1[1] = HighbdWrapLow(step2[1] + step2[14], bd);
+ step1[2] = HighbdWrapLow(step2[2] + step2[13], bd);
+ step1[3] = HighbdWrapLow(step2[3] + step2[12], bd);
+ step1[4] = HighbdWrapLow(step2[4] + step2[11], bd);
+ step1[5] = HighbdWrapLow(step2[5] + step2[10], bd);
+ step1[6] = HighbdWrapLow(step2[6] + step2[9], bd);
+ step1[7] = HighbdWrapLow(step2[7] + step2[8], bd);
+ step1[8] = HighbdWrapLow(step2[7] - step2[8], bd);
+ step1[9] = HighbdWrapLow(step2[6] - step2[9], bd);
+ step1[10] = HighbdWrapLow(step2[5] - step2[10], bd);
+ step1[11] = HighbdWrapLow(step2[4] - step2[11], bd);
+ step1[12] = HighbdWrapLow(step2[3] - step2[12], bd);
+ step1[13] = HighbdWrapLow(step2[2] - step2[13], bd);
+ step1[14] = HighbdWrapLow(step2[1] - step2[14], bd);
+ step1[15] = HighbdWrapLow(step2[0] - step2[15], bd);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * (long)CosPi16_64;
+ temp2 = (step2[20] + step2[27]) * (long)CosPi16_64;
+ step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = (-step2[21] + step2[26]) * (long)CosPi16_64;
+ temp2 = (step2[21] + step2[26]) * (long)CosPi16_64;
+ step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = (-step2[22] + step2[25]) * (long)CosPi16_64;
+ temp2 = (step2[22] + step2[25]) * (long)CosPi16_64;
+ step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ temp1 = (-step2[23] + step2[24]) * (long)CosPi16_64;
+ temp2 = (step2[23] + step2[24]) * (long)CosPi16_64;
+ step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+ step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ // Final butterfly: output[k] and output[31 - k] are the sum and difference of
+ // step1[k] and step1[31 - k] for k = 0..15.
+ output[0] = HighbdWrapLow(step1[0] + step1[31], bd);
+ output[1] = HighbdWrapLow(step1[1] + step1[30], bd);
+ output[2] = HighbdWrapLow(step1[2] + step1[29], bd);
+ output[3] = HighbdWrapLow(step1[3] + step1[28], bd);
+ output[4] = HighbdWrapLow(step1[4] + step1[27], bd);
+ output[5] = HighbdWrapLow(step1[5] + step1[26], bd);
+ output[6] = HighbdWrapLow(step1[6] + step1[25], bd);
+ output[7] = HighbdWrapLow(step1[7] + step1[24], bd);
+ output[8] = HighbdWrapLow(step1[8] + step1[23], bd);
+ output[9] = HighbdWrapLow(step1[9] + step1[22], bd);
+ output[10] = HighbdWrapLow(step1[10] + step1[21], bd);
+ output[11] = HighbdWrapLow(step1[11] + step1[20], bd);
+ output[12] = HighbdWrapLow(step1[12] + step1[19], bd);
+ output[13] = HighbdWrapLow(step1[13] + step1[18], bd);
+ output[14] = HighbdWrapLow(step1[14] + step1[17], bd);
+ output[15] = HighbdWrapLow(step1[15] + step1[16], bd);
+ output[16] = HighbdWrapLow(step1[15] - step1[16], bd);
+ output[17] = HighbdWrapLow(step1[14] - step1[17], bd);
+ output[18] = HighbdWrapLow(step1[13] - step1[18], bd);
+ output[19] = HighbdWrapLow(step1[12] - step1[19], bd);
+ output[20] = HighbdWrapLow(step1[11] - step1[20], bd);
+ output[21] = HighbdWrapLow(step1[10] - step1[21], bd);
+ output[22] = HighbdWrapLow(step1[9] - step1[22], bd);
+ output[23] = HighbdWrapLow(step1[8] - step1[23], bd);
+ output[24] = HighbdWrapLow(step1[7] - step1[24], bd);
+ output[25] = HighbdWrapLow(step1[6] - step1[25], bd);
+ output[26] = HighbdWrapLow(step1[5] - step1[26], bd);
+ output[27] = HighbdWrapLow(step1[4] - step1[27], bd);
+ output[28] = HighbdWrapLow(step1[3] - step1[28], bd);
+ output[29] = HighbdWrapLow(step1[2] - step1[29], bd);
+ output[30] = HighbdWrapLow(step1[1] - step1[30], bd);
+ output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
+ }
+
+ /// <summary>
+ /// Full 32x32 inverse transform: each of the 32 rows is run through <see cref="HighbdIdct32"/>
+ /// (rows whose 32 coefficients are all zero are simply zero-filled), then each of the 32
+ /// columns is transformed and added (rounded, shift by 6) into <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, 32 per row, rows stored back to back.</param>
+ /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
+ /// <param name="stride">Distance in elements between consecutive rows of <paramref name="dest"/>.</param>
+ /// <param name="bd">Forwarded to the transform and to HighbdClipPixelAdd.</param>
+ public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     Span<int> rowResults = stackalloc int[32 * 32];
+     Span<int> column = stackalloc int[32];
+     Span<int> columnResult = stackalloc int[32];
+
+     // Rows
+     ReadOnlySpan<int> srcRow = input;
+     Span<int> dstRow = rowResults;
+     for (int row = 0; row < 32; row++)
+     {
+         // OR all coefficients of the row together; the result is zero only for an all-zero row.
+         int anyBits = 0;
+         for (int k = 0; k < 32; k++)
+         {
+             anyBits |= srcRow[k];
+         }
+
+         if (anyBits == 0)
+         {
+             dstRow.Slice(0, 32).Fill(0);
+         }
+         else
+         {
+             HighbdIdct32(srcRow, dstRow, bd);
+         }
+
+         srcRow = srcRow.Slice(32);
+         dstRow = dstRow.Slice(32);
+     }
+
+     // Columns
+     for (int col = 0; col < 32; col++)
+     {
+         for (int row = 0; row < 32; row++)
+         {
+             column[row] = rowResults[row * 32 + col];
+         }
+
+         HighbdIdct32(column, columnResult, bd);
+
+         for (int row = 0; row < 32; row++)
+         {
+             int pixel = row * stride + col;
+             dest[pixel] = HighbdClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Reduced 32x32 inverse transform for blocks whose non-zero coefficients all lie in the
+ /// upper-left 16x16 area: only the first 16 rows are transformed, the remaining rows of the
+ /// intermediate buffer stay at their initial value, and all 32 columns are then transformed
+ /// and added into <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, 32 per row; the first 16 rows are read.</param>
+ /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
+ /// <param name="stride">Distance in elements between consecutive rows of <paramref name="dest"/>.</param>
+ /// <param name="bd">Forwarded to the transform and to HighbdClipPixelAdd.</param>
+ public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     int i, j;
+     // NOTE(review): rows 16..31 rely on stackalloc zero-initialisation; confirm locals-init is not skipped.
+     Span<int> output = stackalloc int[32 * 32];
+     Span<int> outptr = output;
+     Span<int> tempIn = stackalloc int[32];
+     Span<int> tempOut = stackalloc int[32];
+
+     // Rows
+     // Only upper-left 16x16 has non-zero coeff
+     for (i = 0; i < 16; ++i)
+     {
+         HighbdIdct32(input, outptr, bd);
+         input = input.Slice(32);
+         outptr = outptr.Slice(32);
+     }
+
+     // Columns
+     for (i = 0; i < 32; ++i)
+     {
+         for (j = 0; j < 32; ++j)
+         {
+             tempIn[j] = output[j * 32 + i];
+         }
+
+         HighbdIdct32(tempIn, tempOut, bd);
+         for (j = 0; j < 32; ++j)
+         {
+             // Index the destination directly instead of re-slicing it after every row: the
+             // extra Slice(stride) after the last row could throw when dest ends less than
+             // one stride past the final pixel, even though nothing further is accessed.
+             int pixel = j * stride + i;
+             dest[pixel] = HighbdClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Reduced 32x32 inverse transform for blocks whose non-zero coefficients all lie in the
+ /// upper-left 8x8 area: only the first 8 rows go through <see cref="HighbdIdct32"/>, after
+ /// which all 32 columns are transformed and added (rounded, shift by 6) into <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="input">Coefficients, 32 per row; the first 8 rows are read.</param>
+ /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
+ /// <param name="stride">Distance in elements between consecutive rows of <paramref name="dest"/>.</param>
+ /// <param name="bd">Forwarded to the transform and to HighbdClipPixelAdd.</param>
+ public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     Span<int> rowResults = stackalloc int[32 * 32];
+     Span<int> column = stackalloc int[32];
+     Span<int> columnResult = stackalloc int[32];
+
+     // Rows
+     // Only upper-left 8x8 has non-zero coeff
+     ReadOnlySpan<int> srcRow = input;
+     Span<int> dstRow = rowResults;
+     for (int row = 0; row < 8; row++)
+     {
+         HighbdIdct32(srcRow, dstRow, bd);
+         srcRow = srcRow.Slice(32);
+         dstRow = dstRow.Slice(32);
+     }
+
+     // Columns
+     for (int col = 0; col < 32; col++)
+     {
+         for (int row = 0; row < 32; row++)
+         {
+             column[row] = rowResults[row * 32 + col];
+         }
+
+         HighbdIdct32(column, columnResult, bd);
+
+         for (int row = 0; row < 32; row++)
+         {
+             int pixel = row * stride + col;
+             dest[pixel] = HighbdClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
+         }
+     }
+ }
+
+ /// <summary>
+ /// 32x32 inverse transform shortcut that uses only <c>input[0]</c>: the coefficient is scaled
+ /// by CosPi16_64 twice (with rounding and wrapping after each step), rounded with a shift
+ /// by 6, and the resulting single value is added to every pixel of the 32x32 destination area.
+ /// </summary>
+ /// <param name="input">Coefficients; only index 0 is read.</param>
+ /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
+ /// <param name="stride">Distance in elements between consecutive rows of <paramref name="dest"/>.</param>
+ /// <param name="bd">Forwarded to HighbdWrapLow and HighbdClipPixelAdd.</param>
+ public static void HighbdIdct32x321Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+ {
+     int i, j;
+     int a1;
+     int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
+
+     output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
+     a1 = BitUtils.RoundPowerOfTwo(output, 6);
+
+     for (j = 0; j < 32; ++j)
+     {
+         // Address each row by offset rather than re-slicing dest: slicing once more after
+         // the last row could throw when dest ends less than one stride past the final pixel.
+         int rowStart = j * stride;
+         for (i = 0; i < 32; ++i)
+         {
+             dest[rowStart + i] = HighbdClipPixelAdd(dest[rowStart + i], a1, bd);
+         }
+     }
+ }
+ }
+}
diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs
new file mode 100644
index 00000000..0d5e8b6e
--- /dev/null
+++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs
@@ -0,0 +1,73 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using System;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+ /// <summary>
+ /// Helpers that derive byte-sized probabilities from symbol counts and blend them
+ /// with previous probabilities.
+ /// </summary>
+ internal static class Prob
+ {
+     public const int MaxProb = 255;
+
+     private const int ModeMvCountSat = 20;
+
+     // MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
+     private static readonly uint[] CountToUpdateFactor = new uint[]
+     {
+         0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+         70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+     };
+
+     /// <summary>
+     /// Computes <c>num * 256 / den</c>, rounded to nearest, as a byte probability.
+     /// </summary>
+     /// <param name="num">Numerator.</param>
+     /// <param name="den">Denominator; must not be zero.</param>
+     private static byte GetProb(uint num, uint den)
+     {
+         Debug.Assert(den != 0);
+
+         ulong rounded = (ulong)num * 256 + (den >> 1);
+         int p = (int)(rounded / den);
+
+         // Branch-free form of: (p > 255) ? 255 : (p < 1) ? 1 : p;
+         int saturateMask = (255 - p) >> 23;
+         int zeroFix = p == 0 ? 1 : 0;
+
+         return (byte)(p | saturateMask | zeroFix);
+     }
+
+     /// <summary>
+     /// Blends two probabilities: <paramref name="prob2"/> gets weight <paramref name="factor"/>
+     /// out of 256 and <paramref name="prob1"/> gets the remainder.
+     /// </summary>
+     /// <remarks>This function assumes prob1 and prob2 are already within [1,255] range.</remarks>
+     public static byte WeightedProb(int prob1, int prob2, int factor)
+     {
+         int blended = prob1 * (256 - factor) + prob2 * factor;
+
+         return (byte)BitUtils.RoundPowerOfTwo(blended, 8);
+     }
+
+     /// <summary>
+     /// Merges a previous probability with the probability implied by two branch counts.
+     /// </summary>
+     /// <param name="preProb">Previous probability; returned unchanged when both counts are zero.</param>
+     /// <param name="ct0">Count for the first branch.</param>
+     /// <param name="ct1">Count for the second branch.</param>
+     public static byte ModeMvMergeProbs(byte preProb, uint ct0, uint ct1)
+     {
+         uint den = ct0 + ct1;
+         if (den == 0)
+         {
+             return preProb;
+         }
+
+         // The total count is saturated before it selects the update factor.
+         uint saturated = Math.Min(den, ModeMvCountSat);
+         uint factor = CountToUpdateFactor[(int)saturated];
+
+         return WeightedProb(preProb, GetProb(ct0, den), (int)factor);
+     }
+
+     /// <summary>
+     /// Recursively merges the probabilities of the sub-tree rooted at index <paramref name="i"/>
+     /// and returns the total count below that node.
+     /// </summary>
+     private static uint TreeMergeProbsImpl(
+         uint i,
+         sbyte[] tree,
+         ReadOnlySpan<byte> preProbs,
+         ReadOnlySpan<uint> counts,
+         Span<byte> probs)
+     {
+         // Entries that are zero or negative index 'counts' (negated); positive entries are tree indices.
+         int left = tree[i];
+         uint leftCount;
+         if (left > 0)
+         {
+             leftCount = TreeMergeProbsImpl((uint)left, tree, preProbs, counts, probs);
+         }
+         else
+         {
+             leftCount = counts[-left];
+         }
+
+         int right = tree[i + 1];
+         uint rightCount;
+         if (right > 0)
+         {
+             rightCount = TreeMergeProbsImpl((uint)right, tree, preProbs, counts, probs);
+         }
+         else
+         {
+             rightCount = counts[-right];
+         }
+
+         int probIndex = (int)(i >> 1);
+         probs[probIndex] = ModeMvMergeProbs(preProbs[probIndex], leftCount, rightCount);
+
+         return leftCount + rightCount;
+     }
+
+     /// <summary>
+     /// Merges the probabilities of a whole tree, starting at index 0, writing the results
+     /// to <paramref name="probs"/>.
+     /// </summary>
+     public static void TreeMergeProbs(sbyte[] tree, ReadOnlySpan<byte> preProbs, ReadOnlySpan<uint> counts, Span<byte> probs)
+     {
+         // The total count returned for the root is not needed here.
+         TreeMergeProbsImpl(0, tree, preProbs, counts, probs);
+     }
+ }
+}
diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs
new file mode 100644
index 00000000..94aa6979
--- /dev/null
+++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs
@@ -0,0 +1,237 @@
+using System;
+using System.Buffers.Binary;
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+ /// <summary>
+ /// Boolean decoder state: a 64-bit window of buffered bits (<see cref="Value"/>), the current
+ /// <see cref="Range"/>, the buffered bit count (<see cref="Count"/>) and the remaining input bytes.
+ /// </summary>
+ internal struct Reader
+ {
+     private const int BdValueSize = sizeof(ulong) * 8;
+
+     // This is meant to be a large, positive constant that can still be efficiently
+     // loaded as an immediate (on platforms like ARM, for example).
+     // Even relatively modest values like 100 would work fine.
+     private const int LotsOfBits = 0x40000000;
+
+     // Indexed by the current range: number of left shifts applied to renormalize it.
+     private static readonly byte[] Norm = new byte[]
+     {
+         0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+     };
+
+     public ulong Value;
+     public uint Range;
+     public int Count;
+     private ArrayPtr<byte> _buffer;
+
+     /// <summary>
+     /// Resets the decoder state over <paramref name="buffer"/> and reads the marker bit.
+     /// </summary>
+     /// <param name="buffer">Coded data.</param>
+     /// <param name="size">Number of bytes of <paramref name="buffer"/> to decode from.</param>
+     /// <returns>
+     /// True when <paramref name="buffer"/> is null while <paramref name="size"/> is non-zero,
+     /// or when the marker bit read from the stream is set; false otherwise.
+     /// </returns>
+     public bool Init(ArrayPtr<byte> buffer, int size)
+     {
+         if (size != 0 && buffer.IsNull)
+         {
+             return true;
+         }
+
+         _buffer = new ArrayPtr<byte>(ref buffer[0], size);
+         Value = 0;
+         Count = -8;
+         Range = 255;
+         Fill();
+
+         // Marker bit
+         return ReadBit() != 0;
+     }
+
+     /// <summary>
+     /// Moves bytes from the input buffer into <see cref="Value"/> and advances the buffer by the
+     /// number of bytes consumed.
+     /// </summary>
+     private void Fill()
+     {
+         ReadOnlySpan<byte> cursor = _buffer.ToSpan();
+         int initialLength = cursor.Length;
+         ulong window = Value;
+         int bitCount = Count;
+         ulong bitsLeft = (ulong)cursor.Length * 8;
+         int shift = BdValueSize - 8 - (bitCount + 8);
+
+         if (bitsLeft > BdValueSize)
+         {
+             // More than a full window of input remains: load 8 bytes at once and keep only
+             // the whole bytes that fit.
+             int bits = (shift & unchecked((int)0xfffffff8)) + 8;
+             ulong chunk = BinaryPrimitives.ReadUInt64BigEndian(cursor) >> (BdValueSize - bits);
+
+             bitCount += bits;
+             cursor = cursor.Slice(bits >> 3);
+             window = Value | (chunk << (shift & 0x7));
+         }
+         else
+         {
+             int bitsOver = shift + 8 - (int)bitsLeft;
+             int loopEnd = 0;
+
+             if (bitsOver >= 0)
+             {
+                 // The input cannot fill the window: flag the end of the data in the bit count.
+                 bitCount += LotsOfBits;
+                 loopEnd = bitsOver;
+             }
+
+             if (bitsOver < 0 || bitsLeft != 0)
+             {
+                 // Byte-by-byte refill.
+                 for (; shift >= loopEnd; shift -= 8)
+                 {
+                     bitCount += 8;
+                     window |= (ulong)cursor[0] << shift;
+                     cursor = cursor.Slice(1);
+                 }
+             }
+         }
+
+         // NOTE: Variable 'cursor' may not relate to '_buffer' after decryption,
+         // so we increase '_buffer' by the amount that 'cursor' moved, rather than
+         // assign 'cursor' to '_buffer'.
+         _buffer = _buffer.Slice(initialLength - cursor.Length);
+         Value = window;
+         Count = bitCount;
+     }
+
+     /// <summary>
+     /// Checks if we have reached the end of the buffer.
+     /// </summary>
+     /// <remarks>
+     /// <see cref="Count"/> stores the number of bits in the <see cref="Value"/> buffer, minus
+     /// 8. The top byte is part of the algorithm, and the remainder is buffered
+     /// to be shifted into it. So if Count == 8, the top 16 bits of <see cref="Value"/> are
+     /// occupied, 8 for the algorithm and 8 in the buffer.
+     ///
+     /// When reading a byte from the user's buffer, Count is filled with 8 and
+     /// one byte is filled into the value buffer. When we reach the end of the
+     /// data, Count is additionally filled with LotsOfBits. So when
+     /// Count == LotsOfBits - 1, the user's data has been exhausted.
+     /// </remarks>
+     /// <returns>
+     /// True if we have tried to decode bits after the end of stream was encountered,
+     /// false when there is no error.
+     /// </returns>
+     public bool HasError() => Count > BdValueSize && Count < LotsOfBits;
+
+     /// <summary>
+     /// Decodes one boolean.
+     /// </summary>
+     /// <param name="prob">Probability used to compute the split point of the current range.</param>
+     /// <returns>1 when the buffered value is at or above the split point, 0 otherwise.</returns>
+     public int Read(int prob)
+     {
+         uint split = (Range * (uint)prob + (256 - (uint)prob)) >> 8;
+
+         if (Count < 0)
+         {
+             Fill();
+         }
+
+         ulong value = Value;
+         int count = Count;
+         ulong bigsplit = (ulong)split << (BdValueSize - 8);
+
+         uint range;
+         int bit;
+
+         if (value >= bigsplit)
+         {
+             range = Range - split;
+             value -= bigsplit;
+             bit = 1;
+         }
+         else
+         {
+             range = split;
+             bit = 0;
+         }
+
+         // Renormalize.
+         int shift = Norm[range];
+
+         Range = range << shift;
+         Value = value << shift;
+         Count = count - shift;
+
+         return bit;
+     }
+
+     /// <summary>
+     /// Decodes one boolean with probability 128.
+     /// </summary>
+     public int ReadBit() => Read(128); // vpx_prob_half
+
+     /// <summary>
+     /// Decodes <paramref name="bits"/> bits, most significant bit first.
+     /// </summary>
+     /// <param name="bits">Number of bits to read.</param>
+     /// <returns>The bits read, packed into an integer.</returns>
+     public int ReadLiteral(int bits)
+     {
+         int literal = 0;
+
+         for (int position = bits - 1; position >= 0; position--)
+         {
+             literal |= ReadBit() << position;
+         }
+
+         return literal;
+     }
+
+     /// <summary>
+     /// Walks <paramref name="tree"/> from index 0, decoding one boolean per node, until an entry
+     /// that is zero or negative is reached.
+     /// </summary>
+     /// <param name="tree">Tree entries; positive entries are indices of the next node pair.</param>
+     /// <param name="probs">Probabilities, one per node pair.</param>
+     /// <returns>The negated value of the terminating entry.</returns>
+     public int ReadTree(ReadOnlySpan<sbyte> tree, ReadOnlySpan<byte> probs)
+     {
+         int node = 0;
+
+         do
+         {
+             node = tree[node + Read(probs[node >> 1])];
+         }
+         while (node > 0);
+
+         return -node;
+     }
+
+     /// <summary>
+     /// Decodes one boolean using decoder state held by the caller instead of the fields of this
+     /// struct. The fields are only synchronized with the caller state around a refill.
+     /// </summary>
+     /// <param name="prob">Probability used to compute the split point of the current range.</param>
+     /// <param name="value">Caller copy of the bit window; updated.</param>
+     /// <param name="count">Caller copy of the buffered bit count; updated.</param>
+     /// <param name="range">Caller copy of the range; updated.</param>
+     /// <returns>1 when the buffered value is at or above the split point, 0 otherwise.</returns>
+     public int ReadBool(int prob, ref ulong value, ref int count, ref uint range)
+     {
+         uint split = (range * (uint)prob + (256 - (uint)prob)) >> 8;
+         ulong bigsplit = (ulong)split << (BdValueSize - 8);
+
+         if (count < 0)
+         {
+             // Fill works on the fields, so hand the caller state over and take it back afterwards.
+             Value = value;
+             Count = count;
+             Fill();
+             value = Value;
+             count = Count;
+         }
+
+         int bit;
+
+         if (value >= bigsplit)
+         {
+             range -= split;
+             value -= bigsplit;
+             bit = 1;
+         }
+         else
+         {
+             range = split;
+             bit = 0;
+         }
+
+         // Renormalize.
+         int shift = Norm[range];
+         range <<= shift;
+         value <<= shift;
+         count -= shift;
+
+         return bit;
+     }
+
+     /// <summary>
+     /// Find the end of the coded buffer: the buffer is moved back by one byte for every
+     /// 8 bits that are still buffered.
+     /// </summary>
+     /// <returns>The adjusted buffer.</returns>
+     public ArrayPtr<byte> FindEnd()
+     {
+         while (true)
+         {
+             if (Count <= 8 || Count >= BdValueSize)
+             {
+                 break;
+             }
+
+             Count -= 8;
+             _buffer = _buffer.Slice(-1);
+         }
+
+         return _buffer;
+     }
+ }
+}
diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs
new file mode 100644
index 00000000..e041f2e0
--- /dev/null
+++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs
@@ -0,0 +1,54 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+ /// <summary>
+ /// Fixed-point constants shared by the idct/dct functions.
+ /// </summary>
+ internal static class TxfmCommon
+ {
+ // Constants used by all idct/dct functions
+ // DctConstBits is the number of fractional bits of the constants below (16384 = 1 << 14).
+ public const int DctConstBits = 14;
+ // Half of 1 << DctConstBits.
+ public const int DctConstRounding = 1 << (DctConstBits - 1);
+
+ public const int UnitQuantShift = 2;
+ // 1 << UnitQuantShift, i.e. 4.
+ public const int UnitQuantFactor = 1 << UnitQuantShift;
+
+ // Constants:
+ // for (int i = 1; i < 32; ++i)
+ // Console.WriteLine("public const short CosPi{0}_64 = {1};", i, MathF.Round(16384 * MathF.Cos(i * MathF.PI / 64)));
+ // Note: sin(k * Pi / 64) = cos((32 - k) * Pi / 64)
+ // CosPiK_64 is therefore cos(K * Pi / 64) scaled by 16384 and rounded to the nearest integer.
+ public const short CosPi1_64 = 16364;
+ public const short CosPi2_64 = 16305;
+ public const short CosPi3_64 = 16207;
+ public const short CosPi4_64 = 16069;
+ public const short CosPi5_64 = 15893;
+ public const short CosPi6_64 = 15679;
+ public const short CosPi7_64 = 15426;
+ public const short CosPi8_64 = 15137;
+ public const short CosPi9_64 = 14811;
+ public const short CosPi10_64 = 14449;
+ public const short CosPi11_64 = 14053;
+ public const short CosPi12_64 = 13623;
+ public const short CosPi13_64 = 13160;
+ public const short CosPi14_64 = 12665;
+ public const short CosPi15_64 = 12140;
+ public const short CosPi16_64 = 11585;
+ public const short CosPi17_64 = 11003;
+ public const short CosPi18_64 = 10394;
+ public const short CosPi19_64 = 9760;
+ public const short CosPi20_64 = 9102;
+ public const short CosPi21_64 = 8423;
+ public const short CosPi22_64 = 7723;
+ public const short CosPi23_64 = 7005;
+ public const short CosPi24_64 = 6270;
+ public const short CosPi25_64 = 5520;
+ public const short CosPi26_64 = 4756;
+ public const short CosPi27_64 = 3981;
+ public const short CosPi28_64 = 3196;
+ public const short CosPi29_64 = 2404;
+ public const short CosPi30_64 = 1606;
+ public const short CosPi31_64 = 804;
+
+ // 16384 * sqrt(2) * sin(kPi / 9) * 2 / 3
+ // SinPiK_9 is that expression evaluated for k = K and rounded to the nearest integer.
+ public const short SinPi1_9 = 5283;
+ public const short SinPi2_9 = 9929;
+ public const short SinPi3_9 = 13377;
+ public const short SinPi4_9 = 15212;
+ }
+}