diff options
| author | TSR Berry <20988865+TSRBerry@users.noreply.github.com> | 2023-04-08 01:22:00 +0200 |
|---|---|---|
| committer | Mary <thog@protonmail.com> | 2023-04-27 23:51:14 +0200 |
| commit | cee712105850ac3385cd0091a923438167433f9f (patch) | |
| tree | 4a5274b21d8b7f938c0d0ce18736d3f2993b11b1 /src/Ryujinx.Graphics.Nvdec.Vp9 | |
| parent | cd124bda587ef09668a971fa1cac1c3f0cfc9f21 (diff) | |
Move solution and projects to src
Diffstat (limited to 'src/Ryujinx.Graphics.Nvdec.Vp9')
61 files changed, 14950 insertions, 0 deletions
// ---- diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs
// ---- new file mode 100644, index 00000000..b7b70953
namespace Ryujinx.Graphics.Nvdec.Vp9
{
    /// <summary>
    /// Supported sample bit depths; each member's numeric value is the bit count.
    /// </summary>
    internal enum BitDepth
    {
        Bits8 = 8, /**< 8 bits */
        Bits10 = 10, /**< 10 bits */
        Bits12 = 12, /**< 12 bits */
    }
}

// ---- diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs
// ---- new file mode 100644, index 00000000..b695fed5
namespace Ryujinx.Graphics.Nvdec.Vp9
{
    /// <summary>
    /// Result codes reported by the decoder.
    /// </summary>
    internal enum CodecErr
    {
        /*!\brief Operation completed without error */
        CodecOk,

        /*!\brief Unspecified error */
        CodecError,

        /*!\brief Memory operation failed */
        CodecMemError,

        /*!\brief ABI version mismatch */
        CodecAbiMismatch,

        /*!\brief Algorithm does not have required capability */
        CodecIncapable,

        /*!\brief The given bitstream is not supported.
         *
         * The bitstream was unable to be parsed at the highest level. The decoder
         * is unable to proceed. This error \ref SHOULD be treated as fatal to the
         * stream. */
        CodecUnsupBitstream,

        /*!\brief Encoded bitstream uses an unsupported feature
         *
         * The decoder does not implement a feature required by the encoder. This
         * return code should only be used for features that prevent future
         * pictures from being properly decoded. This error \ref MAY be treated as
         * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
         */
        CodecUnsupFeature,

        /*!\brief The coded data for this stream is corrupt or incomplete
         *
         * There was a problem decoding the current frame. This return code
         * should only be used for failures that prevent future pictures from
         * being properly decoded. This error \ref MAY be treated as fatal to the
         * stream or \ref MAY be treated as fatal to the current GOP. If decoding
         * is continued for the current GOP, artifacts may be present.
         */
        CodecCorruptFrame,

        /*!\brief An application-supplied parameter is not valid.
         *
         */
        CodecInvalidParam,

        /*!\brief An iterator reached the end of list.
         *
         */
        CodecListEnd
    }
}

// ---- diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs
// ---- new file mode 100644, index 00000000..641188f8
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;

namespace Ryujinx.Graphics.Nvdec.Vp9.Common
{
    /// <summary>
    /// Small integer helpers: pixel clamping, rounding shifts, alignment and bit counting.
    /// </summary>
    internal static class BitUtils
    {
        /// <summary>Clamps <paramref name="val"/> to the range [0, 255].</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static byte ClipPixel(int val)
        {
            return (byte)((val > 255) ? 255 : (val < 0) ? 0 : val);
        }

        /// <summary>
        /// Clamps <paramref name="val"/> to [0, 1023] when <paramref name="bd"/> is 10,
        /// to [0, 4095] when it is 12, and to [0, 255] for any other value.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static ushort ClipPixelHighbd(int val, int bd)
        {
            return bd switch
            {
                10 => (ushort)Math.Clamp(val, 0, 1023),
                12 => (ushort)Math.Clamp(val, 0, 4095),
                _ => (ushort)Math.Clamp(val, 0, 255)
            };
        }

        /// <summary>
        /// Adds half of 2^<paramref name="n"/> and shifts right by <paramref name="n"/>.
        /// The expression shifts by <c>n - 1</c>, so it is only meaningful for n &gt;= 1.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int RoundPowerOfTwo(int value, int n)
        {
            return (value + (1 << (n - 1))) >> n;
        }

        /// <summary>64-bit variant of <see cref="RoundPowerOfTwo(int, int)"/>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static long RoundPowerOfTwo(long value, int n)
        {
            return (value + (1L << (n - 1))) >> n;
        }

        /// <summary>Rounds <paramref name="value"/> up to a multiple of 2^<paramref name="n"/>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int AlignPowerOfTwo(int value, int n)
        {
            return (value + ((1 << n) - 1)) & ~((1 << n) - 1);
        }

        /// <summary>Index of the highest set bit of a non-zero <paramref name="n"/>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static int GetMsb(uint n)
        {
            Debug.Assert(n != 0);
            return 31 ^ BitOperations.LeadingZeroCount(n);
        }

        /// <summary>
        /// Returns the position of the highest set bit of <paramref name="numValues"/> plus one,
        /// or 0 when <paramref name="numValues"/> is 0.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int GetUnsignedBits(uint numValues)
        {
            return numValues > 0 ? GetMsb(numValues) + 1 : 0;
        }
    }
}

// ---- diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs
// ---- new file mode 100644, index 00000000..473dd904
using Ryujinx.Common.Memory;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

namespace Ryujinx.Graphics.Nvdec.Vp9.Common
{
    /// <summary>
    /// Unmanaged-memory allocator backed by a fixed pool of <see cref="PoolEntries"/> slots.
    /// A released block stays in its slot and is handed out again when a later request
    /// asks for exactly the same byte length; otherwise an idle slot is recycled.
    /// </summary>
    internal class MemoryAllocator : IDisposable
    {
        private const int PoolEntries = 10;

        private struct PoolItem
        {
            public IntPtr Pointer;
            public int Length;
            public bool InUse;
        }

        private PoolItem[] _pool = new PoolItem[PoolEntries];

        // Blocks allocated while every pool slot was busy. They are tracked here so that
        // Free/Dispose can still release them instead of leaking them.
        private readonly List<IntPtr> _overflow = new List<IntPtr>();

        /// <summary>
        /// Returns a block able to hold <paramref name="length"/> elements of <typeparamref name="T"/>.
        /// </summary>
        /// <param name="length">Number of elements requested.</param>
        /// <returns>An <see cref="ArrayPtr{T}"/> over the block.</returns>
        public ArrayPtr<T> Allocate<T>(int length) where T : unmanaged
        {
            int lengthInBytes = Unsafe.SizeOf<T>() * length;

            // Reuse an idle block of exactly the requested size. The pointer check keeps
            // never-populated (or disposed) slots from matching: those have no memory behind
            // them, and marking them in use would strand the slot with a null pointer.
            for (int i = 0; i < PoolEntries; i++)
            {
                ref PoolItem item = ref _pool[i];

                if (!item.InUse && item.Pointer != IntPtr.Zero && item.Length == lengthInBytes)
                {
                    item.InUse = true;

                    return new ArrayPtr<T>(item.Pointer, length);
                }
            }

            IntPtr ptr = Marshal.AllocHGlobal(lengthInBytes);
            bool tracked = false;

            // Park the new block in the first idle slot, releasing whatever idle block
            // (of a different size) was cached there.
            for (int i = 0; i < PoolEntries; i++)
            {
                ref PoolItem item = ref _pool[i];

                if (!item.InUse)
                {
                    if (item.Pointer != IntPtr.Zero)
                    {
                        Marshal.FreeHGlobal(item.Pointer);
                    }

                    item.Pointer = ptr;
                    item.Length = lengthInBytes;
                    item.InUse = true;
                    tracked = true;
                    break;
                }
            }

            if (!tracked)
            {
                // Pool exhausted: remember the block so it can still be released.
                _overflow.Add(ptr);
            }

            return new ArrayPtr<T>(ptr, length);
        }

        /// <summary>
        /// Releases a block obtained from <see cref="Allocate{T}"/>. Pooled blocks are only
        /// marked idle (the memory stays cached); overflow blocks are freed immediately.
        /// Pointers this allocator does not know are ignored.
        /// </summary>
        public unsafe void Free<T>(ArrayPtr<T> arr) where T : unmanaged
        {
            IntPtr ptr = (IntPtr)arr.ToPointer();

            for (int i = 0; i < PoolEntries; i++)
            {
                ref PoolItem item = ref _pool[i];

                if (item.Pointer == ptr)
                {
                    item.InUse = false;

                    return;
                }
            }

            if (_overflow.Remove(ptr))
            {
                Marshal.FreeHGlobal(ptr);
            }
        }

        /// <summary>
        /// Frees every block still owned by the allocator and resets all slots.
        /// </summary>
        public void Dispose()
        {
            for (int i = 0; i < PoolEntries; i++)
            {
                ref PoolItem item = ref _pool[i];

                if (item.Pointer != IntPtr.Zero)
                {
                    Marshal.FreeHGlobal(item.Pointer);
                }

                // Clear Length/InUse too so a stale entry can never be matched again.
                item = default;
            }

            foreach (IntPtr ptr in _overflow)
            {
                Marshal.FreeHGlobal(ptr);
            }

            _overflow.Clear();
        }
    }
}
// \ No newline at end of file   (diff marker belonging to the preceding file)

// ---- diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs
// ---- new file mode 100644, index 00000000..909a9483
using System;
using System.Runtime.InteropServices;

namespace Ryujinx.Graphics.Nvdec.Vp9.Common
{
    /// <summary>
    /// Span-based copy and fill helpers for raw pointers and references.
    /// </summary>
    internal static class MemoryUtil
    {
        /// <summary>Copies <paramref name="length"/> elements from <paramref name="source"/> to <paramref name="dest"/>.</summary>
        public static unsafe void Copy<T>(T* dest, T* source, int length) where T : unmanaged
        {
            new Span<T>(source, length).CopyTo(new Span<T>(dest, length));
        }

        /// <summary>Copies a single element from <paramref name="source"/> to <paramref name="dest"/>.</summary>
        public static void Copy<T>(ref T dest, ref T source) where T : unmanaged
        {
            MemoryMarshal.CreateSpan(ref source, 1).CopyTo(MemoryMarshal.CreateSpan(ref dest, 1));
        }

        /// <summary>Writes <paramref name="value"/> into <paramref name="length"/> elements starting at <paramref name="ptr"/>.</summary>
        public static unsafe void Fill<T>(T* ptr, T value, int length) where T : unmanaged
        {
            new Span<T>(ptr, length).Fill(value);
        }
    }
}

// ---- diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs
// ---- new file mode 100644, index 00000000..aaf1d7b9
namespace Ryujinx.Graphics.Nvdec.Vp9
{
    /// <summary>
    /// Numeric constants shared by the VP9 decoder.
    /// </summary>
    internal static class Constants
    {
        public const int Vp9InterpExtend = 4;

        public const int MaxMbPlane = 3;

        public const int None = -1;
        public const int IntraFrame = 0;
        public const int LastFrame = 1;
        public const int GoldenFrame = 2;
        public const int AltRefFrame = 3;
        public const int MaxRefFrames = 4;

        public const int MiSizeLog2 = 3;
        public const int MiBlockSizeLog2 = 6 - MiSizeLog2; // 64 = 2^6

        public const int MiSize = 1 << MiSizeLog2; // pixels per mi-unit
        public const int MiBlockSize = 1 << MiBlockSizeLog2; // mi-units per max block
        public const int MiMask = MiBlockSize - 1;

        public const int PartitionPloffset = 4; // number of probability models per block size

        /* Segment Feature Masks */
        public const int MaxMvRefCandidates = 2;

        public const int CompInterContexts = 5;
        public const int RefContexts = 5;

        public const int EightTap = 0;
        public const int EightTapSmooth = 1;
        public const int EightTapSharp = 2;
        public const int SwitchableFilters = 3; /* Number of switchable filters */
        public const int Bilinear = 3;
        public const int Switchable = 4; /* should be the last one */

        // Frame
        public const int RefsPerFrame = 3;

        public const int NumPingPongBuffers = 2;

        public const int Class0Bits = 1; /* bits at integer precision for class 0 */
        public const int Class0Size = 1 << Class0Bits;

        public const int MvInUseBits = 14;
        public const int MvUpp = (1 << MvInUseBits) - 1;
        public const int MvLow = -(1 << MvInUseBits);

        // Coefficient token alphabet
        public const int ZeroToken = 0; // 0 Extra Bits 0+0
        public const int OneToken = 1; // 1 Extra Bits 0+1
        public const int TwoToken = 2; // 2 Extra Bits 0+1

        public const int PivotNode = 2;

        public const int Cat1MinVal = 5;
        public const int Cat2MinVal = 7;
        public const int Cat3MinVal = 11;
        public const int Cat4MinVal = 19;
        public const int Cat5MinVal = 35;
        public const int Cat6MinVal = 67;

        public const int EobModelToken = 3;

        public const int SegmentAbsData = 1;
        public const int MaxSegments = 8;
    }
}

// ---- diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs
// ---- new file mode 100644, index 00000000..cdd645a3
using Ryujinx.Common.Memory;
using Ryujinx.Graphics.Nvdec.Vp9.Common;
using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
using Ryujinx.Graphics.Nvdec.Vp9.Types;
using Ryujinx.Graphics.Video;
using System;
using System.Buffers.Binary;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv;

namespace Ryujinx.Graphics.Nvdec.Vp9
{
    static class DecodeFrame
    {
        /// <summary>
        /// Checks that <paramref name="len"/> bytes can be read from <paramref name="start"/>:
        /// the length must be positive and no larger than the buffer.
        /// </summary>
        private static bool ReadIsValid(ArrayPtr<byte> start, int len)
        {
            // "len > 0" rather than "len != 0": sizes come from a 32-bit field in the
            // bitstream and arrive here as a signed int, so a corrupt value can be negative.
            return len > 0 && len <= start.Length;
        }

        /// <summary>
        /// Applies the inverse transform for an inter block (plain IDCT, or the 4x4 WHT when
        /// lossless) onto <paramref name="dst"/>, then zeroes the dequantized coefficients
        /// that were consumed. Requires <paramref name="eob"/> &gt; 0.
        /// </summary>
        private static void InverseTransformBlockInter(ref MacroBlockD xd, int plane, TxSize txSize, Span<byte> dst, int stride, int eob)
        {
            ref MacroBlockDPlane pd = ref xd.Plane[plane];
            ArrayPtr<int> dqcoeff = pd.DqCoeff;
            Debug.Assert(eob > 0);
            if (xd.CurBuf.HighBd)
            {
                // High bit depth: the destination holds 16-bit samples.
                Span<ushort> dst16 = MemoryMarshal.Cast<byte, ushort>(dst);
                if (xd.Lossless)
                {
                    Idct.HighbdIwht4x4Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                }
                else
                {
                    switch (txSize)
                    {
                        case TxSize.Tx4x4:
                            Idct.HighbdIdct4x4Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                            break;
                        case TxSize.Tx8x8:
                            Idct.HighbdIdct8x8Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                            break;
                        case TxSize.Tx16x16:
                            Idct.HighbdIdct16x16Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                            break;
                        case TxSize.Tx32x32:
                            Idct.HighbdIdct32x32Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                            break;
                        default: Debug.Assert(false, "Invalid transform size"); break;
                    }
                }
            }
            else
            {
                if (xd.Lossless)
                {
                    Idct.Iwht4x4Add(dqcoeff.AsSpan(), dst, stride, eob);
                }
                else
                {
                    switch (txSize)
                    {
                        case TxSize.Tx4x4: Idct.Idct4x4Add(dqcoeff.AsSpan(), dst, stride, eob); break;
                        case TxSize.Tx8x8: Idct.Idct8x8Add(dqcoeff.AsSpan(), dst, stride, eob); break;
                        case TxSize.Tx16x16: Idct.Idct16x16Add(dqcoeff.AsSpan(), dst, stride, eob); break;
                        case TxSize.Tx32x32: Idct.Idct32x32Add(dqcoeff.AsSpan(), dst, stride, eob); break;
                        default: Debug.Assert(false, "Invalid transform size"); return;
                    }
                }
            }

            // Clear only as much of the coefficient buffer as the eob indicates was used.
            if (eob == 1)
            {
                dqcoeff.AsSpan()[0] = 0;
            }
            else
            {
                if (txSize <= TxSize.Tx16x16 && eob <= 10)
                {
                    dqcoeff.AsSpan().Slice(0, 4 * (4 << (int)txSize)).Fill(0);
                }
                else if (txSize == TxSize.Tx32x32 && eob <= 34)
                {
                    dqcoeff.AsSpan().Slice(0, 256).Fill(0);
                }
                else
                {
                    dqcoeff.AsSpan().Slice(0, 16 << ((int)txSize << 1)).Fill(0);
                }
            }
        }

        /// <summary>
        /// Intra counterpart of <see cref="InverseTransformBlockInter"/>: uses the hybrid
        /// transform selected by <paramref name="txType"/> for sizes up to 16x16 and the
        /// plain 32x32 IDCT otherwise, then zeroes the consumed coefficients.
        /// </summary>
        private static void InverseTransformBlockIntra(
            ref MacroBlockD xd,
            int plane,
            TxType txType,
            TxSize txSize,
            Span<byte> dst,
            int stride,
            int eob)
        {
            ref MacroBlockDPlane pd = ref xd.Plane[plane];
            ArrayPtr<int> dqcoeff = pd.DqCoeff;
            Debug.Assert(eob > 0);
            if (xd.CurBuf.HighBd)
            {
                Span<ushort> dst16 = MemoryMarshal.Cast<byte, ushort>(dst);
                if (xd.Lossless)
                {
                    Idct.HighbdIwht4x4Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                }
                else
                {
                    switch (txSize)
                    {
                        case TxSize.Tx4x4:
                            Idct.HighbdIht4x4Add(txType, dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                            break;
                        case TxSize.Tx8x8:
                            Idct.HighbdIht8x8Add(txType, dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                            break;
                        case TxSize.Tx16x16:
                            Idct.HighbdIht16x16Add(txType, dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                            break;
                        case TxSize.Tx32x32:
                            Idct.HighbdIdct32x32Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd);
                            break;
                        default: Debug.Assert(false, "Invalid transform size"); break;
                    }
                }
            }
            else
            {
                if (xd.Lossless)
                {
                    Idct.Iwht4x4Add(dqcoeff.AsSpan(), dst, stride, eob);
                }
                else
                {
                    switch (txSize)
                    {
                        case TxSize.Tx4x4: Idct.Iht4x4Add(txType, dqcoeff.AsSpan(), dst, stride, eob); break;
                        case TxSize.Tx8x8: Idct.Iht8x8Add(txType, dqcoeff.AsSpan(), dst, stride, eob); break;
                        case TxSize.Tx16x16: Idct.Iht16x16Add(txType, dqcoeff.AsSpan(), dst, stride, eob); break;
                        case TxSize.Tx32x32: Idct.Idct32x32Add(dqcoeff.AsSpan(), dst, stride, eob); break;
                        default: Debug.Assert(false, "Invalid transform size"); return;
                    }
                }
            }

            if (eob == 1)
            {
                dqcoeff.AsSpan()[0] = 0;
            }
            else
            {
                // The short clear is only taken for pure DCT blocks.
                if (txType == TxType.DctDct && txSize <= TxSize.Tx16x16 && eob <= 10)
                {
                    dqcoeff.AsSpan().Slice(0, 4 * (4 << (int)txSize)).Fill(0);
                }
                else if (txSize == TxSize.Tx32x32 && eob <= 34)
                {
                    dqcoeff.AsSpan().Slice(0, 256).Fill(0);
                }
                else
                {
                    dqcoeff.AsSpan().Slice(0, 16 << ((int)txSize << 1)).Fill(0);
                }
            }
        }

        /// <summary>
        /// Predicts one intra transform block in place in the destination plane and, unless
        /// the block is flagged as skipped, decodes its tokens and adds the residual.
        /// <paramref name="row"/> and <paramref name="col"/> are in 4-sample units.
        /// </summary>
        private static unsafe void PredictAndReconstructIntraBlock(
            ref TileWorkerData twd,
            ref ModeInfo mi,
            int plane,
            int row,
            int col,
            TxSize txSize)
        {
            ref MacroBlockD xd = ref twd.Xd;
            ref MacroBlockDPlane pd = ref xd.Plane[plane];
            PredictionMode mode = (plane == 0) ? mi.Mode : mi.UvMode;
            int dstOffset = 4 * row * pd.Dst.Stride + 4 * col;
            byte* dst = &pd.Dst.Buf.ToPointer()[dstOffset];
            Span<byte> dstSpan = pd.Dst.Buf.AsSpan().Slice(dstOffset);

            if (mi.SbType < BlockSize.Block8x8)
            {
                if (plane == 0)
                {
                    // Sub-8x8 luma: each sub-block carries its own mode.
                    mode = xd.Mi[0].Value.Bmi[(row << 1) + col].Mode;
                }
            }

            ReconIntra.PredictIntraBlock(ref xd, pd.N4Wl, txSize, mode, dst, pd.Dst.Stride, dst, pd.Dst.Stride, col, row, plane);

            if (mi.Skip == 0)
            {
                TxType txType =
                    (plane != 0 || xd.Lossless) ? TxType.DctDct : ReconIntra.IntraModeToTxTypeLookup[(int)mode];
                var sc = (plane != 0 || xd.Lossless)
                    ? Luts.Vp9DefaultScanOrders[(int)txSize]
                    : Luts.Vp9ScanOrders[(int)txSize][(int)txType];
                int eob = Detokenize.DecodeBlockTokens(ref twd, plane, sc, col, row, txSize, mi.SegmentId);
                if (eob > 0)
                {
                    InverseTransformBlockIntra(ref xd, plane, txType, txSize, dstSpan, pd.Dst.Stride, eob);
                }
            }
        }

        /// <summary>
        /// Decodes the tokens of one inter transform block and adds the residual to the
        /// destination plane.
        /// </summary>
        /// <returns>The end-of-block value returned by the token decoder.</returns>
        private static int ReconstructInterBlock(
            ref TileWorkerData twd,
            ref ModeInfo mi,
            int plane,
            int row,
            int col,
            TxSize txSize)
        {
            ref MacroBlockD xd = ref twd.Xd;
            ref MacroBlockDPlane pd = ref xd.Plane[plane];
            var sc = Luts.Vp9DefaultScanOrders[(int)txSize];
            int eob = Detokenize.DecodeBlockTokens(ref twd, plane, sc, col, row, txSize, mi.SegmentId);
            Span<byte> dst = pd.Dst.Buf.AsSpan().Slice(4 * row * pd.Dst.Stride + 4 * col);

            if (eob > 0)
            {
                InverseTransformBlockInter(ref xd, plane, txSize, dst, pd.Dst.Stride, eob);
            }
            return eob;
        }

        /// <summary>
        /// Builds a <paramref name="bW"/> x <paramref name="bH"/> block in <paramref name="dst"/>
        /// from a reference plane of size <paramref name="w"/> x <paramref name="h"/>, where
        /// <paramref name="src"/> points at position (<paramref name="x"/>, <paramref name="y"/>).
        /// Samples that fall outside the plane are replicated from the nearest edge sample.
        /// </summary>
        private static unsafe void BuildMcBorder(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            int x,
            int y,
            int bW,
            int bH,
            int w,
            int h)
        {
            // Get a pointer to the start of the real data for this row.
            byte* refRow = src - x - y * srcStride;

            if (y >= h)
            {
                refRow += (h - 1) * srcStride;
            }
            else if (y > 0)
            {
                refRow += y * srcStride;
            }

            do
            {
                int right = 0, copy;
                int left = x < 0 ? -x : 0;

                if (left > bW)
                {
                    left = bW;
                }

                if (x + bW > w)
                {
                    right = x + bW - w;
                }

                if (right > bW)
                {
                    right = bW;
                }

                copy = bW - left - right;

                if (left != 0)
                {
                    MemoryUtil.Fill(dst, refRow[0], left);
                }

                if (copy != 0)
                {
                    MemoryUtil.Copy(dst + left, refRow + x + left, copy);
                }

                if (right != 0)
                {
                    MemoryUtil.Fill(dst + left + copy, refRow[w - 1], right);
                }

                dst += dstStride;
                ++y;

                // Only advance the source row while inside the plane.
                if (y > 0 && y < h)
                {
                    refRow += srcStride;
                }
            } while (--bH != 0);
        }

        /// <summary>
        /// 16-bit-sample variant of <see cref="BuildMcBorder"/>; <paramref name="src8"/> is
        /// reinterpreted as a pointer to <c>ushort</c> samples.
        /// </summary>
        private static unsafe void HighBuildMcBorder(
            byte* src8,
            int srcStride,
            ushort* dst,
            int dstStride,
            int x,
            int y,
            int bW,
            int bH,
            int w,
            int h)
        {
            // Get a pointer to the start of the real data for this row.
            ushort* src = (ushort*)src8;
            ushort* refRow = src - x - y * srcStride;

            if (y >= h)
            {
                refRow += (h - 1) * srcStride;
            }
            else if (y > 0)
            {
                refRow += y * srcStride;
            }

            do
            {
                int right = 0, copy;
                int left = x < 0 ? -x : 0;

                if (left > bW)
                {
                    left = bW;
                }

                if (x + bW > w)
                {
                    right = x + bW - w;
                }

                if (right > bW)
                {
                    right = bW;
                }

                copy = bW - left - right;

                if (left != 0)
                {
                    MemoryUtil.Fill(dst, refRow[0], left);
                }

                if (copy != 0)
                {
                    MemoryUtil.Copy(dst + left, refRow + x + left, copy);
                }

                if (right != 0)
                {
                    MemoryUtil.Fill(dst + left + copy, refRow[w - 1], right);
                }

                dst += dstStride;
                ++y;

                if (y > 0 && y < h)
                {
                    refRow += srcStride;
                }
            } while (--bH != 0);
        }

        /// <summary>
        /// Copies the reference block into a stack scratch buffer with its borders extended,
        /// then runs the inter predictor from that buffer into <paramref name="dst"/>.
        /// </summary>
        [SkipLocalsInit]
        private static unsafe void ExtendAndPredict(
            byte* bufPtr1,
            int preBufStride,
            int x0,
            int y0,
            int bW,
            int bH,
            int frameWidth,
            int frameHeight,
            int borderOffset,
            byte* dst,
            int dstBufStride,
            int subpelX,
            int subpelY,
            Array8<short>[] kernel,
            ref ScaleFactors sf,
            ref MacroBlockD xd,
            int w,
            int h,
            int refr,
            int xs,
            int ys)
        {
            // Scratch buffer shared by both paths; the 8-bit path views it as bytes.
            ushort* mcBufHigh = stackalloc ushort[80 * 2 * 80 * 2];
            if (xd.CurBuf.HighBd)
            {
                HighBuildMcBorder(bufPtr1, preBufStride, mcBufHigh, bW, x0, y0, bW, bH, frameWidth, frameHeight);
                ReconInter.HighbdInterPredictor(
                    mcBufHigh + borderOffset,
                    bW,
                    (ushort*)dst,
                    dstBufStride,
                    subpelX,
                    subpelY,
                    ref sf,
                    w,
                    h,
                    refr,
                    kernel,
                    xs,
                    ys,
                    xd.Bd);
            }
            else
            {
                BuildMcBorder(bufPtr1, preBufStride, (byte*)mcBufHigh, bW, x0, y0, bW, bH, frameWidth, frameHeight);
                ReconInter.InterPredictor(
                    (byte*)mcBufHigh + borderOffset,
                    bW,
                    dst,
                    dstBufStride,
                    subpelX,
                    subpelY,
                    ref sf,
                    w,
                    h,
                    refr,
                    kernel,
                    xs,
                    ys);
            }
        }

        /// <summary>
        /// Builds the inter prediction of one <paramref name="w"/> x <paramref name="h"/> block
        /// of <paramref name="plane"/> at offset (<paramref name="x"/>, <paramref name="y"/>)
        /// inside the current block, reading from <paramref name="refFrameBuf"/>. When the
        /// referenced area reaches outside the reference plane, prediction goes through
        /// <see cref="ExtendAndPredict"/>.
        /// </summary>
        private static unsafe void DecBuildInterPredictors(
            ref MacroBlockD xd,
            int plane,
            int bw,
            int bh,
            int x,
            int y,
            int w,
            int h,
            int miX,
            int miY,
            Array8<short>[] kernel,
            ref ScaleFactors sf,
            ref Buf2D preBuf,
            ref Buf2D dstBuf,
            ref Mv mv,
            ref Surface refFrameBuf,
            bool isScaled,
            int refr)
        {
            ref MacroBlockDPlane pd = ref xd.Plane[plane];
            byte* dst = dstBuf.Buf.ToPointer() + dstBuf.Stride * y + x;
            Mv32 scaledMv;
            int xs, ys, x0, y0, x0_16, y0_16, frameWidth, frameHeight, bufStride, subpelX, subpelY;
            byte* refFrame;
            byte* bufPtr;

            // Get reference frame pointer, width and height.
            if (plane == 0)
            {
                frameWidth = refFrameBuf.Width;
                frameHeight = refFrameBuf.Height;
                refFrame = refFrameBuf.YBuffer.ToPointer();
            }
            else
            {
                frameWidth = refFrameBuf.UvWidth;
                frameHeight = refFrameBuf.UvHeight;
                refFrame = plane == 1 ? refFrameBuf.UBuffer.ToPointer() : refFrameBuf.VBuffer.ToPointer();
            }

            if (isScaled)
            {
                Mv mvQ4 = ReconInter.ClampMvToUmvBorderSb(ref xd, ref mv, bw, bh, pd.SubsamplingX, pd.SubsamplingY);
                // Co-ordinate of containing block to pixel precision.
                int xStart = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX));
                int yStart = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY));
                // Co-ordinate of the block to 1/16th pixel precision.
                x0_16 = (xStart + x) << Filter.SubpelBits;
                y0_16 = (yStart + y) << Filter.SubpelBits;

                // Co-ordinate of current block in reference frame
                // to 1/16th pixel precision.
                x0_16 = sf.ScaleValueX(x0_16);
                y0_16 = sf.ScaleValueY(y0_16);

                // Map the top left corner of the block into the reference frame.
                x0 = sf.ScaleValueX(xStart + x);
                y0 = sf.ScaleValueY(yStart + y);

                // Scale the MV and incorporate the sub-pixel offset of the block
                // in the reference frame.
                scaledMv = sf.ScaleMv(ref mvQ4, miX + x, miY + y);
                xs = sf.XStepQ4;
                ys = sf.YStepQ4;
            }
            else
            {
                // Co-ordinate of containing block to pixel precision.
                x0 = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX)) + x;
                y0 = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)) + y;

                // Co-ordinate of the block to 1/16th pixel precision.
                x0_16 = x0 << Filter.SubpelBits;
                y0_16 = y0 << Filter.SubpelBits;

                scaledMv.Row = mv.Row * (1 << (1 - pd.SubsamplingY));
                scaledMv.Col = mv.Col * (1 << (1 - pd.SubsamplingX));
                xs = ys = 16;
            }
            subpelX = scaledMv.Col & Filter.SubpelMask;
            subpelY = scaledMv.Row & Filter.SubpelMask;

            // Calculate the top left corner of the best matching block in the
            // reference frame.
            x0 += scaledMv.Col >> Filter.SubpelBits;
            y0 += scaledMv.Row >> Filter.SubpelBits;
            x0_16 += scaledMv.Col;
            y0_16 += scaledMv.Row;

            // Get reference block pointer.
            bufPtr = refFrame + y0 * preBuf.Stride + x0;
            bufStride = preBuf.Stride;

            // Do border extension if there is motion or the
            // width/height is not a multiple of 8 pixels.
            if (isScaled || scaledMv.Col != 0 || scaledMv.Row != 0 || (frameWidth & 0x7) != 0 || (frameHeight & 0x7) != 0)
            {
                int y1 = ((y0_16 + (h - 1) * ys) >> Filter.SubpelBits) + 1;

                // Get reference block bottom right horizontal coordinate.
                int x1 = ((x0_16 + (w - 1) * xs) >> Filter.SubpelBits) + 1;
                int xPad = 0, yPad = 0;

                if (subpelX != 0 || (sf.XStepQ4 != Filter.SubpelShifts))
                {
                    x0 -= Constants.Vp9InterpExtend - 1;
                    x1 += Constants.Vp9InterpExtend;
                    xPad = 1;
                }

                if (subpelY != 0 || (sf.YStepQ4 != Filter.SubpelShifts))
                {
                    y0 -= Constants.Vp9InterpExtend - 1;
                    y1 += Constants.Vp9InterpExtend;
                    yPad = 1;
                }

                // Skip border extension if block is inside the frame.
                if (x0 < 0 || x0 > frameWidth - 1 || x1 < 0 || x1 > frameWidth - 1 ||
                    y0 < 0 || y0 > frameHeight - 1 || y1 < 0 || y1 > frameHeight - 1)
                {
                    // Extend the border.
                    byte* bufPtr1 = refFrame + y0 * bufStride + x0;
                    int bW = x1 - x0 + 1;
                    int bH = y1 - y0 + 1;
                    int borderOffset = yPad * 3 * bW + xPad * 3;

                    ExtendAndPredict(
                        bufPtr1,
                        bufStride,
                        x0,
                        y0,
                        bW,
                        bH,
                        frameWidth,
                        frameHeight,
                        borderOffset,
                        dst,
                        dstBuf.Stride,
                        subpelX,
                        subpelY,
                        kernel,
                        ref sf,
                        ref xd,
                        w,
                        h,
                        refr,
                        xs,
                        ys);
                    return;
                }
            }
            if (xd.CurBuf.HighBd)
            {
                ReconInter.HighbdInterPredictor(
                    (ushort*)bufPtr,
                    bufStride,
                    (ushort*)dst,
                    dstBuf.Stride,
                    subpelX,
                    subpelY,
                    ref sf,
                    w,
                    h,
                    refr,
                    kernel,
                    xs,
                    ys,
                    xd.Bd);
            }
            else
            {
                ReconInter.InterPredictor(
                    bufPtr,
                    bufStride,
                    dst,
                    dstBuf.Stride,
                    subpelX,
                    subpelY,
                    ref sf,
                    w,
                    h,
                    refr,
                    kernel,
                    xs,
                    ys);
            }
        }

        /// <summary>
        /// Builds the inter prediction of the current block for every plane and for each
        /// reference (one, or two for compound prediction). Blocks smaller than 8x8 are
        /// predicted one 4x4 sub-block at a time.
        /// </summary>
        private static void DecBuildInterPredictorsSb(ref Vp9Common cm, ref MacroBlockD xd, int miRow, int miCol)
        {
            int plane;
            int miX = miCol * Constants.MiSize;
            int miY = miRow * Constants.MiSize;
            ref ModeInfo mi = ref xd.Mi[0].Value;
            Array8<short>[] kernel = Luts.Vp9FilterKernels[mi.InterpFilter];
            BlockSize sbType = mi.SbType;
            int isCompound = mi.HasSecondRef() ? 1 : 0;
            int refr;
            bool isScaled;

            for (refr = 0; refr < 1 + isCompound; ++refr)
            {
                int frame = mi.RefFrame[refr];
                ref RefBuffer refBuf = ref cm.FrameRefs[frame - Constants.LastFrame];
                ref ScaleFactors sf = ref refBuf.Sf;
                ref Surface refFrameBuf = ref refBuf.Buf;

                if (!sf.IsValidScale())
                {
                    xd.ErrorInfo.Value.InternalError(CodecErr.CodecUnsupBitstream, "Reference frame has invalid dimensions");
                }

                isScaled = sf.IsScaled();
                ReconInter.SetupPrePlanes(ref xd, refr, ref refFrameBuf, miRow, miCol, isScaled ? new Ptr<ScaleFactors>(ref sf) : Ptr<ScaleFactors>.Null);
                xd.BlockRefs[refr] = new Ptr<RefBuffer>(ref refBuf);

                if (sbType < BlockSize.Block8x8)
                {
                    for (plane = 0; plane < Constants.MaxMbPlane; ++plane)
                    {
                        ref MacroBlockDPlane pd = ref xd.Plane[plane];
                        ref Buf2D dstBuf = ref pd.Dst;
                        int num4x4W = pd.N4W;
                        int num4x4H = pd.N4H;
                        int n4Wx4 = 4 * num4x4W;
                        int n4Hx4 = 4 * num4x4H;
                        ref Buf2D preBuf = ref pd.Pre[refr];
                        int i = 0, x, y;
                        for (y = 0; y < num4x4H; ++y)
                        {
                            for (x = 0; x < num4x4W; ++x)
                            {
                                Mv mv = ReconInter.AverageSplitMvs(ref pd, ref mi, refr, i++);
                                DecBuildInterPredictors(
                                    ref xd,
                                    plane,
                                    n4Wx4,
                                    n4Hx4,
                                    4 * x,
                                    4 * y,
                                    4,
                                    4,
                                    miX,
                                    miY,
                                    kernel,
                                    ref sf,
                                    ref preBuf,
                                    ref dstBuf,
                                    ref mv,
                                    ref refFrameBuf,
                                    isScaled,
                                    refr);
                            }
                        }
                    }
                }
                else
                {
                    Mv mv = mi.Mv[refr];
                    for (plane = 0; plane < Constants.MaxMbPlane; ++plane)
                    {
                        ref MacroBlockDPlane pd = ref xd.Plane[plane];
                        ref Buf2D dstBuf = ref pd.Dst;
                        int num4x4W = pd.N4W;
                        int num4x4H = pd.N4H;
                        int n4Wx4 = 4 * num4x4W;
                        int n4Hx4 = 4 * num4x4H;
                        ref Buf2D preBuf = ref pd.Pre[refr];
                        DecBuildInterPredictors(
                            ref xd,
                            plane,
                            n4Wx4,
                            n4Hx4,
                            0,
                            0,
                            n4Wx4,
                            n4Hx4,
                            miX,
                            miY,
                            kernel,
                            ref sf,
                            ref preBuf,
                            ref dstBuf,
                            ref mv,
                            ref refFrameBuf,
                            isScaled,
                            refr);
                    }
                }
            }
        }

        /// <summary>
        /// Zeroes the above and left contexts of every plane over the current block's extent.
        /// </summary>
        private static unsafe void DecResetSkipContext(ref MacroBlockD xd)
        {
            int i;
            for (i = 0; i < Constants.MaxMbPlane; i++)
            {
                ref MacroBlockDPlane pd = ref xd.Plane[i];
                MemoryUtil.Fill(pd.AboveContext.ToPointer(), (sbyte)0, pd.N4W);
                MemoryUtil.Fill(pd.LeftContext.ToPointer(), (sbyte)0, pd.N4H);
            }
        }

        /// <summary>
        /// Sets each plane's block size in 4-sample units (and its log2), derived from the
        /// block size in mode-info units and the plane's subsampling.
        /// </summary>
        private static void SetPlaneN4(ref MacroBlockD xd, int bw, int bh, int bwl, int bhl)
        {
            int i;
            for (i = 0; i < Constants.MaxMbPlane; i++)
            {
                xd.Plane[i].N4W = (ushort)((bw << 1) >> xd.Plane[i].SubsamplingX);
                xd.Plane[i].N4H = (ushort)((bh << 1) >> xd.Plane[i].SubsamplingY);
                xd.Plane[i].N4Wl = (byte)(bwl - xd.Plane[i].SubsamplingX);
                xd.Plane[i].N4Hl = (byte)(bhl - xd.Plane[i].SubsamplingY);
            }
        }

        /// <summary>
        /// Points <paramref name="xd"/> at the mode info of the block at
        /// (<paramref name="miRow"/>, <paramref name="miCol"/>), makes every visible grid
        /// entry covered by the block refer to it, and sets up plane sizes, contexts, edge
        /// distances and destination pointers.
        /// </summary>
        /// <returns>A reference to the block's <see cref="ModeInfo"/>.</returns>
        private static ref ModeInfo SetOffsets(
            ref Vp9Common cm,
            ref MacroBlockD xd,
            BlockSize bsize,
            int miRow,
            int miCol,
            int bw,
            int bh,
            int xMis,
            int yMis,
            int bwl,
            int bhl)
        {
            int offset = miRow * cm.MiStride + miCol;
            int x, y;
            ref TileInfo tile = ref xd.Tile;

            xd.Mi = cm.MiGridVisible.Slice(offset);
            xd.Mi[0] = new Ptr<ModeInfo>(ref cm.Mi[offset]);
            xd.Mi[0].Value.SbType = bsize;
            for (y = 0; y < yMis; ++y)
            {
                // Entry [0] was just set above, so the first row starts at x = 1.
                for (x = y == 0 ? 1 : 0; x < xMis; ++x)
                {
                    xd.Mi[y * cm.MiStride + x] = xd.Mi[0];
                }
            }

            SetPlaneN4(ref xd, bw, bh, bwl, bhl);

            xd.SetSkipContext(miRow, miCol);

            // Distance of Mb to the various image edges. These are specified to 8th pel
            // as they are always compared to values that are in 1/8th pel units
            xd.SetMiRowCol(ref tile, miRow, bh, miCol, bw, cm.MiRows, cm.MiCols);

            ReconInter.SetupDstPlanes(ref xd.Plane, ref xd.CurBuf, miRow, miCol);
            return ref xd.Mi[0].Value;
        }

        /// <summary>
        /// Decodes one block: reads its mode info, then either predicts and reconstructs it
        /// as intra, or builds the inter prediction and adds the residual. Finally records
        /// reader errors and builds the loop-filter mask when filtering is enabled.
        /// </summary>
        private static void DecodeBlock(
            ref TileWorkerData twd,
            ref Vp9Common cm,
            int miRow,
            int miCol,
            BlockSize bsize,
            int bwl,
            int bhl)
        {
            bool less8x8 = bsize < BlockSize.Block8x8;
            int bw = 1 << (bwl - 1);
            int bh = 1 << (bhl - 1);
            // Clip the block to the frame, in mode-info units.
            int xMis = Math.Min(bw, cm.MiCols - miCol);
            int yMis = Math.Min(bh, cm.MiRows - miRow);
            ref Reader r = ref twd.BitReader;
            ref MacroBlockD xd = ref twd.Xd;

            ref ModeInfo mi = ref SetOffsets(ref cm, ref xd, bsize, miRow, miCol, bw, bh, xMis, yMis, bwl, bhl);

            if (bsize >= BlockSize.Block8x8 && (cm.SubsamplingX != 0 || cm.SubsamplingY != 0))
            {
                BlockSize uvSubsize = Luts.SsSizeLookup[(int)bsize][cm.SubsamplingX][cm.SubsamplingY];
                if (uvSubsize == BlockSize.BlockInvalid)
                {
                    xd.ErrorInfo.Value.InternalError(CodecErr.CodecCorruptFrame, "Invalid block size.");
                }
            }

            DecodeMv.ReadModeInfo(ref twd, ref cm, miRow, miCol, xMis, yMis);

            if (mi.Skip != 0)
            {
                DecResetSkipContext(ref xd);
            }

            if (!mi.IsInterBlock())
            {
                int plane;
                for (plane = 0; plane < Constants.MaxMbPlane; ++plane)
                {
                    ref MacroBlockDPlane pd = ref xd.Plane[plane];
                    TxSize txSize = plane != 0 ? mi.GetUvTxSize(ref pd) : mi.TxSize;
                    int num4x4W = pd.N4W;
                    int num4x4H = pd.N4H;
                    int step = 1 << (int)txSize;
                    int row, col;
                    // A negative edge distance means the block extends past the frame;
                    // the shifted (negative) value trims the 4x4 count accordingly.
                    int maxBlocksWide = num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX));
                    int maxBlocksHigh = num4x4H + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY));

                    xd.MaxBlocksWide = (uint)(xd.MbToRightEdge >= 0 ? 0 : maxBlocksWide);
                    xd.MaxBlocksHigh = (uint)(xd.MbToBottomEdge >= 0 ? 0 : maxBlocksHigh);

                    for (row = 0; row < maxBlocksHigh; row += step)
                    {
                        for (col = 0; col < maxBlocksWide; col += step)
                        {
                            PredictAndReconstructIntraBlock(ref twd, ref mi, plane, row, col, txSize);
                        }
                    }
                }
            }
            else
            {
                // Prediction
                DecBuildInterPredictorsSb(ref cm, ref xd, miRow, miCol);

                // Reconstruction
                if (mi.Skip == 0)
                {
                    int eobtotal = 0;
                    int plane;

                    for (plane = 0; plane < Constants.MaxMbPlane; ++plane)
                    {
                        ref MacroBlockDPlane pd = ref xd.Plane[plane];
                        TxSize txSize = plane != 0 ? mi.GetUvTxSize(ref pd) : mi.TxSize;
                        int num4x4W = pd.N4W;
                        int num4x4H = pd.N4H;
                        int step = 1 << (int)txSize;
                        int row, col;
                        int maxBlocksWide = num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX));
                        int maxBlocksHigh = num4x4H + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY));

                        xd.MaxBlocksWide = (uint)(xd.MbToRightEdge >= 0 ? 0 : maxBlocksWide);
                        xd.MaxBlocksHigh = (uint)(xd.MbToBottomEdge >= 0 ? 0 : maxBlocksHigh);

                        for (row = 0; row < maxBlocksHigh; row += step)
                        {
                            for (col = 0; col < maxBlocksWide; col += step)
                            {
                                eobtotal += ReconstructInterBlock(ref twd, ref mi, plane, row, col, txSize);
                            }
                        }
                    }

                    if (!less8x8 && eobtotal == 0)
                    {
                        mi.Skip = 1; // Skip loopfilter
                    }
                }
            }

            xd.Corrupted |= r.HasError();

            if (cm.Lf.FilterLevel != 0)
            {
                LoopFilter.BuildMask(ref cm, ref mi, miRow, miCol, bw, bh);
            }
        }

        /// <summary>
        /// Computes the partition probability context from bit <paramref name="bsl"/> of the
        /// above and left segment contexts.
        /// </summary>
        private static int DecPartitionPlaneContext(ref TileWorkerData twd, int miRow, int miCol, int bsl)
        {
            ref sbyte aboveCtx = ref twd.Xd.AboveSegContext[miCol];
            ref sbyte leftCtx = ref twd.Xd.LeftSegContext[miRow & Constants.MiMask];
            int above = (aboveCtx >> bsl) & 1, left = (leftCtx >> bsl) & 1;

            return (left * 2 + above) + bsl * Constants.PartitionPloffset;
        }

        /// <summary>
        /// Writes the partition context values for <paramref name="subsize"/> into
        /// <paramref name="bw"/> entries of the above and left segment contexts.
        /// </summary>
        private static void DecUpdatePartitionContext(
            ref TileWorkerData twd,
            int miRow,
            int miCol,
            BlockSize subsize,
            int bw)
        {
            Span<sbyte> aboveCtx = twd.Xd.AboveSegContext.Slice(miCol).AsSpan();
            Span<sbyte> leftCtx = MemoryMarshal.CreateSpan(ref twd.Xd.LeftSegContext[miRow & Constants.MiMask], 8 - (miRow & Constants.MiMask));

            // Update the partition context at the end notes. Set partition bits
            // of block sizes larger than the current one to be one, and partition
            // bits of smaller block sizes to be zero.
            aboveCtx.Slice(0, bw).Fill(Luts.PartitionContextLookup[(int)subsize].Above);
            leftCtx.Slice(0, bw).Fill(Luts.PartitionContextLookup[(int)subsize].Left);
        }

        /// <summary>
        /// Reads the partition type of a block. When the block's lower or right half lies
        /// outside the frame, only the partition types that remain possible are considered
        /// (a single bit is read, or nothing at all when both halves are outside).
        /// </summary>
        private static PartitionType ReadPartition(
            ref TileWorkerData twd,
            int miRow,
            int miCol,
            int hasRows,
            int hasCols,
            int bsl)
        {
            int ctx = DecPartitionPlaneContext(ref twd, miRow, miCol, bsl);
            ReadOnlySpan<byte> probs = MemoryMarshal.CreateReadOnlySpan(ref twd.Xd.PartitionProbs[ctx][0], 3);
            PartitionType p;
            ref Reader r = ref twd.BitReader;

            if (hasRows != 0 && hasCols != 0)
            {
                p = (PartitionType)r.ReadTree(Luts.Vp9PartitionTree, probs);
            }
            else if (hasRows == 0 && hasCols != 0)
            {
                p = r.Read(probs[1]) != 0 ? PartitionType.PartitionSplit : PartitionType.PartitionHorz;
            }
            else if (hasRows != 0 && hasCols == 0)
            {
                p = r.Read(probs[2]) != 0 ? PartitionType.PartitionSplit : PartitionType.PartitionVert;
            }
            else
            {
                p = PartitionType.PartitionSplit;
            }

            if (!twd.Xd.Counts.IsNull)
            {
                ++twd.Xd.Counts.Value.Partition[ctx][(int)p];
            }

            return p;
        }

        /// <summary>
        /// Recursively decodes the partition tree rooted at (<paramref name="miRow"/>,
        /// <paramref name="miCol"/>): reads the partition type, decodes the resulting blocks
        /// (recursing on a split), then updates the partition context.
        /// </summary>
        private static void DecodePartition(
            ref TileWorkerData twd,
            ref Vp9Common cm,
            int miRow,
            int miCol,
            BlockSize bsize,
            int n4x4L2)
        {
            int n8x8L2 = n4x4L2 - 1;
            int num8x8Wh = 1 << n8x8L2;
            int hbs = num8x8Wh >> 1;
            PartitionType partition;
            BlockSize subsize;
            bool hasRows = (miRow + hbs) < cm.MiRows;
            bool hasCols = (miCol + hbs) < cm.MiCols;
            ref MacroBlockD xd = ref twd.Xd;

            // Nothing to decode for a block that starts outside the frame.
            if (miRow >= cm.MiRows || miCol >= cm.MiCols)
            {
                return;
            }

            partition = ReadPartition(ref twd, miRow, miCol, hasRows ? 1 : 0, hasCols ? 1 : 0, n8x8L2);
            subsize = Luts.SubsizeLookup[(int)partition][(int)bsize];
            if (hbs == 0)
            {
                // Calculate bmode block dimensions (log 2)
                xd.BmodeBlocksWl = (byte)(1 >> ((partition & PartitionType.PartitionVert) != 0 ? 1 : 0));
                xd.BmodeBlocksHl = (byte)(1 >> ((partition & PartitionType.PartitionHorz) != 0 ? 1 : 0));
                DecodeBlock(ref twd, ref cm, miRow, miCol, subsize, 1, 1);
            }
            else
            {
                switch (partition)
                {
                    case PartitionType.PartitionNone:
                        DecodeBlock(ref twd, ref cm, miRow, miCol, subsize, n4x4L2, n4x4L2);
                        break;
                    case PartitionType.PartitionHorz:
                        DecodeBlock(ref twd, ref cm, miRow, miCol, subsize, n4x4L2, n8x8L2);
                        if (hasRows)
                        {
                            DecodeBlock(ref twd, ref cm, miRow + hbs, miCol, subsize, n4x4L2, n8x8L2);
                        }

                        break;
                    case PartitionType.PartitionVert:
                        DecodeBlock(ref twd, ref cm, miRow, miCol, subsize, n8x8L2, n4x4L2);
                        if (hasCols)
                        {
                            DecodeBlock(ref twd, ref cm, miRow, miCol + hbs, subsize, n8x8L2, n4x4L2);
                        }

                        break;
                    case PartitionType.PartitionSplit:
                        DecodePartition(ref twd, ref cm, miRow, miCol, subsize, n8x8L2);
                        DecodePartition(ref twd, ref cm, miRow, miCol + hbs, subsize, n8x8L2);
                        DecodePartition(ref twd, ref cm, miRow + hbs, miCol, subsize, n8x8L2);
                        DecodePartition(ref twd, ref cm, miRow + hbs, miCol + hbs, subsize, n8x8L2);
                        break;
                    default: Debug.Assert(false, "Invalid partition type"); break;
                }
            }

            // Update partition context
            if (bsize >= BlockSize.Block8x8 && (bsize == BlockSize.Block8x8 || partition != PartitionType.PartitionSplit))
            {
                DecUpdatePartitionContext(ref twd, miRow, miCol, subsize, num8x8Wh);
            }
        }

        /// <summary>
        /// Validates the tile's byte range and initialises the bool decoder
        /// <paramref name="r"/> over it, reporting failures through <paramref name="errorInfo"/>.
        /// </summary>
        private static void SetupTokenDecoder(
            ArrayPtr<byte> data,
            int readSize,
            ref InternalErrorInfo errorInfo,
            ref Reader r)
        {
            // Validate the calculated partition length. If the buffer described by the
            // partition can't be fully read then throw an error.
            if (!ReadIsValid(data, readSize))
            {
                errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile length");
            }

            // A true result from Init is treated as failure here.
            if (r.Init(data, readSize))
            {
                errorInfo.InternalError(CodecErr.CodecMemError, "Failed to allocate bool decoder 1");
            }
        }

        // Reads the next tile returning its size and adjusting '*data' accordingly
        // based on 'isLast'.
+        // Carves one tile's byte range out of 'data' and stores it in 'buf'.
+        //
+        // isLast:    when false, the tile is preceded by a 4-byte big-endian size field;
+        //            when true, there is no size field and the tile takes all remaining bytes.
+        // errorInfo: receives CodecCorruptFrame if the size field cannot be read or the
+        //            declared size does not fit in the remaining data.
+        // data:      advanced past the size field (if any) and the tile payload.
+        // buf:       Data/Size are set to the tile payload; Col is not touched here.
+        private static void GetTileBuffer(
+            bool isLast,
+            ref InternalErrorInfo errorInfo,
+            ref ArrayPtr<byte> data,
+            ref TileBuffer buf)
+        {
+            int size;
+
+            if (!isLast)
+            {
+                if (!ReadIsValid(data, 4))
+                {
+                    errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile length");
+                }
+
+                size = BinaryPrimitives.ReadInt32BigEndian(data.AsSpan());
+                data = data.Slice(4);
+
+                // The size field is unsigned in the bitstream. Read as a signed int, any value
+                // with the top bit set turns negative and would slip past a plain
+                // 'size > data.Length' test, so negative sizes are rejected explicitly.
+                if (size < 0 || size > data.Length)
+                {
+                    errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile size");
+                }
+            }
+            else
+            {
+                size = data.Length;
+            }
+
+            buf.Data = data;
+            buf.Size = size;
+
+            data = data.Slice(size);
+        }
+
+        // Single-row variant: fills tileBuffers[0..tileCols) from consecutive tiles in 'data'
+        // and records each buffer's column index in Col. Only the final column is treated
+        // as the last tile (no size prefix).
+        private static void GetTileBuffers(ref Vp9Common cm, ArrayPtr<byte> data, int tileCols, ref Array64<TileBuffer> tileBuffers)
+        {
+            int c;
+
+            for (c = 0; c < tileCols; ++c)
+            {
+                bool isLast = c == tileCols - 1;
+                ref TileBuffer buf = ref tileBuffers[c];
+                buf.Col = c;
+                GetTileBuffer(isLast, ref cm.Error, ref data, ref buf);
+            }
+        }
+
+        // Row/column variant: fills tileBuffers[r][c] in row-major order. Only the tile in the
+        // final row and final column is treated as the last tile. Unlike the single-row
+        // overload, this one does not assign TileBuffer.Col.
+        private static void GetTileBuffers(
+            ref Vp9Common cm,
+            ArrayPtr<byte> data,
+            int tileCols,
+            int tileRows,
+            ref Array4<Array64<TileBuffer>> tileBuffers)
+        {
+            int r, c;
+
+            for (r = 0; r < tileRows; ++r)
+            {
+                for (c = 0; c < tileCols; ++c)
+                {
+                    bool isLast = (r == tileRows - 1) && (c == tileCols - 1);
+                    ref TileBuffer buf = ref tileBuffers[r][c];
+                    GetTileBuffer(isLast, ref cm.Error, ref data, ref buf);
+                }
+            }
+        }
+
+        public static unsafe ArrayPtr<byte> DecodeTiles(ref Vp9Common cm, ArrayPtr<byte> data)
+        {
+            int alignedCols = TileInfo.MiColsAlignedToSb(cm.MiCols);
+            int tileCols = 1 << cm.Log2TileCols;
+            int tileRows = 1 << cm.Log2TileRows;
+            Array4<Array64<TileBuffer>> tileBuffers = new Array4<Array64<TileBuffer>>();
+            int tileRow, tileCol;
+            int miRow, miCol;
+
+            Debug.Assert(tileRows <= 4);
+            Debug.Assert(tileCols <= (1 << 6));
+
+            // Note: this memset assumes above_context[0], [1] and [2]
+            // are allocated as part of the same buffer.
+ MemoryUtil.Fill(cm.AboveContext.ToPointer(), (sbyte)0, Constants.MaxMbPlane * 2 * alignedCols); + MemoryUtil.Fill(cm.AboveSegContext.ToPointer(), (sbyte)0, alignedCols); + + LoopFilter.ResetLfm(ref cm); + + GetTileBuffers(ref cm, data, tileCols, tileRows, ref tileBuffers); + // Load all tile information into tile_data. + for (tileRow = 0; tileRow < tileRows; ++tileRow) + { + for (tileCol = 0; tileCol < tileCols; ++tileCol) + { + ref TileBuffer buf = ref tileBuffers[tileRow][tileCol]; + ref TileWorkerData tileData = ref cm.TileWorkerData[tileCols * tileRow + tileCol]; + tileData.Xd = cm.Mb; + tileData.Xd.Corrupted = false; + tileData.Xd.Counts = cm.Counts; + tileData.Dqcoeff = new Array32<Array32<int>>(); + tileData.Xd.Tile.Init(ref cm, tileRow, tileCol); + SetupTokenDecoder(buf.Data, buf.Size, ref cm.Error, ref tileData.BitReader); + cm.InitMacroBlockD(ref tileData.Xd, new ArrayPtr<int>(ref tileData.Dqcoeff[0][0], 32 * 32)); + } + } + + for (tileRow = 0; tileRow < tileRows; ++tileRow) + { + TileInfo tile = new TileInfo(); + tile.SetRow(ref cm, tileRow); + for (miRow = tile.MiRowStart; miRow < tile.MiRowEnd; miRow += Constants.MiBlockSize) + { + for (tileCol = 0; tileCol < tileCols; ++tileCol) + { + int col = tileCol; + ref TileWorkerData tileData = ref cm.TileWorkerData[tileCols * tileRow + col]; + tile.SetCol(ref cm, col); + tileData.Xd.LeftContext = new Array3<Array16<sbyte>>(); + tileData.Xd.LeftSegContext = new Array8<sbyte>(); + for (miCol = tile.MiColStart; miCol < tile.MiColEnd; miCol += Constants.MiBlockSize) + { + DecodePartition(ref tileData, ref cm, miRow, miCol, BlockSize.Block64x64, 4); + } + cm.Mb.Corrupted |= tileData.Xd.Corrupted; + if (cm.Mb.Corrupted) + { + cm.Error.InternalError(CodecErr.CodecCorruptFrame, "Failed to decode tile data"); + } + } + } + } + + // Get last tile data. 
+ return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd(); + } + + private static bool DecodeTileCol(ref TileWorkerData tileData, ref Vp9Common cm, ref Array64<TileBuffer> tileBuffers) + { + ref TileInfo tile = ref tileData.Xd.Tile; + int finalCol = (1 << cm.Log2TileCols) - 1; + ArrayPtr<byte> bitReaderEnd = ArrayPtr<byte>.Null; + + int n = tileData.BufStart; + + tileData.Xd.Corrupted = false; + + do + { + ref TileBuffer buf = ref tileBuffers[n]; + + Debug.Assert(cm.Log2TileRows == 0); + tileData.Dqcoeff = new Array32<Array32<int>>(); + tile.Init(ref cm, 0, buf.Col); + SetupTokenDecoder(buf.Data, buf.Size, ref tileData.ErrorInfo, ref tileData.BitReader); + cm.InitMacroBlockD(ref tileData.Xd, new ArrayPtr<int>(ref tileData.Dqcoeff[0][0], 32 * 32)); + tileData.Xd.ErrorInfo = new Ptr<InternalErrorInfo>(ref tileData.ErrorInfo); + + for (int miRow = tile.MiRowStart; miRow < tile.MiRowEnd; miRow += Constants.MiBlockSize) + { + tileData.Xd.LeftContext = new Array3<Array16<sbyte>>(); + tileData.Xd.LeftSegContext = new Array8<sbyte>(); + for (int miCol = tile.MiColStart; miCol < tile.MiColEnd; miCol += Constants.MiBlockSize) + { + DecodePartition(ref tileData, ref cm, miRow, miCol, BlockSize.Block64x64, 4); + } + } + + if (buf.Col == finalCol) + { + bitReaderEnd = tileData.BitReader.FindEnd(); + } + } while (!tileData.Xd.Corrupted && ++n <= tileData.BufEnd); + + tileData.DataEnd = bitReaderEnd; + return !tileData.Xd.Corrupted; + } + + public static unsafe ArrayPtr<byte> DecodeTilesMt(ref Vp9Common cm, ArrayPtr<byte> data, int maxThreads) + { + ArrayPtr<byte> bitReaderEnd = ArrayPtr<byte>.Null; + + int tileCols = 1 << cm.Log2TileCols; + int tileRows = 1 << cm.Log2TileRows; + int totalTiles = tileCols * tileRows; + int numWorkers = Math.Min(maxThreads, tileCols); + int n; + + Debug.Assert(tileCols <= (1 << 6)); + Debug.Assert(tileRows == 1); + + cm.AboveContext.AsSpan().Fill(0); + cm.AboveSegContext.AsSpan().Fill(0); + + for (n = 0; n < numWorkers; ++n) + { + 
ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles]; + + tileData.Xd = cm.Mb; + tileData.Xd.Counts = new Ptr<Vp9BackwardUpdates>(ref tileData.Counts); + tileData.Counts = new Vp9BackwardUpdates(); + } + + Array64<TileBuffer> tileBuffers = new Array64<TileBuffer>(); + + GetTileBuffers(ref cm, data, tileCols, ref tileBuffers); + + tileBuffers.AsSpan().Slice(0, tileCols).Sort(CompareTileBuffers); + + if (numWorkers == tileCols) + { + TileBuffer largest = tileBuffers[0]; + Span<TileBuffer> buffers = tileBuffers.AsSpan(); + buffers.Slice(1).CopyTo(buffers.Slice(0, tileBuffers.Length - 1)); + tileBuffers[tileCols - 1] = largest; + } + else + { + int start = 0, end = tileCols - 2; + TileBuffer tmp; + + // Interleave the tiles to distribute the load between threads, assuming a + // larger tile implies it is more difficult to decode. + while (start < end) + { + tmp = tileBuffers[start]; + tileBuffers[start] = tileBuffers[end]; + tileBuffers[end] = tmp; + start += 2; + end -= 2; + } + } + + int baseVal = tileCols / numWorkers; + int remain = tileCols % numWorkers; + int bufStart = 0; + + for (n = 0; n < numWorkers; ++n) + { + int count = baseVal + (remain + n) / numWorkers; + ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles]; + + tileData.BufStart = bufStart; + tileData.BufEnd = bufStart + count - 1; + tileData.DataEnd = data.Slice(data.Length); + bufStart += count; + } + + Ptr<Vp9Common> cmPtr = new Ptr<Vp9Common>(ref cm); + + Parallel.For(0, numWorkers, (n) => + { + ref TileWorkerData tileData = ref cmPtr.Value.TileWorkerData[n + totalTiles]; + + if (!DecodeTileCol(ref tileData, ref cmPtr.Value, ref tileBuffers)) + { + cmPtr.Value.Mb.Corrupted = true; + } + }); + + for (; n > 0; --n) + { + if (bitReaderEnd.IsNull) + { + ref TileWorkerData tileData = ref cm.TileWorkerData[n - 1 + totalTiles]; + bitReaderEnd = tileData.DataEnd; + } + } + + for (n = 0; n < numWorkers; ++n) + { + ref TileWorkerData tileData = ref cm.TileWorkerData[n + 
totalTiles]; + AccumulateFrameCounts(ref cm.Counts.Value, ref tileData.Counts); + } + + Debug.Assert(!bitReaderEnd.IsNull || cm.Mb.Corrupted); + return bitReaderEnd; + } + + private static int CompareTileBuffers(TileBuffer bufA, TileBuffer bufB) + { + return (bufA.Size < bufB.Size ? 1 : 0) - (bufA.Size > bufB.Size ? 1 : 0); + } + + private static void AccumulateFrameCounts(ref Vp9BackwardUpdates accum, ref Vp9BackwardUpdates counts) + { + Span<uint> a = MemoryMarshal.Cast<Vp9BackwardUpdates, uint>(MemoryMarshal.CreateSpan(ref accum, 1)); + Span<uint> c = MemoryMarshal.Cast<Vp9BackwardUpdates, uint>(MemoryMarshal.CreateSpan(ref counts, 1)); + + for (int i = 0; i < a.Length; i++) + { + a[i] += c[i]; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs new file mode 100644 index 00000000..3281905c --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs @@ -0,0 +1,1160 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Dsp; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using Ryujinx.Graphics.Video; +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv; +using MvRef = Ryujinx.Graphics.Nvdec.Vp9.Types.MvRef; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class DecodeMv + { + private const int MvrefNeighbours = 8; + + private static PredictionMode ReadIntraMode(ref Reader r, ReadOnlySpan<byte> p) + { + return (PredictionMode)r.ReadTree(Luts.Vp9IntraModeTree, p); + } + + private static PredictionMode ReadIntraModeY(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, int sizeGroup) + { + PredictionMode yMode = ReadIntraMode(ref r, cm.Fc.Value.YModeProb[sizeGroup].AsSpan()); + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.YMode[sizeGroup][(int)yMode]; + } + + return yMode; + } + + private static PredictionMode ReadIntraModeUv(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, byte yMode) + { + 
PredictionMode uvMode = ReadIntraMode(ref r, cm.Fc.Value.UvModeProb[yMode].AsSpan()); + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.UvMode[yMode][(int)uvMode]; + } + + return uvMode; + } + + private static PredictionMode ReadInterMode(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, int ctx) + { + int mode = r.ReadTree(Luts.Vp9InterModeTree, cm.Fc.Value.InterModeProb[ctx].AsSpan()); + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.InterMode[ctx][mode]; + } + + return PredictionMode.NearestMv + mode; + } + + private static int ReadSegmentId(ref Reader r, ref Array7<byte> segTreeProbs) + { + return r.ReadTree(Luts.Vp9SegmentTree, segTreeProbs.AsSpan()); + } + + private static ReadOnlySpan<byte> GetTxProbs(ref Vp9EntropyProbs fc, TxSize maxTxSize, int ctx) + { + switch (maxTxSize) + { + case TxSize.Tx8x8: return fc.Tx8x8Prob[ctx].AsSpan(); + case TxSize.Tx16x16: return fc.Tx16x16Prob[ctx].AsSpan(); + case TxSize.Tx32x32: return fc.Tx32x32Prob[ctx].AsSpan(); + default: Debug.Assert(false, "Invalid maxTxSize."); return ReadOnlySpan<byte>.Empty; + } + } + + private static Span<uint> GetTxCounts(ref Vp9BackwardUpdates counts, TxSize maxTxSize, int ctx) + { + switch (maxTxSize) + { + case TxSize.Tx8x8: return counts.Tx8x8[ctx].AsSpan(); + case TxSize.Tx16x16: return counts.Tx16x16[ctx].AsSpan(); + case TxSize.Tx32x32: return counts.Tx32x32[ctx].AsSpan(); + default: Debug.Assert(false, "Invalid maxTxSize."); return Span<uint>.Empty; + } + } + + private static TxSize ReadSelectedTxSize(ref Vp9Common cm, ref MacroBlockD xd, TxSize maxTxSize, ref Reader r) + { + int ctx = xd.GetTxSizeContext(); + ReadOnlySpan<byte> txProbs = GetTxProbs(ref cm.Fc.Value, maxTxSize, ctx); + TxSize txSize = (TxSize)r.Read(txProbs[0]); + if (txSize != TxSize.Tx4x4 && maxTxSize >= TxSize.Tx16x16) + { + txSize += r.Read(txProbs[1]); + if (txSize != TxSize.Tx8x8 && maxTxSize >= TxSize.Tx32x32) + { + txSize += r.Read(txProbs[2]); + } + } + + if (!xd.Counts.IsNull) + { + ++GetTxCounts(ref 
xd.Counts.Value, maxTxSize, ctx)[(int)txSize]; + } + + return txSize; + } + + private static TxSize ReadTxSize(ref Vp9Common cm, ref MacroBlockD xd, bool allowSelect, ref Reader r) + { + TxMode txMode = cm.TxMode; + BlockSize bsize = xd.Mi[0].Value.SbType; + TxSize maxTxSize = Luts.MaxTxSizeLookup[(int)bsize]; + if (allowSelect && txMode == TxMode.TxModeSelect && bsize >= BlockSize.Block8x8) + { + return ReadSelectedTxSize(ref cm, ref xd, maxTxSize, ref r); + } + else + { + return (TxSize)Math.Min((int)maxTxSize, (int)Luts.TxModeToBiggestTxSize[(int)txMode]); + } + } + + private static int DecGetSegmentId(ref Vp9Common cm, ArrayPtr<byte> segmentIds, int miOffset, int xMis, int yMis) + { + int x, y, segmentId = int.MaxValue; + + for (y = 0; y < yMis; y++) + { + for (x = 0; x < xMis; x++) + { + segmentId = Math.Min(segmentId, segmentIds[miOffset + y * cm.MiCols + x]); + } + } + + Debug.Assert(segmentId >= 0 && segmentId < Constants.MaxSegments); + return segmentId; + } + + private static void SetSegmentId(ref Vp9Common cm, int miOffset, int xMis, int yMis, int segmentId) + { + int x, y; + + Debug.Assert(segmentId >= 0 && segmentId < Constants.MaxSegments); + + for (y = 0; y < yMis; y++) + { + for (x = 0; x < xMis; x++) + { + cm.CurrentFrameSegMap[miOffset + y * cm.MiCols + x] = (byte)segmentId; + } + } + } + + private static void CopySegmentId( + ref Vp9Common cm, + ArrayPtr<byte> lastSegmentIds, + ArrayPtr<byte> currentSegmentIds, + int miOffset, + int xMis, + int yMis) + { + int x, y; + + for (y = 0; y < yMis; y++) + { + for (x = 0; x < xMis; x++) + { + currentSegmentIds[miOffset + y * cm.MiCols + x] = (byte)(!lastSegmentIds.IsNull ? 
lastSegmentIds[miOffset + y * cm.MiCols + x] : 0); + } + } + } + + private static int ReadIntraSegmentId(ref Vp9Common cm, int miOffset, int xMis, int yMis, ref Reader r) + { + ref Segmentation seg = ref cm.Seg; + int segmentId; + + if (!seg.Enabled) + { + return 0; // Default for disabled segmentation + } + + if (!seg.UpdateMap) + { + CopySegmentId(ref cm, cm.LastFrameSegMap, cm.CurrentFrameSegMap, miOffset, xMis, yMis); + return 0; + } + + segmentId = ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb); + SetSegmentId(ref cm, miOffset, xMis, yMis, segmentId); + return segmentId; + } + + private static int ReadInterSegmentId( + ref Vp9Common cm, + ref MacroBlockD xd, + int miRow, + int miCol, + ref Reader r, + int xMis, + int yMis) + { + ref Segmentation seg = ref cm.Seg; + ref ModeInfo mi = ref xd.Mi[0].Value; + int predictedSegmentId, segmentId; + int miOffset = miRow * cm.MiCols + miCol; + + if (!seg.Enabled) + { + return 0; // Default for disabled segmentation + } + + predictedSegmentId = !cm.LastFrameSegMap.IsNull + ? DecGetSegmentId(ref cm, cm.LastFrameSegMap, miOffset, xMis, yMis) + : 0; + + if (!seg.UpdateMap) + { + CopySegmentId(ref cm, cm.LastFrameSegMap, cm.CurrentFrameSegMap, miOffset, xMis, yMis); + return predictedSegmentId; + } + + if (seg.TemporalUpdate) + { + byte predProb = Segmentation.GetPredProbSegId(ref cm.Fc.Value.SegPredProb, ref xd); + mi.SegIdPredicted = (sbyte)r.Read(predProb); + segmentId = mi.SegIdPredicted != 0 ? 
predictedSegmentId : ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb); + } + else + { + segmentId = ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb); + } + SetSegmentId(ref cm, miOffset, xMis, yMis, segmentId); + return segmentId; + } + + private static int ReadSkip(ref Vp9Common cm, ref MacroBlockD xd, int segmentId, ref Reader r) + { + if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlSkip) != 0) + { + return 1; + } + else + { + int ctx = xd.GetSkipContext(); + int skip = r.Read(cm.Fc.Value.SkipProb[ctx]); + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.Skip[ctx][skip]; + } + + return skip; + } + } + + private static int ReadMvComponent(ref Reader r, ref Vp9EntropyProbs fc, int mvcomp, bool usehp) + { + int mag, d, fr, hp; + bool sign = r.Read(fc.Sign[mvcomp]) != 0; + MvClassType mvClass = (MvClassType)r.ReadTree(Luts.Vp9MvClassTree, fc.Classes[mvcomp].AsSpan()); + bool class0 = mvClass == MvClassType.MvClass0; + + // Integer part + if (class0) + { + d = r.Read(fc.Class0[mvcomp][0]); + mag = 0; + } + else + { + int i; + int n = (int)mvClass + Constants.Class0Bits - 1; // Number of bits + + d = 0; + for (i = 0; i < n; ++i) + { + d |= r.Read(fc.Bits[mvcomp][i]) << i; + } + + mag = Constants.Class0Size << ((int)mvClass + 2); + } + + // Fractional part + fr = r.ReadTree(Luts.Vp9MvFPTree, class0 ? fc.Class0Fp[mvcomp][d].AsSpan() : fc.Fp[mvcomp].AsSpan()); + + // High precision part (if hp is not used, the default value of the hp is 1) + hp = usehp ? r.Read(class0 ? fc.Class0Hp[mvcomp] : fc.Hp[mvcomp]) : 1; + + // Result + mag += ((d << 3) | (fr << 1) | hp) + 1; + return sign ? 
-mag : mag;
+        }
+
+        // Reads a motion vector difference from the bitstream and stores refr + diff in 'mv'.
+        // The joint type read first decides which components are coded: the row component is
+        // read when MvJointVertical(jointType) holds, the column component when
+        // MvJointHorizontal(jointType) holds; a component that is not coded stays at the
+        // default value of a new Mv. High precision is used only if both 'allowHP' and
+        // refr.UseMvHp() are true. The difference is passed to IncMv together with 'counts'.
+        private static void ReadMv(
+            ref Reader r,
+            ref Mv mv,
+            ref Mv refr,
+            ref Vp9EntropyProbs fc,
+            Ptr<Vp9BackwardUpdates> counts,
+            bool allowHP)
+        {
+            MvJointType jointType = (MvJointType)r.ReadTree(Luts.Vp9MvJointTree, fc.Joints.AsSpan());
+            bool useHP = allowHP && refr.UseMvHp();
+            Mv diff = new Mv();
+
+            if (Mv.MvJointVertical(jointType))
+            {
+                diff.Row = (short)ReadMvComponent(ref r, ref fc, 0, useHP);
+            }
+
+            if (Mv.MvJointHorizontal(jointType))
+            {
+                diff.Col = (short)ReadMvComponent(ref r, ref fc, 1, useHP);
+            }
+
+            diff.IncMv(counts);
+
+            mv.Row = (short)(refr.Row + diff.Row);
+            mv.Col = (short)(refr.Col + diff.Col);
+        }
+
+        // Returns the reference mode for the current block. When the frame-level mode is
+        // ReferenceModeSelect the choice is read from the bitstream (one bit, context taken
+        // from PredCommon.GetReferenceModeContext) and tallied in xd.Counts when counts are
+        // present; otherwise the frame-level mode is returned without reading anything.
+        private static ReferenceMode ReadBlockReferenceMode(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r)
+        {
+            if (cm.ReferenceMode == ReferenceMode.ReferenceModeSelect)
+            {
+                int ctx = PredCommon.GetReferenceModeContext(ref cm, ref xd);
+                ReferenceMode mode = (ReferenceMode)r.Read(cm.Fc.Value.CompInterProb[ctx]);
+                if (!xd.Counts.IsNull)
+                {
+                    ++xd.Counts.Value.CompInter[ctx][(int)mode];
+                }
+
+                return mode; // SingleReference or CompoundReference
+            }
+            else
+            {
+                return cm.ReferenceMode;
+            }
+        }
+
+        // Read the reference frame(s) of the current block into 'refFrame'.
+        private static void ReadRefFrames(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            ref Reader r,
+            int segmentId,
+            ref Array2<sbyte> refFrame)
+        {
+            ref Vp9EntropyProbs fc = ref cm.Fc.Value;
+
+            // A segment-level reference frame feature overrides anything coded per block.
+            if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlRefFrame) != 0)
+            {
+                refFrame[0] = (sbyte)cm.Seg.GetSegData(segmentId, SegLvlFeatures.SegLvlRefFrame);
+                refFrame[1] = Constants.None;
+            }
+            else
+            {
+                ReferenceMode mode = ReadBlockReferenceMode(ref cm, ref xd, ref r);
+                if (mode == ReferenceMode.CompoundReference)
+                {
+                    // The sign bias of the fixed reference selects which slot it occupies.
+                    int idx = cm.RefFrameSignBias[cm.CompFixedRef];
+                    int ctx = PredCommon.GetPredContextCompRefP(ref cm, ref xd);
+                    int bit = r.Read(fc.CompRefProb[ctx]);
+                    if (!xd.Counts.IsNull)
+                    {
+                        ++xd.Counts.Value.CompRef[ctx][bit];
+                    }
+
+                    refFrame[idx] = cm.CompFixedRef;
refFrame[idx == 0 ? 1 : 0] = cm.CompVarRef[bit]; + } + else if (mode == ReferenceMode.SingleReference) + { + int ctx0 = PredCommon.GetPredContextSingleRefP1(ref xd); + int bit0 = r.Read(fc.SingleRefProb[ctx0][0]); + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.SingleRef[ctx0][0][bit0]; + } + + if (bit0 != 0) + { + int ctx1 = PredCommon.GetPredContextSingleRefP2(ref xd); + int bit1 = r.Read(fc.SingleRefProb[ctx1][1]); + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.SingleRef[ctx1][1][bit1]; + } + + refFrame[0] = (sbyte)(bit1 != 0 ? Constants.AltRefFrame : Constants.GoldenFrame); + } + else + { + refFrame[0] = Constants.LastFrame; + } + + refFrame[1] = Constants.None; + } + else + { + Debug.Assert(false, "Invalid prediction mode."); + } + } + } + + private static byte ReadSwitchableInterpFilter(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r) + { + int ctx = xd.GetPredContextSwitchableInterp(); + byte type = (byte)r.ReadTree(Luts.Vp9SwitchableInterpTree, cm.Fc.Value.SwitchableInterpProb[ctx].AsSpan()); + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.SwitchableInterp[ctx][type]; + } + + return type; + } + + private static void ReadIntraBlockModeInfo(ref Vp9Common cm, ref MacroBlockD xd, ref ModeInfo mi, ref Reader r) + { + BlockSize bsize = mi.SbType; + int i; + + switch (bsize) + { + case BlockSize.Block4x4: + for (i = 0; i < 4; ++i) + { + mi.Bmi[i].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0); + } + + mi.Mode = mi.Bmi[3].Mode; + break; + case BlockSize.Block4x8: + mi.Bmi[0].Mode = mi.Bmi[2].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0); + mi.Bmi[1].Mode = mi.Bmi[3].Mode = mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0); + break; + case BlockSize.Block8x4: + mi.Bmi[0].Mode = mi.Bmi[1].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0); + mi.Bmi[2].Mode = mi.Bmi[3].Mode = mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0); + break; + default: mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, Luts.SizeGroupLookup[(int)bsize]); break; + } + + mi.UvMode = 
ReadIntraModeUv(ref cm, ref xd, ref r, (byte)mi.Mode); + + // Initialize interp_filter here so we do not have to check for inter block + // modes in GetPredContextSwitchableInterp() + mi.InterpFilter = Constants.SwitchableFilters; + + mi.RefFrame[0] = Constants.IntraFrame; + mi.RefFrame[1] = Constants.None; + } + + private static bool IsMvValid(ref Mv mv) + { + return mv.Row > Constants.MvLow && + mv.Row < Constants.MvUpp && + mv.Col > Constants.MvLow && + mv.Col < Constants.MvUpp; + } + + private static void CopyMvPair(ref Array2<Mv> dst, ref Array2<Mv> src) + { + dst[0] = src[0]; + dst[1] = src[1]; + } + + private static void ZeroMvPair(ref Array2<Mv> dst) + { + dst[0] = new Mv(); + dst[1] = new Mv(); + } + + private static bool AssignMv( + ref Vp9Common cm, + ref MacroBlockD xd, + PredictionMode mode, + ref Array2<Mv> mv, + ref Array2<Mv> refMv, + ref Array2<Mv> nearNearestMv, + int isCompound, + bool allowHP, + ref Reader r) + { + int i; + bool ret = true; + + switch (mode) + { + case PredictionMode.NewMv: + { + for (i = 0; i < 1 + isCompound; ++i) + { + ReadMv(ref r, ref mv[i], ref refMv[i], ref cm.Fc.Value, xd.Counts, allowHP); + ret = ret && IsMvValid(ref mv[i]); + } + break; + } + case PredictionMode.NearMv: + case PredictionMode.NearestMv: + { + CopyMvPair(ref mv, ref nearNearestMv); + break; + } + case PredictionMode.ZeroMv: + { + ZeroMvPair(ref mv); + break; + } + default: return false; + } + return ret; + } + + private static bool ReadIsInterBlock(ref Vp9Common cm, ref MacroBlockD xd, int segmentId, ref Reader r) + { + if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlRefFrame) != 0) + { + return cm.Seg.GetSegData(segmentId, SegLvlFeatures.SegLvlRefFrame) != Constants.IntraFrame; + } + else + { + int ctx = xd.GetIntraInterContext(); + bool isInter = r.Read(cm.Fc.Value.IntraInterProb[ctx]) != 0; + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.IntraInter[ctx][isInter ? 
1 : 0];
+                }
+
+                return isInter;
+            }
+        }
+
+        // Lowers the precision of the first 'refmvCount' candidates in 'mvlist' (via
+        // LowerMvPrecision with 'allowHP') and leaves 'bestMv' set to the last candidate
+        // processed. When 'refmvCount' is 0 nothing is touched and 'bestMv' keeps its value.
+        private static void DecFindBestRefMvs(bool allowHP, Span<Mv> mvlist, ref Mv bestMv, int refmvCount)
+        {
+            int i;
+
+            // Make sure all the candidates are properly clamped etc
+            // NOTE(review): the only adjustment made here is LowerMvPrecision; no clamp is
+            // applied in this loop.
+            for (i = 0; i < refmvCount; ++i)
+            {
+                mvlist[i].LowerMvPrecision(allowHP);
+                bestMv = mvlist[i];
+            }
+        }
+
+        // Offers 'mv' as a candidate for 'mvRefList' and returns true when the caller should
+        // stop searching.
+        //  - List empty: 'mv' is stored as the first entry; returns 'earlyBreak'.
+        //  - List non-empty and 'mv' differs from entry 0: 'mv' is appended and true is returned.
+        //  - List non-empty and 'mv' equals entry 0: nothing is stored; returns false.
+        // Equality is tested by reinterpreting each Mv as an int.
+        // NOTE(review): presumably Mv is exactly 4 bytes (two 16-bit components) - confirm.
+        private static bool AddMvRefListEb(Mv mv, ref int refMvCount, Span<Mv> mvRefList, bool earlyBreak)
+        {
+            if (refMvCount != 0)
+            {
+                if (Unsafe.As<Mv, int>(ref mv) != Unsafe.As<Mv, int>(ref mvRefList[0]))
+                {
+                    mvRefList[refMvCount] = mv;
+                    refMvCount++;
+                    return true;
+                }
+            }
+            else
+            {
+                mvRefList[refMvCount++] = mv;
+                if (earlyBreak)
+                {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        // Performs mv sign inversion if indicated by the reference frame combination.
+        // Returns a copy of mi.Mv[refr]; both components are negated when the sign bias of
+        // the candidate's reference frame differs from that of 'thisRefFrame'.
+        private static Mv ScaleMv(ref ModeInfo mi, int refr, sbyte thisRefFrame, ref Array4<sbyte> refSignBias)
+        {
+            Mv mv = mi.Mv[refr];
+            if (refSignBias[mi.RefFrame[refr]] != refSignBias[thisRefFrame])
+            {
+                mv.Row *= -1;
+                mv.Col *= -1;
+            }
+            return mv;
+        }
+
+        // For an inter-coded candidate block, offers its motion vectors that point to a
+        // reference frame other than 'refFrame' (sign-adjusted by ScaleMv) to the candidate
+        // list. The second vector is only offered when the block has a second reference and
+        // that vector differs from the first. Returns true as soon as AddMvRefListEb signals
+        // that the search should stop; returns false otherwise, including for non-inter blocks.
+        private static bool IsDiffRefFrameAddMvEb(
+            ref ModeInfo mbmi,
+            sbyte refFrame,
+            ref Array4<sbyte> refSignBias,
+            ref int refmvCount,
+            Span<Mv> mvRefList,
+            bool earlyBreak)
+        {
+            if (mbmi.IsInterBlock())
+            {
+                if (mbmi.RefFrame[0] != refFrame)
+                {
+                    if (AddMvRefListEb(ScaleMv(ref mbmi, 0, refFrame, ref refSignBias), ref refmvCount, mvRefList, earlyBreak))
+                    {
+                        return true;
+                    }
+                }
+                if (mbmi.HasSecondRef() && mbmi.RefFrame[1] != refFrame && Unsafe.As<Mv, int>(ref mbmi.Mv[1]) != Unsafe.As<Mv, int>(ref mbmi.Mv[0]))
+                {
+                    if (AddMvRefListEb(ScaleMv(ref mbmi, 1, refFrame, ref refSignBias), ref refmvCount, mvRefList, earlyBreak))
+                    {
+                        return true;
+                    }
+                }
+
+            }
+            return false;
+        }
+
+        // This function searches the neighborhood of a given MB/SB
+        // to try and find candidate reference vectors.
+ private static unsafe int DecFindMvRefs( + ref Vp9Common cm, + ref MacroBlockD xd, + PredictionMode mode, + sbyte refFrame, + Span<Position> mvRefSearch, + Span<Mv> mvRefList, + int miRow, + int miCol, + int block, + int isSub8X8) + { + ref Array4<sbyte> refSignBias = ref cm.RefFrameSignBias; + int i, refmvCount = 0; + bool differentRefFound = false; + Ptr<MvRef> prevFrameMvs = cm.UsePrevFrameMvs ? new Ptr<MvRef>(ref cm.PrevFrameMvs[miRow * cm.MiCols + miCol]) : Ptr<MvRef>.Null; + ref TileInfo tile = ref xd.Tile; + // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop + // searching after the first mv is found. + bool earlyBreak = mode != PredictionMode.NearMv; + + // Blank the reference vector list + mvRefList.Slice(0, Constants.MaxMvRefCandidates).Fill(new Mv()); + + i = 0; + if (isSub8X8 != 0) + { + // If the size < 8x8 we get the mv from the bmi substructure for the + // nearest two blocks. + for (i = 0; i < 2; ++i) + { + ref Position mvRef = ref mvRefSearch[i]; + if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef)) + { + ref ModeInfo candidateMi = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value; + differentRefFound = true; + + if (candidateMi.RefFrame[0] == refFrame) + { + if (AddMvRefListEb(candidateMi.GetSubBlockMv(0, mvRef.Col, block), ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + else if (candidateMi.RefFrame[1] == refFrame) + { + if (AddMvRefListEb(candidateMi.GetSubBlockMv(1, mvRef.Col, block), ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + } + } + } + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. 
+ for (; i < MvrefNeighbours; ++i) + { + ref Position mvRef = ref mvRefSearch[i]; + if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef)) + { + ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value; + differentRefFound = true; + + if (candidate.RefFrame[0] == refFrame) + { + if (AddMvRefListEb(candidate.Mv[0], ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + else if (candidate.RefFrame[1] == refFrame) + { + if (AddMvRefListEb(candidate.Mv[1], ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + } + } + + // Check the last frame's mode and mv info. + if (!prevFrameMvs.IsNull) + { + if (prevFrameMvs.Value.RefFrame[0] == refFrame) + { + if (AddMvRefListEb(prevFrameMvs.Value.Mv[0], ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + else if (prevFrameMvs.Value.RefFrame[1] == refFrame) + { + if (AddMvRefListEb(prevFrameMvs.Value.Mv[1], ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + } + + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (differentRefFound) + { + for (i = 0; i < MvrefNeighbours; ++i) + { + ref Position mvRef = ref mvRefSearch[i]; + if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef)) + { + ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value; + + // If the candidate is Intra we don't want to consider its mv. + if (IsDiffRefFrameAddMvEb(ref candidate, refFrame, ref refSignBias, ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + } + } + + // Since we still don't have a candidate we'll try the last frame. 
+ if (!prevFrameMvs.IsNull) + { + if (prevFrameMvs.Value.RefFrame[0] != refFrame && prevFrameMvs.Value.RefFrame[0] > Constants.IntraFrame) + { + Mv mv = prevFrameMvs.Value.Mv[0]; + if (refSignBias[prevFrameMvs.Value.RefFrame[0]] != refSignBias[refFrame]) + { + mv.Row *= -1; + mv.Col *= -1; + } + if (AddMvRefListEb(mv, ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + + if (prevFrameMvs.Value.RefFrame[1] > Constants.IntraFrame && + prevFrameMvs.Value.RefFrame[1] != refFrame && + Unsafe.As<Mv, int>(ref prevFrameMvs.Value.Mv[1]) != Unsafe.As<Mv, int>(ref prevFrameMvs.Value.Mv[0])) + { + Mv mv = prevFrameMvs.Value.Mv[1]; + if (refSignBias[prevFrameMvs.Value.RefFrame[1]] != refSignBias[refFrame]) + { + mv.Row *= -1; + mv.Col *= -1; + } + if (AddMvRefListEb(mv, ref refmvCount, mvRefList, earlyBreak)) + { + goto Done; + } + } + } + + if (mode == PredictionMode.NearMv) + { + refmvCount = Constants.MaxMvRefCandidates; + } + else + { + // We only care about the nearestmv for the remaining modes + refmvCount = 1; + } + + Done: + // Clamp vectors + for (i = 0; i < refmvCount; ++i) + { + mvRefList[i].ClampMvRef(ref xd); + } + + return refmvCount; + } + + private static void AppendSub8x8MvsForIdx( + ref Vp9Common cm, + ref MacroBlockD xd, + Span<Position> mvRefSearch, + PredictionMode bMode, + int block, + int refr, + int miRow, + int miCol, + ref Mv bestSub8x8) + { + Span<Mv> mvList = stackalloc Mv[Constants.MaxMvRefCandidates]; + ref ModeInfo mi = ref xd.Mi[0].Value; + ref Array4<BModeInfo> bmi = ref mi.Bmi; + int n; + int refmvCount; + + Debug.Assert(Constants.MaxMvRefCandidates == 2); + + refmvCount = DecFindMvRefs(ref cm, ref xd, bMode, mi.RefFrame[refr], mvRefSearch, mvList, miRow, miCol, block, 1); + + switch (block) + { + case 0: bestSub8x8 = mvList[refmvCount - 1]; break; + case 1: + case 2: + if (bMode == PredictionMode.NearestMv) + { + bestSub8x8 = bmi[0].Mv[refr]; + } + else + { + bestSub8x8 = new Mv(); + for (n = 0; n < refmvCount; ++n) + { + if 
(Unsafe.As<Mv, int>(ref bmi[0].Mv[refr]) != Unsafe.As<Mv, int>(ref mvList[n])) + { + bestSub8x8 = mvList[n]; + break; + } + } + } + break; + case 3: + if (bMode == PredictionMode.NearestMv) + { + bestSub8x8 = bmi[2].Mv[refr]; + } + else + { + Span<Mv> candidates = stackalloc Mv[2 + Constants.MaxMvRefCandidates]; + candidates[0] = bmi[1].Mv[refr]; + candidates[1] = bmi[0].Mv[refr]; + candidates[2] = mvList[0]; + candidates[3] = mvList[1]; + bestSub8x8 = new Mv(); + for (n = 0; n < 2 + Constants.MaxMvRefCandidates; ++n) + { + if (Unsafe.As<Mv, int>(ref bmi[2].Mv[refr]) != Unsafe.As<Mv, int>(ref candidates[n])) + { + bestSub8x8 = candidates[n]; + break; + } + } + } + break; + default: Debug.Assert(false, "Invalid block index."); break; + } + } + + private static byte GetModeContext(ref Vp9Common cm, ref MacroBlockD xd, Span<Position> mvRefSearch, int miRow, int miCol) + { + int i; + int contextCounter = 0; + ref TileInfo tile = ref xd.Tile; + + // Get mode count from nearest 2 blocks + for (i = 0; i < 2; ++i) + { + ref Position mvRef = ref mvRefSearch[i]; + if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef)) + { + ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value; + // Keep counts for entropy encoding. + contextCounter += Luts.Mode2Counter[(int)candidate.Mode]; + } + } + + return (byte)Luts.CounterToContext[contextCounter]; + } + + private static void ReadInterBlockModeInfo( + ref Vp9Common cm, + ref MacroBlockD xd, + ref ModeInfo mi, + int miRow, + int miCol, + ref Reader r) + { + BlockSize bsize = mi.SbType; + bool allowHP = cm.AllowHighPrecisionMv; + Array2<Mv> bestRefMvs = new Array2<Mv>(); + int refr, isCompound; + byte interModeCtx; + Span<Position> mvRefSearch = Luts.MvRefBlocks[(int)bsize]; + + ReadRefFrames(ref cm, ref xd, ref r, mi.SegmentId, ref mi.RefFrame); + isCompound = mi.HasSecondRef() ? 
1 : 0; + interModeCtx = GetModeContext(ref cm, ref xd, mvRefSearch, miRow, miCol); + + if (cm.Seg.IsSegFeatureActive(mi.SegmentId, SegLvlFeatures.SegLvlSkip) != 0) + { + mi.Mode = PredictionMode.ZeroMv; + if (bsize < BlockSize.Block8x8) + { + xd.ErrorInfo.Value.InternalError(CodecErr.CodecUnsupBitstream, "Invalid usage of segement feature on small blocks"); + return; + } + } + else + { + if (bsize >= BlockSize.Block8x8) + { + mi.Mode = ReadInterMode(ref cm, ref xd, ref r, interModeCtx); + } + else + { + // Sub 8x8 blocks use the nearestmv as a ref_mv if the bMode is NewMv. + // Setting mode to NearestMv forces the search to stop after the nearestmv + // has been found. After bModes have been read, mode will be overwritten + // by the last bMode. + mi.Mode = PredictionMode.NearestMv; + } + + if (mi.Mode != PredictionMode.ZeroMv) + { + Span<Mv> tmpMvs = stackalloc Mv[Constants.MaxMvRefCandidates]; + + for (refr = 0; refr < 1 + isCompound; ++refr) + { + sbyte frame = mi.RefFrame[refr]; + int refmvCount; + + refmvCount = DecFindMvRefs(ref cm, ref xd, mi.Mode, frame, mvRefSearch, tmpMvs, miRow, miCol, -1, 0); + + DecFindBestRefMvs(allowHP, tmpMvs, ref bestRefMvs[refr], refmvCount); + } + } + } + + mi.InterpFilter = (cm.InterpFilter == Constants.Switchable) ? ReadSwitchableInterpFilter(ref cm, ref xd, ref r) : cm.InterpFilter; + + if (bsize < BlockSize.Block8x8) + { + int num4X4W = 1 << xd.BmodeBlocksWl; + int num4X4H = 1 << xd.BmodeBlocksHl; + int idx, idy; + PredictionMode bMode = 0; + Array2<Mv> bestSub8x8 = new Array2<Mv>(); + const uint invalidMv = 0x80008000; + // Initialize the 2nd element as even though it won't be used meaningfully + // if isCompound is false. 
// Reads the mode info of the current block (xd.Mi[0]) for a frame that is
// not intra-only: segment id, skip flag, inter/intra decision and transform
// size, followed by the inter- or intra-specific fields.
private static void ReadInterFrameModeInfo(
    ref Vp9Common cm,
    ref MacroBlockD xd,
    int miRow,
    int miCol,
    ref Reader r,
    int xMis,
    int yMis)
{
    ref ModeInfo info = ref xd.Mi[0].Value;

    // Every call below takes the same reader, so the call order is kept as is.
    info.SegmentId = (sbyte)ReadInterSegmentId(ref cm, ref xd, miRow, miCol, ref r, xMis, yMis);
    info.Skip = (sbyte)ReadSkip(ref cm, ref xd, info.SegmentId, ref r);

    bool isInter = ReadIsInterBlock(ref cm, ref xd, info.SegmentId, ref r);
    bool skippedInter = info.Skip != 0 && isInter;
    info.TxSize = ReadTxSize(ref cm, ref xd, !skippedInter, ref r);

    if (!isInter)
    {
        ReadIntraBlockModeInfo(ref cm, ref xd, ref info, ref r);
        return;
    }

    ReadInterBlockModeInfo(ref cm, ref xd, ref info, miRow, miCol, ref r);
}
// Returns the prediction mode of the 4x4 sub-block located above sub-block
// b (0..3). For the bottom row (b = 2, 3) that is a sub-block of the current
// block; for the top row (b = 0, 1) it comes from aboveMi, falling back to
// DcPred when aboveMi is null or is an inter block.
private static PredictionMode AboveBlockMode(Ptr<ModeInfo> curMi, Ptr<ModeInfo> aboveMi, int b)
{
    bool inTopRow = b == 0 || b == 1;

    if (!inTopRow)
    {
        Debug.Assert(b == 2 || b == 3);
        return curMi.Value.Bmi[b - 2].Mode;
    }

    bool aboveIsIntra = !aboveMi.IsNull && !aboveMi.Value.IsInterBlock();

    return aboveIsIntra ? aboveMi.Value.GetYMode(b + 2) : PredictionMode.DcPred;
}
// Reads the mode info for the block at (miRow, miCol). For frames that are
// not intra-only, the block's reference frames and motion vectors are also
// copied into every cell of the xMis x yMis area of cm.CurFrameMvs.
public static void ReadModeInfo(
    ref TileWorkerData twd,
    ref Vp9Common cm,
    int miRow,
    int miCol,
    int xMis,
    int yMis)
{
    ref Reader r = ref twd.BitReader;
    ref MacroBlockD xd = ref twd.Xd;
    ref ModeInfo mi = ref xd.Mi[0].Value;
    ArrayPtr<MvRef> rowMvs = cm.CurFrameMvs.Slice(miRow * cm.MiCols + miCol);

    if (cm.FrameIsIntraOnly())
    {
        ReadIntraFrameModeInfo(ref cm, ref xd, miRow, miCol, ref r, xMis, yMis);
        return;
    }

    ReadInterFrameModeInfo(ref cm, ref xd, miRow, miCol, ref r, xMis, yMis);

    // rowMvs moves down by one row of cm.MiCols entries after each pass.
    for (int row = 0; row < yMis; row++, rowMvs = rowMvs.Slice(cm.MiCols))
    {
        for (int col = 0; col < xMis; col++)
        {
            ref MvRef cell = ref rowMvs[col];
            CopyRefFramePair(ref cell.RefFrame, ref mi.RefFrame);
            CopyMvPair(ref cell.Mv, ref mi.Mv);
        }
    }
}
// Decodes one VP9 frame from bitstream into output.
//
// pictureInfo supplies the already parsed frame header fields, the entropy
// probabilities and the backward-update counters; the latter two are
// referenced in place through Ptr<T>. mvsIn is copied into the previous-frame
// motion vector buffer before decoding and mvsOut receives the current-frame
// motion vectors afterwards.
//
// Returns true on success and false when tile decoding raised an
// InternalErrorException. SetMvs/GetMvs throw ArgumentException when the
// supplied spans are longer than the internal buffers.
//
// The tile worker data and context buffers are released on every exit path,
// including the error return and exceptions.
public unsafe bool Decode(
    ref Vp9PictureInfo pictureInfo,
    ISurface output,
    ReadOnlySpan<byte> bitstream,
    ReadOnlySpan<Vp9MvRef> mvsIn,
    Span<Vp9MvRef> mvsOut)
{
    Vp9Common cm = new Vp9Common();

    cm.FrameType = pictureInfo.IsKeyFrame ? FrameType.KeyFrame : FrameType.InterFrame;
    cm.IntraOnly = pictureInfo.IntraOnly;

    cm.Width = output.Width;
    cm.Height = output.Height;
    cm.SubsamplingX = 1;
    cm.SubsamplingY = 1;

    cm.UsePrevFrameMvs = pictureInfo.UsePrevInFindMvRefs;

    cm.RefFrameSignBias = pictureInfo.RefFrameSignBias;

    cm.BaseQindex = pictureInfo.BaseQIndex;
    cm.YDcDeltaQ = pictureInfo.YDcDeltaQ;
    cm.UvAcDeltaQ = pictureInfo.UvAcDeltaQ;
    cm.UvDcDeltaQ = pictureInfo.UvDcDeltaQ;

    cm.Mb.Lossless = pictureInfo.Lossless;
    cm.Mb.Bd = 8;

    cm.TxMode = (TxMode)pictureInfo.TransformMode;

    cm.AllowHighPrecisionMv = pictureInfo.AllowHighPrecisionMv;

    cm.InterpFilter = (byte)pictureInfo.InterpFilter;

    // Any value other than Switchable is an index into LiteralToFilter.
    if (cm.InterpFilter != Constants.Switchable)
    {
        cm.InterpFilter = LiteralToFilter[cm.InterpFilter];
    }

    cm.ReferenceMode = (ReferenceMode)pictureInfo.ReferenceMode;

    cm.CompFixedRef = pictureInfo.CompFixedRef;
    cm.CompVarRef = pictureInfo.CompVarRef;

    cm.Log2TileCols = pictureInfo.Log2TileCols;
    cm.Log2TileRows = pictureInfo.Log2TileRows;

    cm.Seg.Enabled = pictureInfo.SegmentEnabled;
    cm.Seg.UpdateMap = pictureInfo.SegmentMapUpdate;
    cm.Seg.TemporalUpdate = pictureInfo.SegmentMapTemporalUpdate;
    cm.Seg.AbsDelta = (byte)pictureInfo.SegmentAbsDelta;
    cm.Seg.FeatureMask = pictureInfo.SegmentFeatureEnable;
    cm.Seg.FeatureData = pictureInfo.SegmentFeatureData;

    cm.Lf.ModeRefDeltaEnabled = pictureInfo.ModeRefDeltaEnabled;
    cm.Lf.RefDeltas = pictureInfo.RefDeltas;
    cm.Lf.ModeDeltas = pictureInfo.ModeDeltas;

    cm.Fc = new Ptr<Vp9EntropyProbs>(ref pictureInfo.Entropy);
    cm.Counts = new Ptr<Vp9BackwardUpdates>(ref pictureInfo.BackwardUpdateCounts);

    cm.FrameRefs[0].Buf = (Surface)pictureInfo.LastReference;
    cm.FrameRefs[1].Buf = (Surface)pictureInfo.GoldenReference;
    cm.FrameRefs[2].Buf = (Surface)pictureInfo.AltReference;
    cm.Mb.CurBuf = (Surface)output;

    cm.Mb.SetupBlockPlanes(1, 1);

    int tileCols = 1 << pictureInfo.Log2TileCols;
    int tileRows = 1 << pictureInfo.Log2TileRows;

    // Videos usually have only 4 tile columns, so more threads won't make a difference for those.
    // Try not to take all CPU cores for video decoding.
    int maxThreads = Math.Min(4, Environment.ProcessorCount / 2);

    cm.AllocTileWorkerData(_allocator, tileCols, tileRows, maxThreads);
    cm.AllocContextBuffers(_allocator, output.Width, output.Height);

    try
    {
        cm.InitContextBuffers();
        cm.SetupSegmentationDequant();
        cm.SetupScaleFactors();

        SetMvs(ref cm, mvsIn);

        fixed (byte* dataPtr = bitstream)
        {
            try
            {
                // The multi-threaded path is only used for a single tile row
                // with several tile columns.
                if (maxThreads > 1 && tileRows == 1 && tileCols > 1)
                {
                    DecodeFrame.DecodeTilesMt(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length), maxThreads);
                }
                else
                {
                    DecodeFrame.DecodeTiles(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length));
                }
            }
            catch (InternalErrorException)
            {
                return false;
            }
        }

        GetMvs(ref cm, mvsOut);

        return true;
    }
    finally
    {
        // Previously skipped on the error return, which left the per-frame
        // allocations held by _allocator until Dispose.
        cm.FreeTileWorkerData(_allocator);
        cm.FreeContextBuffers(_allocator);
    }
}
// Copies the first mvs.Length entries of cm.CurFrameMvs (both motion vectors
// and both reference frames of each entry) into mvs. Throws
// ArgumentException when mvs is longer than cm.CurFrameMvs.
private static void GetMvs(ref Vp9Common cm, Span<Vp9MvRef> mvs)
{
    if (mvs.Length > cm.CurFrameMvs.Length)
    {
        throw new ArgumentException($"Size mismatch, expected: {cm.CurFrameMvs.Length}, but got: {mvs.Length}.");
    }

    for (int entry = 0; entry < mvs.Length; entry++)
    {
        ref var source = ref cm.CurFrameMvs[entry];
        ref Vp9MvRef target = ref mvs[entry];

        for (int slot = 0; slot < 2; slot++)
        {
            target.Mvs[slot].Row = source.Mv[slot].Row;
            target.Mvs[slot].Col = source.Mv[slot].Col;
        }

        for (int slot = 0; slot < 2; slot++)
        {
            target.RefFrames[slot] = source.RefFrame[slot];
        }
    }
}
// Decodes the coefficient tokens of one transform block and writes the
// dequantized values into dqcoeff at the positions given by scan.
//
//   xd      - block state; supplies the probabilities (Fc), the optional
//             counters (Counts), the bit depth (Bd) and the current mode info.
//   type    - plane type used to index the probability/counter tables.
//   dqcoeff - destination for the dequantized coefficients.
//   txSize  - transform size; bounds the coefficient count (maxEob below).
//   dq      - dequantizers: dq[0] is applied to the first position, dq[1] to
//             every later one.
//   ctx     - initial context index; updated from tokenCache as decoding
//             proceeds.
//   scan    - maps the decode index c to a coefficient position.
//   nb      - neighbor table consumed by GetCoefContext.
//   r       - bit reader; its Value/Range/Count are written back on every
//             return path.
//
// Returns the index one past the last decoded position (0 when the very
// first token is end-of-block).
private static int DecodeCoefs(
    ref MacroBlockD xd,
    PlaneType type,
    Span<int> dqcoeff,
    TxSize txSize,
    ref Array2<short> dq,
    int ctx,
    ReadOnlySpan<short> scan,
    ReadOnlySpan<short> nb,
    ref Reader r)
{
    // counts is only touched behind the !xd.Counts.IsNull checks below.
    ref Vp9BackwardUpdates counts = ref xd.Counts.Value;
    // 16 scaled by 4 for each transform size step.
    int maxEob = 16 << ((int)txSize << 1);
    ref Vp9EntropyProbs fc = ref xd.Fc.Value;
    // Selects the inter (1) or intra (0) half of the tables.
    int refr = xd.Mi[0].Value.IsInterBlock() ? 1 : 0;
    int band, c = 0;
    ref Array6<Array6<Array3<byte>>> coefProbs = ref fc.CoefProbs[(int)txSize][(int)type][refr];
    // Token class (0..5) per coefficient position; read back by GetCoefContext.
    Span<byte> tokenCache = stackalloc byte[32 * 32];
    ReadOnlySpan<byte> bandTranslate = Luts.get_band_translate(txSize);
    // 32x32 results are shifted right by one extra bit after dequantization.
    int dqShift = (txSize == TxSize.Tx32x32) ? 1 : 0;
    int v;
    short dqv = dq[0];
    // The 10-bit case reuses the 12-bit table starting at offset 2 and reads
    // two bits fewer.
    ReadOnlySpan<byte> cat6Prob = (xd.Bd == 12)
        ? Luts.Vp9Cat6ProbHigh12
        : (xd.Bd == 10) ? Luts.Vp9Cat6ProbHigh12.Slice(2) : Luts.Vp9Cat6Prob;
    int cat6Bits = (xd.Bd == 12) ? 18 : (xd.Bd == 10) ? 16 : 14;
    // Keep value, range, and count as locals. The compiler produces better
    // results with the locals than using r directly.
    ulong value = r.Value;
    uint range = r.Range;
    int count = r.Count;

    while (c < maxEob)
    {
        int val = -1;
        // bandTranslate is consumed one entry per decoded position.
        band = bandTranslate[0];
        bandTranslate = bandTranslate.Slice(1);
        ref Array3<byte> prob = ref coefProbs[band][ctx];
        if (!xd.Counts.IsNull)
        {
            ++counts.EobBranch[(int)txSize][(int)type][refr][band][ctx];
        }

        // A zero bit here means end of block: stop decoding.
        if (r.ReadBool(prob[EobContextNode], ref value, ref count, ref range) == 0)
        {
            if (!xd.Counts.IsNull)
            {
                ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.EobModelToken];
            }

            break;
        }

        // Run of zero tokens: each one advances c without writing dqcoeff.
        while (r.ReadBool(prob[ZeroContextNode], ref value, ref count, ref range) == 0)
        {
            if (!xd.Counts.IsNull)
            {
                ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.ZeroToken];
            }

            dqv = dq[1];
            tokenCache[scan[c]] = 0;
            ++c;
            if (c >= maxEob)
            {
                r.Value = value;
                r.Range = range;
                r.Count = count;
                return c; // Zero tokens at the end (no eob token)
            }
            ctx = GetCoefContext(nb, tokenCache, c);
            band = bandTranslate[0];
            bandTranslate = bandTranslate.Slice(1);
            prob = ref coefProbs[band][ctx];
        }

        if (r.ReadBool(prob[OneContextNode], ref value, ref count, ref range) != 0)
        {
            // Magnitude of two or more: the remaining tree probabilities come
            // from the Pareto table row selected by the pivot probability.
            ReadOnlySpan<byte> p = Luts.Vp9Pareto8Full[prob[Constants.PivotNode] - 1];
            if (!xd.Counts.IsNull)
            {
                ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.TwoToken];
            }

            if (r.ReadBool(p[0], ref value, ref count, ref range) != 0)
            {
                // Category tokens: a base value plus extra bits read by ReadCoeff.
                if (r.ReadBool(p[3], ref value, ref count, ref range) != 0)
                {
                    tokenCache[scan[c]] = 5;
                    if (r.ReadBool(p[5], ref value, ref count, ref range) != 0)
                    {
                        if (r.ReadBool(p[7], ref value, ref count, ref range) != 0)
                        {
                            val = Constants.Cat6MinVal + ReadCoeff(ref r, cat6Prob, cat6Bits, ref value, ref count, ref range);
                        }
                        else
                        {
                            val = Constants.Cat5MinVal + ReadCoeff(ref r, Luts.Vp9Cat5Prob, 5, ref value, ref count, ref range);
                        }
                    }
                    else if (r.ReadBool(p[6], ref value, ref count, ref range) != 0)
                    {
                        val = Constants.Cat4MinVal + ReadCoeff(ref r, Luts.Vp9Cat4Prob, 4, ref value, ref count, ref range);
                    }
                    else
                    {
                        val = Constants.Cat3MinVal + ReadCoeff(ref r, Luts.Vp9Cat3Prob, 3, ref value, ref count, ref range);
                    }
                }
                else
                {
                    tokenCache[scan[c]] = 4;
                    if (r.ReadBool(p[4], ref value, ref count, ref range) != 0)
                    {
                        val = Constants.Cat2MinVal + ReadCoeff(ref r, Luts.Vp9Cat2Prob, 2, ref value, ref count, ref range);
                    }
                    else
                    {
                        val = Constants.Cat1MinVal + ReadCoeff(ref r, Luts.Vp9Cat1Prob, 1, ref value, ref count, ref range);
                    }
                }
                // Val may use 18-bits
                // (hence the widening to long before multiplying).
                v = (int)(((long)val * dqv) >> dqShift);
            }
            else
            {
                // Magnitude 2, 3 or 4.
                if (r.ReadBool(p[1], ref value, ref count, ref range) != 0)
                {
                    tokenCache[scan[c]] = 3;
                    v = ((3 + r.ReadBool(p[2], ref value, ref count, ref range)) * dqv) >> dqShift;
                }
                else
                {
                    tokenCache[scan[c]] = 2;
                    v = (2 * dqv) >> dqShift;
                }
            }
        }
        else
        {
            // Magnitude of exactly one.
            if (!xd.Counts.IsNull)
            {
                ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.OneToken];
            }

            tokenCache[scan[c]] = 1;
            v = dqv >> dqShift;
        }
        // The sign is read with probability 128; a set bit negates the value.
        dqcoeff[scan[c]] = (int)HighbdCheckRange(r.ReadBool(128, ref value, ref count, ref range) != 0 ? -v : v, xd.Bd);
        ++c;
        ctx = GetCoefContext(nb, tokenCache, c);
        dqv = dq[1];
    }

    r.Value = value;
    r.Range = range;
    r.Count = count;
    return c;
}
// Decodes the coefficient tokens of one transform block of the given plane
// and updates the above/left entropy contexts.
//
// The starting context counts how many of the two neighbours (above, left)
// have a non-zero context across the width of the transform. After decoding,
// both contexts are overwritten with 1s when at least one coefficient was
// decoded (eob > 0) and 0s otherwise, shifted right by the amounts from
// GetCtxShift.
//
// Returns the end-of-block index reported by DecodeCoefs, or 0 for an
// unknown transform size.
public static int DecodeBlockTokens(
    ref TileWorkerData twd,
    int plane,
    Luts.ScanOrder sc,
    int x,
    int y,
    TxSize txSize,
    int segId)
{
    ref Reader r = ref twd.BitReader;
    ref MacroBlockD xd = ref twd.Xd;
    ref MacroBlockDPlane pd = ref xd.Plane[plane];
    ref Array2<short> dequant = ref pd.SegDequant[segId];
    Span<sbyte> above = pd.AboveContext.AsSpan().Slice(x);
    Span<sbyte> left = pd.LeftContext.AsSpan().Slice(y);
    int shiftAbove = 0;
    int shiftLeft = 0;
    bool aboveSet;
    bool leftSet;

    // Phase 1: read the neighbouring contexts, widened to cover the transform.
    switch (txSize)
    {
        case TxSize.Tx4x4:
            aboveSet = above[0] != 0;
            leftSet = left[0] != 0;
            break;
        case TxSize.Tx8x8:
            GetCtxShift(ref xd, ref shiftAbove, ref shiftLeft, x, y, 1 << (int)TxSize.Tx8x8);
            aboveSet = MemoryMarshal.Cast<sbyte, ushort>(above)[0] != 0;
            leftSet = MemoryMarshal.Cast<sbyte, ushort>(left)[0] != 0;
            break;
        case TxSize.Tx16x16:
            GetCtxShift(ref xd, ref shiftAbove, ref shiftLeft, x, y, 1 << (int)TxSize.Tx16x16);
            aboveSet = MemoryMarshal.Cast<sbyte, uint>(above)[0] != 0;
            leftSet = MemoryMarshal.Cast<sbyte, uint>(left)[0] != 0;
            break;
        case TxSize.Tx32x32:
            GetCtxShift(ref xd, ref shiftAbove, ref shiftLeft, x, y, 1 << (int)TxSize.Tx32x32);
            // NOTE: Casting to ulong here is safe because the default memory
            // alignment is at least 8 bytes and the Tx32x32 is aligned on 8 byte
            // boundaries.
            aboveSet = MemoryMarshal.Cast<sbyte, ulong>(above)[0] != 0;
            leftSet = MemoryMarshal.Cast<sbyte, ulong>(left)[0] != 0;
            break;
        default:
            Debug.Assert(false, "Invalid transform size.");
            return 0;
    }

    // Phase 2: decode the tokens.
    int ctx = (aboveSet ? 1 : 0) + (leftSet ? 1 : 0);
    int eob = DecodeCoefs(
        ref xd,
        GetPlaneType(plane),
        pd.DqCoeff.AsSpan(),
        txSize,
        ref dequant,
        ctx,
        sc.Scan,
        sc.Neighbors,
        ref r);
    bool hasCoefs = eob > 0;

    // Phase 3: store the new contexts, one byte per covered 4x4 unit.
    switch (txSize)
    {
        case TxSize.Tx4x4:
            above[0] = left[0] = (sbyte)(hasCoefs ? 1 : 0);
            break;
        case TxSize.Tx8x8:
            MemoryMarshal.Cast<sbyte, ushort>(above)[0] = (ushort)((hasCoefs ? 0x0101 : 0) >> shiftAbove);
            MemoryMarshal.Cast<sbyte, ushort>(left)[0] = (ushort)((hasCoefs ? 0x0101 : 0) >> shiftLeft);
            break;
        case TxSize.Tx16x16:
            MemoryMarshal.Cast<sbyte, uint>(above)[0] = (uint)((hasCoefs ? 0x01010101 : 0) >> shiftAbove);
            MemoryMarshal.Cast<sbyte, uint>(left)[0] = (uint)((hasCoefs ? 0x01010101 : 0) >> shiftLeft);
            break;
        case TxSize.Tx32x32:
            MemoryMarshal.Cast<sbyte, ulong>(above)[0] = (hasCoefs ? 0x0101010101010101UL : 0) >> shiftAbove;
            MemoryMarshal.Cast<sbyte, ulong>(left)[0] = (hasCoefs ? 0x0101010101010101UL : 0) >> shiftLeft;
            break;
    }

    return eob;
}
// Narrows the four 32-bit lanes of value to bytes in two saturating steps
// (32 -> 16 bits, then 16 -> 8 bits). The four results end up in the lowest
// four bytes; the remaining bytes come from the zero operand.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
{
    Vector128<ushort> words = Sse41.PackUnsignedSaturate(value, zero);

    return Sse2.PackUnsignedSaturate(words.AsInt16(), zero.AsInt16());
}
// Horizontal SubpelTaps-tap filter over a w x h area.
//
// x0Q4 is the starting horizontal position and xStepQ4 the per-pixel step;
// the bits above SubpelBits select the source pixel and the low bits
// (SubpelMask) select the kernel in xFilters. When the step is exactly one
// whole pixel and SSE4.1 is available, the vectorized ConvolveHorizSse41
// path is used instead of the scalar loop.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void ConvolveHoriz(
    byte* src,
    int srcStride,
    byte* dst,
    int dstStride,
    Array8<short>[] xFilters,
    int x0Q4,
    int xStepQ4,
    int w,
    int h)
{
    if (Sse41.IsSupported && UseIntrinsics && xStepQ4 == 1 << SubpelBits)
    {
        ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);
        return;
    }

    // Step back so the kernel window starts left of the output pixel.
    src -= SubpelTaps / 2 - 1;

    for (int row = 0; row < h; row++, src += srcStride, dst += dstStride)
    {
        int posQ4 = x0Q4;

        for (int col = 0; col < w; col++, posQ4 += xStepQ4)
        {
            byte* window = src + (posQ4 >> SubpelBits);
            ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

            int acc = 0;
            for (int tap = 0; tap < SubpelTaps; tap++)
            {
                acc += window[tap] * kernel[tap];
            }

            dst[col] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
        }
    }
}
// Vectorized vertical 8-tap filter producing four output pixels per step.
//
// A single kernel, selected by the fractional part of y0Q4, is applied to
// every row; the caller (ConvolveVert) only takes this path when the vertical
// step is exactly one whole pixel. Each iteration of the inner loop writes 4
// bytes to dst and gathers 4 bytes from each of 8 consecutive source rows.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void ConvolveVertAvx2(
    byte* src,
    int srcStride,
    byte* dst,
    int dstStride,
    Array8<short>[] yFilters,
    int y0Q4,
    int w,
    int h)
{
    Vector128<int> zero = Vector128<int>.Zero;
    // Rounding term added by RoundShift before its right shift.
    Vector128<int> const64 = Vector128.Create(64);
    // Byte offsets of 8 consecutive rows, used as gather indices (scale 1).
    Vector256<int> indices = Vector256.Create(
        0,
        srcStride,
        srcStride * 2,
        srcStride * 3,
        srcStride * 4,
        srcStride * 5,
        srcStride * 6,
        srcStride * 7);

    ulong x, y;
    // Step back so the kernel window starts above the output row.
    src -= srcStride * (SubpelTaps / 2 - 1);

    fixed (Array8<short>* yFilter = yFilters)
    {
        // Load the 8 coefficients of the kernel selected by the sub-pixel phase.
        Vector128<short> vfilter = Sse2.LoadVector128((short*)yFilter + (uint)(y0Q4 & SubpelMask) * 8);

        ulong srcBaseY = (uint)y0Q4 >> SubpelBits;
        for (y = 0; y < (uint)h; ++y)
        {
            // src itself is not advanced per row; the row offset is recomputed.
            ulong srcOffset = (srcBaseY + y) * (uint)srcStride;
            for (x = 0; x < (uint)w; x += 4)
            {
                // One 32-bit load (4 pixels) from each of the 8 rows.
                Vector256<int> vsrc = Avx2.GatherVector256((uint*)&src[srcOffset + x], indices, 1).AsInt32();

                // Rows 0..3 and rows 4..7.
                Vector128<int> vsrcL = vsrc.GetLower();
                Vector128<int> vsrcH = vsrc.GetUpper();

                // Three rounds of byte interleaving transpose the 8x4 tile so
                // that each 64-bit half holds the 8 vertical samples of one column.
                Vector128<byte> vsrcUnpck11 = Sse2.UnpackLow(vsrcL.AsByte(), vsrcH.AsByte());
                Vector128<byte> vsrcUnpck12 = Sse2.UnpackHigh(vsrcL.AsByte(), vsrcH.AsByte());

                Vector128<byte> vsrcUnpck21 = Sse2.UnpackLow(vsrcUnpck11, vsrcUnpck12);
                Vector128<byte> vsrcUnpck22 = Sse2.UnpackHigh(vsrcUnpck11, vsrcUnpck12);

                // vsrc01: columns 0 (low half) and 1 (high half); vsrc23: columns 2 and 3.
                Vector128<byte> vsrc01 = Sse2.UnpackLow(vsrcUnpck21, vsrcUnpck22);
                Vector128<byte> vsrc23 = Sse2.UnpackHigh(vsrcUnpck21, vsrcUnpck22);

                // Bring columns 1 and 3 down into the low half.
                Vector128<byte> vsrc11 = Sse.MoveHighToLow(vsrc01.AsSingle(), vsrc01.AsSingle()).AsByte();
                Vector128<byte> vsrc33 = Sse.MoveHighToLow(vsrc23.AsSingle(), vsrc23.AsSingle()).AsByte();

                // Zero-extend the low 8 bytes of each vector to 16-bit lanes.
                Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(vsrc01);
                Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(vsrc11);
                Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(vsrc23);
                Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(vsrc33);

                // One filtered sum per column.
                Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);

                // Round, saturate to bytes and store the low 4 bytes in one scalar store.
                Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
            }
            dst += dstStride;
        }
    }
}
// Two-dimensional 8-tap convolution of a w x h block.
//
// The work is done in two passes through a fixed-size scratch buffer:
//   (1) filter horizontally into the scratch buffer;
//   (2) filter the scratch buffer vertically into dst.
//
// The scratch buffer is 64 bytes wide and 135 rows tall. The row count is
// derived as follows:
//   - the smallest normative scaling factor is x1/2, i.e. yStepQ4 = 32;
//   - the largest block is 64x64 pixels;
//   - 64 output rows then span (64 - 1) * 32 source positions, in 1/16th
//     pixel units;
//   - the block may start at a sub-pixel position, so round up;
//   - the 8-tap filter tails need SubpelTaps additional rows;
//   - ((64 - 1) * 32 + 15) >> 4 + 8 = 135.
// The frame scaling path may go down to x1/4 (yStepQ4 = 64), but there w and
// h are at most 16, so the buffer is still large enough.
[SkipLocalsInit]
public static unsafe void Convolve8(
    byte* src,
    int srcStride,
    byte* dst,
    int dstStride,
    Array8<short>[] filter,
    int x0Q4,
    int xStepQ4,
    int y0Q4,
    int yStepQ4,
    int w,
    int h)
{
    const int ScratchStride = 64;
    const int ScratchRows = 135;

    byte* scratch = stackalloc byte[ScratchStride * ScratchRows];

    // Rows the horizontal pass must produce to feed the vertical pass.
    int lastRowQ4 = (h - 1) * yStepQ4 + y0Q4;
    int scratchHeight = (lastRowQ4 >> SubpelBits) + SubpelTaps;

    Debug.Assert(w <= 64);
    Debug.Assert(h <= 64);
    Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
    Debug.Assert(xStepQ4 <= 64);

    // Rows of filter context located above the first output row.
    int leadRows = SubpelTaps / 2 - 1;

    ConvolveHoriz(src - srcStride * leadRows, srcStride, scratch, ScratchStride, filter, x0Q4, xStepQ4, w, scratchHeight);
    ConvolveVert(scratch + ScratchStride * leadRows, ScratchStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
}
// Scaled-prediction alias for Convolve8Vert; all arguments are passed through unchanged.
public static unsafe void ScaledVert(
    byte* src, int srcStride, byte* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h)
    => Convolve8Vert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);

// Scaled-prediction alias for Convolve8; all arguments are passed through unchanged.
public static unsafe void Scaled2D(
    byte* src, int srcStride, byte* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h)
    => Convolve8(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);

// Scaled-prediction alias for Convolve8AvgHoriz; all arguments are passed through unchanged.
public static unsafe void ScaledAvgHoriz(
    byte* src, int srcStride, byte* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h)
    => Convolve8AvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);

// Scaled-prediction alias for Convolve8AvgVert; all arguments are passed through unchanged.
public static unsafe void ScaledAvgVert(
    byte* src, int srcStride, byte* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h)
    => Convolve8AvgVert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);

// Scaled-prediction alias for Convolve8Avg; all arguments are passed through unchanged.
public static unsafe void ScaledAvg2D(
    byte* src, int srcStride, byte* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h)
    => Convolve8Avg(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);

// Horizontal 8-tap filter over 16-bit samples.
// For each output sample the position advances by xStepQ4 (1/16 sample units): the integer
// part selects the source window and the low SubpelBits bits select the kernel in xFilters.
// The tap sum is rounded by FilterBits and clipped for bit depth bd.
private static unsafe void HighbdConvolveHoriz(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] xFilters,
    int x0Q4, int xStepQ4, int w, int h, int bd)
{
    // The tap window begins SubpelTaps / 2 - 1 samples before the integer position.
    src -= SubpelTaps / 2 - 1;

    for (int row = 0; row < h; ++row, src += srcStride, dst += dstStride)
    {
        int posQ4 = x0Q4;

        for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
        {
            ushort* window = src + (posQ4 >> SubpelBits);
            ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

            int acc = 0;
            for (int tap = 0; tap < SubpelTaps; ++tap)
            {
                acc += window[tap] * kernel[tap];
            }

            dst[col] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
        }
    }
}

// Same filtering as HighbdConvolveHoriz, but the clipped result is combined with the
// sample already in dst via BitUtils.RoundPowerOfTwo(dst + result, 1).
private static unsafe void HighbdConvolveAvgHoriz(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] xFilters,
    int x0Q4, int xStepQ4, int w, int h, int bd)
{
    src -= SubpelTaps / 2 - 1;

    for (int row = 0; row < h; ++row, src += srcStride, dst += dstStride)
    {
        int posQ4 = x0Q4;

        for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
        {
            ushort* window = src + (posQ4 >> SubpelBits);
            ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

            int acc = 0;
            for (int tap = 0; tap < SubpelTaps; ++tap)
            {
                acc += window[tap] * kernel[tap];
            }

            int filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
            dst[col] = (ushort)BitUtils.RoundPowerOfTwo(dst[col] + filtered, 1);
        }
    }
}

// Vertical 8-tap filter over 16-bit samples, processed one column at a time.
// The position advances by yStepQ4 per output row; taps are srcStride apart.
private static unsafe void HighbdConvolveVert(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] yFilters,
    int y0Q4, int yStepQ4, int w, int h, int bd)
{
    // The tap window begins SubpelTaps / 2 - 1 rows above the integer position.
    src -= srcStride * (SubpelTaps / 2 - 1);

    for (int col = 0; col < w; ++col, ++src, ++dst)
    {
        int posQ4 = y0Q4;

        for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
        {
            ushort* window = src + (posQ4 >> SubpelBits) * srcStride;
            ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

            int acc = 0;
            for (int tap = 0; tap < SubpelTaps; ++tap)
            {
                acc += window[tap * srcStride] * kernel[tap];
            }

            dst[row * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
        }
    }
}

// Same filtering as HighbdConvolveVert, but the clipped result is combined with the
// sample already in dst via BitUtils.RoundPowerOfTwo(dst + result, 1).
// The name lacks the "bd" of its siblings; it is kept because callers reference it.
private static unsafe void HighConvolveAvgVert(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] yFilters,
    int y0Q4, int yStepQ4, int w, int h, int bd)
{
    src -= srcStride * (SubpelTaps / 2 - 1);

    for (int col = 0; col < w; ++col, ++src, ++dst)
    {
        int posQ4 = y0Q4;

        for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
        {
            ushort* window = src + (posQ4 >> SubpelBits) * srcStride;
            ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

            int acc = 0;
            for (int tap = 0; tap < SubpelTaps; ++tap)
            {
                acc += window[tap * srcStride] * kernel[tap];
            }

            int filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
            dst[row * dstStride] = (ushort)BitUtils.RoundPowerOfTwo(dst[row * dstStride] + filtered, 1);
        }
    }
}
// Two-dimensional filter over 16-bit samples: a horizontal pass into a stack scratch
// buffer followed by a vertical pass from that buffer into dst.
//
// [SkipLocalsInit] mirrors the 8-bit Convolve8: the vertical pass reads rows
// 0 .. intermediateHeight - 1 and columns 0 .. w - 1 of temp, which is the region the
// horizontal pass has just written, so zero-filling the 17 KB buffer on every call is
// wasted work.
[SkipLocalsInit]
private static unsafe void HighbdConvolve(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
{
    // Note: Fixed size intermediate buffer, temp, places limits on parameters.
    // 2d filtering proceeds in 2 steps:
    //   (1) Interpolate horizontally into an intermediate buffer, temp.
    //   (2) Interpolate temp vertically to derive the sub-pixel result.
    // Deriving the maximum number of rows in the temp buffer (135):
    // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
    // --Largest block size is 64x64 pixels.
    // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
    //   original frame (in 1/16th pixel units).
    // --Must round-up because block may be located at sub-pixel position.
    // --Require an additional SubpelTaps rows for the 8-tap filter tails.
    // --(((64 - 1) * 32 + 15) >> 4) + 8 = 135.
    const int TempStride = 64;
    const int TempRows = 135;

    ushort* temp = stackalloc ushort[TempStride * TempRows];
    int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

    Debug.Assert(w <= 64);
    Debug.Assert(h <= 64);
    Debug.Assert(yStepQ4 <= 32);
    Debug.Assert(xStepQ4 <= 32);

    // The horizontal pass starts SubpelTaps / 2 - 1 rows above the block, and the vertical
    // pass is handed a pointer the same number of rows into temp.
    int leadRows = SubpelTaps / 2 - 1;

    HighbdConvolveHoriz(src - srcStride * leadRows, srcStride, temp, TempStride, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
    HighbdConvolveVert(temp + TempStride * leadRows, TempStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
}

// Horizontal entry point: passes x0Q4/xStepQ4 on to HighbdConvolveHoriz.
// y0Q4 and yStepQ4 are not read.
public static unsafe void HighbdConvolve8Horiz(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
    => HighbdConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);

// Averaging horizontal entry point: passes x0Q4/xStepQ4 on to HighbdConvolveAvgHoriz.
// y0Q4 and yStepQ4 are not read.
public static unsafe void HighbdConvolve8AvgHoriz(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
    => HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);

// Vertical entry point: passes y0Q4/yStepQ4 on to HighbdConvolveVert.
// x0Q4 and xStepQ4 are not read.
public static unsafe void HighbdConvolve8Vert(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
    => HighbdConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);

// Averaging vertical entry point: passes y0Q4/yStepQ4 on to HighConvolveAvgVert
// (the helper's name really does lack "bd"). x0Q4 and xStepQ4 are not read.
public static unsafe void HighbdConvolve8AvgVert(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
    => HighConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);

// Public two-dimensional entry point; all arguments are passed through to HighbdConvolve.
public static unsafe void HighbdConvolve8(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
    => HighbdConvolve(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
// Two-dimensional filter over 16-bit samples whose result is averaged into dst:
// HighbdConvolve8 renders into a 64x64 scratch block, then HighbdConvolveAvg blends that
// block with dst.
//
// [SkipLocalsInit]: HighbdConvolveAvg below reads only the w x h region of temp, so
// clearing the whole 8 KiB buffer on every call is wasted work.
// NOTE(review): this relies on HighbdConvolve8 storing every one of those w x h samples -
// confirm against HighbdConvolveVert before merging.
[SkipLocalsInit]
public static unsafe void HighbdConvolve8Avg(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
{
    // Fixed size intermediate buffer places limits on parameters.
    const int TempStride = 64;

    ushort* temp = stackalloc ushort[TempStride * 64];
    Debug.Assert(w <= 64);
    Debug.Assert(h <= 64);

    HighbdConvolve8(src, srcStride, temp, TempStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
    HighbdConvolveAvg(temp, TempStride, dst, dstStride, null, 0, 0, 0, 0, w, h, bd);
}

// Copies an h-row block, w elements per row, from src to dst. The filter, phase, step
// and bd parameters are not read.
public static unsafe void HighbdConvolveCopy(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
{
    for (int row = 0; row < h; ++row)
    {
        MemoryUtil.Copy(dst, src, w);
        src += srcStride;
        dst += dstStride;
    }
}

// Replaces each of the w x h samples in dst with BitUtils.RoundPowerOfTwo(dst + src, 1).
// The filter, phase, step and bd parameters are not read.
public static unsafe void HighbdConvolveAvg(
    ushort* src, int srcStride, ushort* dst, int dstStride, Array8<short>[] filter,
    int x0Q4, int xStepQ4, int y0Q4, int yStepQ4, int w, int h, int bd)
{
    for (int row = 0; row < h; ++row)
    {
        for (int col = 0; col < w; ++col)
        {
            dst[col] = (ushort)BitUtils.RoundPowerOfTwo(dst[col] + src[col], 1);
        }

        src += srcStride;
        dst += dstStride;
    }
}
// Reference to the 8-bit sample at column x, row y of a block with the given stride.
private static unsafe ref byte Dst(byte* dst, int stride, int x, int y) => ref dst[y * stride + x];

// Reference to the 16-bit sample at column x, row y of a block with the given stride.
private static unsafe ref ushort Dst(ushort* dst, int stride, int x, int y) => ref dst[y * stride + x];

// Rounded (1, 2, 1) / 4 weighted average of three 8-bit samples.
private static byte Avg3(byte a, byte b, byte c) => (byte)((a + b + b + c + 2) >> 2);

// Rounded (1, 2, 1) / 4 weighted average of three 16-bit samples.
private static ushort Avg3(ushort a, ushort b, ushort c) => (ushort)((a + b + b + c + 2) >> 2);

// Rounded mean of two 8-bit samples.
private static byte Avg2(byte a, byte b) => (byte)((a + b + 1) >> 1);

// Rounded mean of two 16-bit samples.
private static ushort Avg2(ushort a, ushort b) => (ushort)((a + b + 1) >> 1);

// Size-specific entry points for the D207 predictor.
public static unsafe void D207Predictor8x8(byte* dst, int stride, byte* above, byte* left) => D207Predictor(dst, stride, 8, above, left);

public static unsafe void D207Predictor16x16(byte* dst, int stride, byte* above, byte* left) => D207Predictor(dst, stride, 16, above, left);

public static unsafe void D207Predictor32x32(byte* dst, int stride, byte* above, byte* left) => D207Predictor(dst, stride, 32, above, left);

// Fills a bs x bs block from the left neighbour column only; above is not read.
// Columns 0 and 1 are 2-tap and 3-tap averages down the left edge, the bottom row is
// left[bs - 1], and every other sample repeats the one a row below and two columns left.
private static unsafe void D207Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    int lastRow = (bs - 1) * stride;

    // Column 0: 2-tap averages of vertically adjacent left samples.
    for (int row = 0; row < bs - 1; ++row)
    {
        dst[row * stride] = Avg2(left[row], left[row + 1]);
    }

    dst[lastRow] = left[bs - 1];

    // Column 1: 3-tap averages; the last two rows run out of left samples.
    byte* col1 = dst + 1;
    for (int row = 0; row < bs - 2; ++row)
    {
        col1[row * stride] = Avg3(left[row], left[row + 1], left[row + 2]);
    }

    col1[(bs - 2) * stride] = Avg3(left[bs - 2], left[bs - 1], left[bs - 1]);
    col1[lastRow] = left[bs - 1];

    // Columns 2 .. bs - 1: the bottom row first, then the rows above it from the bottom
    // up, because each row reads the row beneath it.
    byte* rest = dst + 2;
    for (int col = 0; col < bs - 2; ++col)
    {
        rest[lastRow + col] = left[bs - 1];
    }

    for (int row = bs - 2; row >= 0; --row)
    {
        byte* current = rest + row * stride;
        for (int col = 0; col < bs - 2; ++col)
        {
            current[col] = current[stride + col - 2];
        }
    }
}
// Size-specific entry points for the D63 predictor.
public static unsafe void D63Predictor8x8(byte* dst, int stride, byte* above, byte* left) => D63Predictor(dst, stride, 8, above, left);

public static unsafe void D63Predictor16x16(byte* dst, int stride, byte* above, byte* left) => D63Predictor(dst, stride, 16, above, left);

public static unsafe void D63Predictor32x32(byte* dst, int stride, byte* above, byte* left) => D63Predictor(dst, stride, 32, above, left);

// Fills a bs x bs block from the above neighbour row only; left is not read.
// Row 0 holds 2-tap averages and row 1 holds 3-tap averages of above (reading up to
// above[bs + 1]). Every later pair of rows is rows 0 and 1 shifted left by one more
// sample, with the vacated tail filled with above[bs - 1].
private static unsafe void D63Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    for (int col = 0; col < bs; ++col)
    {
        dst[col] = Avg2(above[col], above[col + 1]);
        dst[stride + col] = Avg3(above[col], above[col + 1], above[col + 2]);
    }

    // shift is how far the row pair (2 * shift, 2 * shift + 1) is displaced from rows 0 and 1.
    for (int shift = 1; 2 * shift < bs; ++shift)
    {
        int kept = bs - 1 - shift;
        byte* evenRow = dst + 2 * shift * stride;
        byte* oddRow = evenRow + stride;

        MemoryUtil.Copy(evenRow, dst + shift, kept);
        MemoryUtil.Fill(evenRow + kept, above[bs - 1], bs - kept);
        MemoryUtil.Copy(oddRow, dst + stride + shift, kept);
        MemoryUtil.Fill(oddRow + kept, above[bs - 1], bs - kept);
    }
}

// Size-specific entry points for the D45 predictor.
public static unsafe void D45Predictor8x8(byte* dst, int stride, byte* above, byte* left) => D45Predictor(dst, stride, 8, above, left);

public static unsafe void D45Predictor16x16(byte* dst, int stride, byte* above, byte* left) => D45Predictor(dst, stride, 16, above, left);

public static unsafe void D45Predictor32x32(byte* dst, int stride, byte* above, byte* left) => D45Predictor(dst, stride, 32, above, left);

// Fills a bs x bs block from the above neighbour row only; left is not read.
// Row 0 holds 3-tap averages of above and ends with above[bs - 1]. Row n is row 0
// shifted left by n samples, with the vacated tail filled with above[bs - 1].
private static unsafe void D45Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    byte aboveRight = above[bs - 1];

    for (int col = 0; col < bs - 1; ++col)
    {
        dst[col] = Avg3(above[col], above[col + 1], above[col + 2]);
    }

    dst[bs - 1] = aboveRight;

    for (int row = 1; row < bs; ++row)
    {
        // Number of samples carried over from row 0; reaches zero on the last row.
        int kept = bs - 1 - row;
        byte* current = dst + row * stride;

        MemoryUtil.Copy(current, dst + row, kept);
        MemoryUtil.Fill(current + kept, aboveRight, row + 1);
    }
}
// Size-specific entry points for the D117 predictor.
public static unsafe void D117Predictor8x8(byte* dst, int stride, byte* above, byte* left) => D117Predictor(dst, stride, 8, above, left);

public static unsafe void D117Predictor16x16(byte* dst, int stride, byte* above, byte* left) => D117Predictor(dst, stride, 16, above, left);

public static unsafe void D117Predictor32x32(byte* dst, int stride, byte* above, byte* left) => D117Predictor(dst, stride, 32, above, left);

// Fills a bs x bs block from both neighbours, including the top-left sample above[-1].
// Rows 0 and 1 are 2-tap and 3-tap averages along the top edge, column 0 below them is
// 3-tap averages down the left edge, and every other sample repeats the one two rows up
// and one column to the left.
private static unsafe void D117Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    // Row 0.
    for (int col = 0; col < bs; ++col)
    {
        dst[col] = Avg2(above[col - 1], above[col]);
    }

    // Row 1.
    byte* row1 = dst + stride;
    row1[0] = Avg3(left[0], above[-1], above[0]);
    for (int col = 1; col < bs; ++col)
    {
        row1[col] = Avg3(above[col - 2], above[col - 1], above[col]);
    }

    // Column 0 of rows 2 .. bs - 1.
    byte* row2 = row1 + stride;
    row2[0] = Avg3(above[-1], left[0], left[1]);
    for (int row = 3; row < bs; ++row)
    {
        row2[(row - 2) * stride] = Avg3(left[row - 3], left[row - 2], left[row - 1]);
    }

    // Remaining samples of rows 2 .. bs - 1.
    byte* current = row2;
    for (int row = 2; row < bs; ++row, current += stride)
    {
        for (int col = 1; col < bs; ++col)
        {
            current[col] = current[-2 * stride + col - 1];
        }
    }
}

// Size-specific entry points for the D135 predictor.
public static unsafe void D135Predictor8x8(byte* dst, int stride, byte* above, byte* left) => D135Predictor(dst, stride, 8, above, left);

public static unsafe void D135Predictor16x16(byte* dst, int stride, byte* above, byte* left) => D135Predictor(dst, stride, 16, above, left);

public static unsafe void D135Predictor32x32(byte* dst, int stride, byte* above, byte* left) => D135Predictor(dst, stride, 32, above, left);

// Fills a bs x bs block from both neighbours, including the top-left sample above[-1].
// A filtered border running from the bottom-left to the top-right is built once; row i
// of the block is then the bs-wide slice of that border starting at index bs - 1 - i.
private static unsafe void D135Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    // 2 * bs - 1 entries are used; 63 is enough for bs = 32.
    byte* border = stackalloc byte[32 + 32 - 1];

    // Left edge, walked from the bottom upwards.
    for (int i = 0; i < bs - 2; ++i)
    {
        border[i] = Avg3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
    }

    // The three entries whose taps involve the top-left sample.
    border[bs - 2] = Avg3(above[-1], left[0], left[1]);
    border[bs - 1] = Avg3(left[0], above[-1], above[0]);
    border[bs] = Avg3(above[-1], above[0], above[1]);

    // Remaining top edge, left to right.
    for (int i = 0; i < bs - 2; ++i)
    {
        border[bs + 1 + i] = Avg3(above[i], above[i + 1], above[i + 2]);
    }

    byte* slice = border + bs - 1;
    for (int row = 0; row < bs; ++row, --slice)
    {
        MemoryUtil.Copy(dst + row * stride, slice, bs);
    }
}

// Size-specific entry points for the D153 predictor.
public static unsafe void D153Predictor8x8(byte* dst, int stride, byte* above, byte* left) => D153Predictor(dst, stride, 8, above, left);

public static unsafe void D153Predictor16x16(byte* dst, int stride, byte* above, byte* left) => D153Predictor(dst, stride, 16, above, left);

public static unsafe void D153Predictor32x32(byte* dst, int stride, byte* above, byte* left) => D153Predictor(dst, stride, 32, above, left);

// Fills a bs x bs block from both neighbours, including the top-left sample above[-1].
// Columns 0 and 1 are 2-tap and 3-tap averages down the left edge, the rest of row 0 is
// 3-tap averages along the top edge, and every other sample repeats the one a row up and
// two columns to the left.
private static unsafe void D153Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    // Column 0.
    dst[0] = Avg2(above[-1], left[0]);
    for (int row = 1; row < bs; ++row)
    {
        dst[row * stride] = Avg2(left[row - 1], left[row]);
    }

    // Column 1.
    byte* col1 = dst + 1;
    col1[0] = Avg3(left[0], above[-1], above[0]);
    col1[stride] = Avg3(above[-1], left[0], left[1]);
    for (int row = 2; row < bs; ++row)
    {
        col1[row * stride] = Avg3(left[row - 2], left[row - 1], left[row]);
    }

    // Row 0, columns 2 .. bs - 1.
    byte* rest = dst + 2;
    for (int col = 0; col < bs - 2; ++col)
    {
        rest[col] = Avg3(above[col - 1], above[col], above[col + 1]);
    }

    // Rows 1 .. bs - 1, columns 2 .. bs - 1, top to bottom because each row reads the previous one.
    byte* current = rest + stride;
    for (int row = 1; row < bs; ++row, current += stride)
    {
        for (int col = 0; col < bs - 2; ++col)
        {
            current[col] = current[-stride + col - 2];
        }
    }
}

// 4x4 entry point for the vertical predictor.
public static unsafe void VPredictor4x4(byte* dst, int stride, byte* above, byte* left) => VPredictor(dst, stride, 4, above, left);
// Size-specific entry points for the vertical predictor.
public static unsafe void VPredictor8x8(byte* dst, int stride, byte* above, byte* left) => VPredictor(dst, stride, 8, above, left);

public static unsafe void VPredictor16x16(byte* dst, int stride, byte* above, byte* left) => VPredictor(dst, stride, 16, above, left);

public static unsafe void VPredictor32x32(byte* dst, int stride, byte* above, byte* left) => VPredictor(dst, stride, 32, above, left);

// Vertical prediction: every row of the bs x bs block is a copy of the first bs samples
// of above. left is not read.
private static unsafe void VPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    byte* row = dst;
    for (int remaining = bs; remaining > 0; --remaining, row += stride)
    {
        MemoryUtil.Copy(row, above, bs);
    }
}

// Size-specific entry points for the horizontal predictor.
public static unsafe void HPredictor4x4(byte* dst, int stride, byte* above, byte* left) => HPredictor(dst, stride, 4, above, left);

public static unsafe void HPredictor8x8(byte* dst, int stride, byte* above, byte* left) => HPredictor(dst, stride, 8, above, left);

public static unsafe void HPredictor16x16(byte* dst, int stride, byte* above, byte* left) => HPredictor(dst, stride, 16, above, left);

public static unsafe void HPredictor32x32(byte* dst, int stride, byte* above, byte* left) => HPredictor(dst, stride, 32, above, left);

// Horizontal prediction: row r of the bs x bs block is filled with left[r].
// above is not read.
private static unsafe void HPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    byte* row = dst;
    for (int index = 0; index < bs; ++index, row += stride)
    {
        MemoryUtil.Fill(row, left[index], bs);
    }
}

// Size-specific entry points for the TM predictor.
public static unsafe void TMPredictor4x4(byte* dst, int stride, byte* above, byte* left) => TMPredictor(dst, stride, 4, above, left);

public static unsafe void TMPredictor8x8(byte* dst, int stride, byte* above, byte* left) => TMPredictor(dst, stride, 8, above, left);

public static unsafe void TMPredictor16x16(byte* dst, int stride, byte* above, byte* left) => TMPredictor(dst, stride, 16, above, left);

public static unsafe void TMPredictor32x32(byte* dst, int stride, byte* above, byte* left) => TMPredictor(dst, stride, 32, above, left);
// TM prediction: each sample is left[row] + above[col] - above[-1], passed through
// BitUtils.ClipPixel.
private static unsafe void TMPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    int topLeft = above[-1];
    byte* current = dst;

    for (int row = 0; row < bs; ++row, current += stride)
    {
        for (int col = 0; col < bs; ++col)
        {
            current[col] = BitUtils.ClipPixel(left[row] + above[col] - topLeft);
        }
    }
}

// Size-specific entry points for the constant-128 predictor.
public static unsafe void Dc128Predictor4x4(byte* dst, int stride, byte* above, byte* left) => Dc128Predictor(dst, stride, 4, above, left);

public static unsafe void Dc128Predictor8x8(byte* dst, int stride, byte* above, byte* left) => Dc128Predictor(dst, stride, 8, above, left);

public static unsafe void Dc128Predictor16x16(byte* dst, int stride, byte* above, byte* left) => Dc128Predictor(dst, stride, 16, above, left);

public static unsafe void Dc128Predictor32x32(byte* dst, int stride, byte* above, byte* left) => Dc128Predictor(dst, stride, 32, above, left);

// Fills the bs x bs block with the constant 128. Neither neighbour is read.
private static unsafe void Dc128Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    byte* row = dst;
    for (int remaining = bs; remaining > 0; --remaining, row += stride)
    {
        MemoryUtil.Fill(row, (byte)128, bs);
    }
}

// Size-specific entry points for the left-only DC predictor.
public static unsafe void DcLeftPredictor4x4(byte* dst, int stride, byte* above, byte* left) => DcLeftPredictor(dst, stride, 4, above, left);

public static unsafe void DcLeftPredictor8x8(byte* dst, int stride, byte* above, byte* left) => DcLeftPredictor(dst, stride, 8, above, left);

public static unsafe void DcLeftPredictor16x16(byte* dst, int stride, byte* above, byte* left) => DcLeftPredictor(dst, stride, 16, above, left);

public static unsafe void DcLeftPredictor32x32(byte* dst, int stride, byte* above, byte* left) => DcLeftPredictor(dst, stride, 32, above, left);

// Fills the bs x bs block with the rounded mean of the first bs samples of left.
// above is not read.
private static unsafe void DcLeftPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    int total = 0;
    for (int i = 0; i < bs; ++i)
    {
        total += left[i];
    }

    // Adding half the divisor rounds to nearest.
    byte dc = (byte)((total + (bs >> 1)) / bs);

    byte* row = dst;
    for (int remaining = bs; remaining > 0; --remaining, row += stride)
    {
        MemoryUtil.Fill(row, dc, bs);
    }
}

// Size-specific entry points for the above-only DC predictor.
public static unsafe void DcTopPredictor4x4(byte* dst, int stride, byte* above, byte* left) => DcTopPredictor(dst, stride, 4, above, left);

public static unsafe void DcTopPredictor8x8(byte* dst, int stride, byte* above, byte* left) => DcTopPredictor(dst, stride, 8, above, left);

public static unsafe void DcTopPredictor16x16(byte* dst, int stride, byte* above, byte* left) => DcTopPredictor(dst, stride, 16, above, left);

public static unsafe void DcTopPredictor32x32(byte* dst, int stride, byte* above, byte* left) => DcTopPredictor(dst, stride, 32, above, left);

// Fills the bs x bs block with the rounded mean of the first bs samples of above.
// left is not read.
private static unsafe void DcTopPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    int total = 0;
    for (int i = 0; i < bs; ++i)
    {
        total += above[i];
    }

    // Adding half the divisor rounds to nearest.
    byte dc = (byte)((total + (bs >> 1)) / bs);

    byte* row = dst;
    for (int remaining = bs; remaining > 0; --remaining, row += stride)
    {
        MemoryUtil.Fill(row, dc, bs);
    }
}

// Size-specific entry points for the DC predictor.
public static unsafe void DcPredictor4x4(byte* dst, int stride, byte* above, byte* left) => DcPredictor(dst, stride, 4, above, left);

public static unsafe void DcPredictor8x8(byte* dst, int stride, byte* above, byte* left) => DcPredictor(dst, stride, 8, above, left);

public static unsafe void DcPredictor16x16(byte* dst, int stride, byte* above, byte* left) => DcPredictor(dst, stride, 16, above, left);

public static unsafe void DcPredictor32x32(byte* dst, int stride, byte* above, byte* left) => DcPredictor(dst, stride, 32, above, left);

// Fills the bs x bs block with the rounded mean of the first bs samples of above and the
// first bs samples of left taken together.
private static unsafe void DcPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
{
    int sampleCount = 2 * bs;
    int total = 0;

    for (int i = 0; i < bs; ++i)
    {
        total += above[i];
        total += left[i];
    }

    // Adding half the divisor rounds to nearest.
    byte dc = (byte)((total + (sampleCount >> 1)) / sampleCount);

    byte* row = dst;
    for (int remaining = bs; remaining > 0; --remaining, row += stride)
    {
        MemoryUtil.Fill(row, dc, bs);
    }
}
// 4x4 horizontal prediction with a 3-tap smoothing down the left edge: row r is filled
// with the average of the left samples around r (the top-left sample above[-1] stands in
// above row 0, and left[3] is repeated below row 3).
public static unsafe void HePredictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte topLeft = above[-1];
    byte l0 = left[0];
    byte l1 = left[1];
    byte l2 = left[2];
    byte l3 = left[3];

    MemoryUtil.Fill(dst + stride * 0, Avg3(topLeft, l0, l1), 4);
    MemoryUtil.Fill(dst + stride * 1, Avg3(l0, l1, l2), 4);
    MemoryUtil.Fill(dst + stride * 2, Avg3(l1, l2, l3), 4);
    MemoryUtil.Fill(dst + stride * 3, Avg3(l2, l3, l3), 4);
}

// 4x4 vertical prediction with a 3-tap smoothing along the top edge: row 0 is built from
// above[-1] .. above[4] and then copied to rows 1 to 3. left is not read.
public static unsafe void VePredictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte topLeft = above[-1];
    byte t0 = above[0];
    byte t1 = above[1];
    byte t2 = above[2];
    byte t3 = above[3];
    byte t4 = above[4];

    dst[0] = Avg3(topLeft, t0, t1);
    dst[1] = Avg3(t0, t1, t2);
    dst[2] = Avg3(t1, t2, t3);
    dst[3] = Avg3(t2, t3, t4);

    for (int row = 1; row < 4; ++row)
    {
        MemoryUtil.Copy(dst + stride * row, dst, 4);
    }
}

// 4x4 D207 prediction, written out row by row. Only left[0..3] is read.
public static unsafe void D207Predictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte l0 = left[0];
    byte l1 = left[1];
    byte l2 = left[2];
    byte l3 = left[3];

    byte* row0 = dst;
    byte* row1 = row0 + stride;
    byte* row2 = row1 + stride;
    byte* row3 = row2 + stride;

    row0[0] = Avg2(l0, l1);
    row0[1] = Avg3(l0, l1, l2);
    row0[2] = Avg2(l1, l2);
    row0[3] = Avg3(l1, l2, l3);

    row1[0] = Avg2(l1, l2);
    row1[1] = Avg3(l1, l2, l3);
    row1[2] = Avg2(l2, l3);
    row1[3] = Avg3(l2, l3, l3);

    row2[0] = Avg2(l2, l3);
    row2[1] = Avg3(l2, l3, l3);
    row2[2] = l3;
    row2[3] = l3;

    row3[0] = l3;
    row3[1] = l3;
    row3[2] = l3;
    row3[3] = l3;
}

// 4x4 D63 prediction, written out row by row. Reads above[0..6]; left is not read.
// Even rows hold 2-tap averages and odd rows 3-tap averages; rows 2 and 3 are rows 0 and
// 1 advanced by one sample.
public static unsafe void D63Predictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte a = above[0];
    byte b = above[1];
    byte c = above[2];
    byte d = above[3];
    byte e = above[4];
    byte f = above[5];
    byte g = above[6];

    byte* row0 = dst;
    byte* row1 = row0 + stride;
    byte* row2 = row1 + stride;
    byte* row3 = row2 + stride;

    row0[0] = Avg2(a, b);
    row0[1] = Avg2(b, c);
    row0[2] = Avg2(c, d);
    row0[3] = Avg2(d, e);

    row1[0] = Avg3(a, b, c);
    row1[1] = Avg3(b, c, d);
    row1[2] = Avg3(c, d, e);
    row1[3] = Avg3(d, e, f);

    row2[0] = Avg2(b, c);
    row2[1] = Avg2(c, d);
    row2[2] = Avg2(d, e);
    row2[3] = Avg2(e, f); // Differs from vp8

    row3[0] = Avg3(b, c, d);
    row3[1] = Avg3(c, d, e);
    row3[2] = Avg3(d, e, f);
    row3[3] = Avg3(e, f, g); // Differs from vp8
}

// Variant of the 4x4 D63 prediction that reads above[0..7]; it differs from
// D63Predictor4x4 only in the last sample of rows 2 and 3. left is not read.
public static unsafe void D63ePredictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte a = above[0];
    byte b = above[1];
    byte c = above[2];
    byte d = above[3];
    byte e = above[4];
    byte f = above[5];
    byte g = above[6];
    byte h = above[7];

    byte* row0 = dst;
    byte* row1 = row0 + stride;
    byte* row2 = row1 + stride;
    byte* row3 = row2 + stride;

    row0[0] = Avg2(a, b);
    row0[1] = Avg2(b, c);
    row0[2] = Avg2(c, d);
    row0[3] = Avg2(d, e);

    row1[0] = Avg3(a, b, c);
    row1[1] = Avg3(b, c, d);
    row1[2] = Avg3(c, d, e);
    row1[3] = Avg3(d, e, f);

    row2[0] = Avg2(b, c);
    row2[1] = Avg2(c, d);
    row2[2] = Avg2(d, e);
    row2[3] = Avg3(e, f, g);

    row3[0] = Avg3(b, c, d);
    row3[1] = Avg3(c, d, e);
    row3[2] = Avg3(d, e, f);
    row3[3] = Avg3(f, g, h);
}

// 4x4 D45 prediction. Reads above[0..7]; left is not read.
// Every sample depends only on column + row, so the seven anti-diagonal values are
// computed once and then scattered.
public static unsafe void D45Predictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte a = above[0];
    byte b = above[1];
    byte c = above[2];
    byte d = above[3];
    byte e = above[4];
    byte f = above[5];
    byte g = above[6];
    byte h = above[7];

    byte* diagonal = stackalloc byte[7];
    diagonal[0] = Avg3(a, b, c);
    diagonal[1] = Avg3(b, c, d);
    diagonal[2] = Avg3(c, d, e);
    diagonal[3] = Avg3(d, e, f);
    diagonal[4] = Avg3(e, f, g);
    diagonal[5] = Avg3(f, g, h);
    diagonal[6] = h; // differs from vp8

    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
        {
            dst[row * stride + col] = diagonal[col + row];
        }
    }
}
// Variant of the 4x4 D45 prediction whose bottom-right sample is Avg3(g, h, h) instead
// of h. Reads above[0..7]; left is not read.
public static unsafe void D45ePredictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte a = above[0];
    byte b = above[1];
    byte c = above[2];
    byte d = above[3];
    byte e = above[4];
    byte f = above[5];
    byte g = above[6];
    byte h = above[7];

    // Every sample depends only on column + row.
    byte* diagonal = stackalloc byte[7];
    diagonal[0] = Avg3(a, b, c);
    diagonal[1] = Avg3(b, c, d);
    diagonal[2] = Avg3(c, d, e);
    diagonal[3] = Avg3(d, e, f);
    diagonal[4] = Avg3(e, f, g);
    diagonal[5] = Avg3(f, g, h);
    diagonal[6] = Avg3(g, h, h);

    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
        {
            dst[row * stride + col] = diagonal[col + row];
        }
    }
}

// 4x4 D117 prediction, written out row by row.
// Reads left[0..2], the top-left sample above[-1] and above[0..3].
public static unsafe void D117Predictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte l0 = left[0];
    byte l1 = left[1];
    byte l2 = left[2];
    byte x = above[-1];
    byte a = above[0];
    byte b = above[1];
    byte c = above[2];
    byte d = above[3];

    byte* row0 = dst;
    byte* row1 = row0 + stride;
    byte* row2 = row1 + stride;
    byte* row3 = row2 + stride;

    row0[0] = Avg2(x, a);
    row0[1] = Avg2(a, b);
    row0[2] = Avg2(b, c);
    row0[3] = Avg2(c, d);

    row1[0] = Avg3(l0, x, a);
    row1[1] = Avg3(x, a, b);
    row1[2] = Avg3(a, b, c);
    row1[3] = Avg3(b, c, d);

    row2[0] = Avg3(l1, l0, x);
    row2[1] = Avg2(x, a);
    row2[2] = Avg2(a, b);
    row2[3] = Avg2(b, c);

    row3[0] = Avg3(l2, l1, l0);
    row3[1] = Avg3(l0, x, a);
    row3[2] = Avg3(x, a, b);
    row3[3] = Avg3(a, b, c);
}

// 4x4 D135 prediction. Reads left[0..3], the top-left sample above[-1] and above[0..3].
// Every sample depends only on column - row, so the seven diagonal values are computed
// once (index = column - row + 3) and then scattered.
public static unsafe void D135Predictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte l0 = left[0];
    byte l1 = left[1];
    byte l2 = left[2];
    byte l3 = left[3];
    byte x = above[-1];
    byte a = above[0];
    byte b = above[1];
    byte c = above[2];
    byte d = above[3];

    byte* diagonal = stackalloc byte[7];
    diagonal[0] = Avg3(l1, l2, l3);
    diagonal[1] = Avg3(l0, l1, l2);
    diagonal[2] = Avg3(x, l0, l1);
    diagonal[3] = Avg3(a, x, l0);
    diagonal[4] = Avg3(b, a, x);
    diagonal[5] = Avg3(c, b, a);
    diagonal[6] = Avg3(d, c, b);

    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 4; ++col)
        {
            dst[row * stride + col] = diagonal[col - row + 3];
        }
    }
}

// 4x4 D153 prediction, written out row by row.
// Reads left[0..3], the top-left sample above[-1] and above[0..2].
public static unsafe void D153Predictor4x4(byte* dst, int stride, byte* above, byte* left)
{
    byte l0 = left[0];
    byte l1 = left[1];
    byte l2 = left[2];
    byte l3 = left[3];
    byte x = above[-1];
    byte a = above[0];
    byte b = above[1];
    byte c = above[2];

    byte* row0 = dst;
    byte* row1 = row0 + stride;
    byte* row2 = row1 + stride;
    byte* row3 = row2 + stride;

    row0[0] = Avg2(l0, x);
    row0[1] = Avg3(l0, x, a);
    row0[2] = Avg3(x, a, b);
    row0[3] = Avg3(a, b, c);

    row1[0] = Avg2(l1, l0);
    row1[1] = Avg3(l1, l0, x);
    row1[2] = Avg2(l0, x);
    row1[3] = Avg3(l0, x, a);

    row2[0] = Avg2(l2, l1);
    row2[1] = Avg3(l2, l1, l0);
    row2[2] = Avg2(l1, l0);
    row2[3] = Avg3(l1, l0, x);

    row3[0] = Avg2(l3, l2);
    row3[1] = Avg3(l3, l2, l1);
    row3[2] = Avg2(l2, l1);
    row3[3] = Avg3(l2, l1, l0);
}

// Size-specific entry points for the 16-bit D207 predictor.
public static unsafe void HighbdD207Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) => HighbdD207Predictor(dst, stride, 8, above, left, bd);

public static unsafe void HighbdD207Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) => HighbdD207Predictor(dst, stride, 16, above, left, bd);

public static unsafe void HighbdD207Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) => HighbdD207Predictor(dst, stride, 32, above, left, bd);

// 16-bit D207 prediction: fills a bs x bs block from the left neighbour column only.
// above and bd are not read.
// Columns 0 and 1 are 2-tap and 3-tap averages down the left edge, the bottom row is
// left[bs - 1], and every other sample repeats the one a row below and two columns left.
private static unsafe void HighbdD207Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
{
    int lastRow = (bs - 1) * stride;

    // Column 0.
    for (int row = 0; row < bs - 1; ++row)
    {
        dst[row * stride] = Avg2(left[row], left[row + 1]);
    }

    dst[lastRow] = left[bs - 1];

    // Column 1.
    ushort* col1 = dst + 1;
    for (int row = 0; row < bs - 2; ++row)
    {
        col1[row * stride] = Avg3(left[row], left[row + 1], left[row + 2]);
    }

    col1[(bs - 2) * stride] = Avg3(left[bs - 2], left[bs - 1], left[bs - 1]);
    col1[lastRow] = left[bs - 1];

    // Columns 2 .. bs - 1: the bottom row first, then the rows above it from the bottom
    // up, because each row reads the row beneath it.
    ushort* rest = dst + 2;
    for (int col = 0; col < bs - 2; ++col)
    {
        rest[lastRow + col] = left[bs - 1];
    }

    for (int row = bs - 2; row >= 0; --row)
    {
        ushort* current = rest + row * stride;
        for (int col = 0; col < bs - 2; ++col)
        {
            current[col] = current[stride + col - 2];
        }
    }
}
bd) + { + int r, c; + + // First column. + for (r = 0; r < bs - 1; ++r) + { + dst[r * stride] = Avg2(left[r], left[r + 1]); + } + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Second column. + for (r = 0; r < bs - 2; ++r) + { + dst[r * stride] = Avg3(left[r], left[r + 1], left[r + 2]); + } + dst[(bs - 2) * stride] = Avg3(left[bs - 2], left[bs - 1], left[bs - 1]); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Rest of last row. + for (c = 0; c < bs - 2; ++c) + { + dst[(bs - 1) * stride + c] = left[bs - 1]; + } + + for (r = bs - 2; r >= 0; --r) + { + for (c = 0; c < bs - 2; ++c) + { + dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; + } + } + } + + public static unsafe void HighbdD63Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD63Predictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdD63Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD63Predictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdD63Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD63Predictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdD63Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int r, c; + int size; + for (c = 0; c < bs; ++c) + { + dst[c] = Avg2(above[c], above[c + 1]); + dst[stride + c] = Avg3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) + { + MemoryUtil.Copy(dst + (r + 0) * stride, dst + (r >> 1), size); + MemoryUtil.Fill(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + MemoryUtil.Copy(dst + (r + 1) * stride, dst + stride + (r >> 1), size); + MemoryUtil.Fill(dst + (r + 1) * stride + size, above[bs - 1], bs - size); + } + } + + public static unsafe void HighbdD45Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD45Predictor(dst, 
stride, 8, above, left, bd); + } + + public static unsafe void HighbdD45Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD45Predictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdD45Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD45Predictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdD45Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + ushort aboveRight = above[bs - 1]; + ushort* dstRow0 = dst; + int x, size; + + for (x = 0; x < bs - 1; ++x) + { + dst[x] = Avg3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = aboveRight; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) + { + MemoryUtil.Copy(dst, dstRow0 + x, size); + MemoryUtil.Fill(dst + size, aboveRight, x + 1); + dst += stride; + } + } + + public static unsafe void HighbdD117Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD117Predictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdD117Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD117Predictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdD117Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD117Predictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdD117Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int r, c; + + // First row + for (c = 0; c < bs; c++) + { + dst[c] = Avg2(above[c - 1], above[c]); + } + + dst += stride; + + // Second row + dst[0] = Avg3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) + { + dst[c] = Avg3(above[c - 2], above[c - 1], above[c]); + } + + dst += stride; + + // The rest of first col + dst[0] = Avg3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + { + dst[(r - 2) * 
stride] = Avg3(left[r - 3], left[r - 2], left[r - 1]); + } + + // The rest of the block + for (r = 2; r < bs; ++r) + { + for (c = 1; c < bs; c++) + { + dst[c] = dst[-2 * stride + c - 1]; + } + + dst += stride; + } + } + + public static unsafe void HighbdD135Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD135Predictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdD135Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD135Predictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdD135Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD135Predictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdD135Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int i; + ushort* border = stackalloc ushort[32 + 32 - 1]; // Outer border from bottom-left to top-right + + // Dst(dst, stride, bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) + { + border[i] = Avg3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = Avg3(above[-1], left[0], left[1]); + border[bs - 1] = Avg3(left[0], above[-1], above[0]); + border[bs - 0] = Avg3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) + { + border[bs + 1 + i] = Avg3(above[i], above[i + 1], above[i + 2]); + } + + for (i = 0; i < bs; ++i) + { + MemoryUtil.Copy(dst + i * stride, border + bs - 1 - i, bs); + } + } + + public static unsafe void HighbdD153Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD153Predictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdD153Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD153Predictor(dst, stride, 16, above, left, bd); + } + + public static 
unsafe void HighbdD153Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdD153Predictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdD153Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int r, c; + dst[0] = Avg2(above[-1], left[0]); + for (r = 1; r < bs; r++) + { + dst[r * stride] = Avg2(left[r - 1], left[r]); + } + + dst++; + + dst[0] = Avg3(left[0], above[-1], above[0]); + dst[stride] = Avg3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + { + dst[r * stride] = Avg3(left[r - 2], left[r - 1], left[r]); + } + + dst++; + + for (c = 0; c < bs - 2; c++) + { + dst[c] = Avg3(above[c - 1], above[c], above[c + 1]); + } + + dst += stride; + + for (r = 1; r < bs; ++r) + { + for (c = 0; c < bs - 2; c++) + { + dst[c] = dst[-stride + c - 2]; + } + + dst += stride; + } + } + + public static unsafe void HighbdVPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdVPredictor(dst, stride, 4, above, left, bd); + } + + public static unsafe void HighbdVPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdVPredictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdVPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdVPredictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdVPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdVPredictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdVPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int r; + for (r = 0; r < bs; r++) + { + MemoryUtil.Copy(dst, above, bs); + dst += stride; + } + } + + public static unsafe void HighbdHPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdHPredictor(dst, stride, 4, above, left, bd); + } + + public static 
unsafe void HighbdHPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdHPredictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdHPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdHPredictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdHPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdHPredictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdHPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int r; + for (r = 0; r < bs; r++) + { + MemoryUtil.Fill(dst, left[r], bs); + dst += stride; + } + } + + public static unsafe void HighbdTMPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdTMPredictor(dst, stride, 4, above, left, bd); + } + + public static unsafe void HighbdTMPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdTMPredictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdTMPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdTMPredictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdTMPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdTMPredictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdTMPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int r, c; + int yTopLeft = above[-1]; + + for (r = 0; r < bs; r++) + { + for (c = 0; c < bs; c++) + { + dst[c] = BitUtils.ClipPixelHighbd(left[r] + above[c] - yTopLeft, bd); + } + + dst += stride; + } + } + + public static unsafe void HighbdDc128Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDc128Predictor(dst, stride, 4, above, left, bd); + } + + public static unsafe void 
HighbdDc128Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDc128Predictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdDc128Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDc128Predictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdDc128Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDc128Predictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdDc128Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int r; + + for (r = 0; r < bs; r++) + { + MemoryUtil.Fill(dst, (ushort)(128 << (bd - 8)), bs); + dst += stride; + } + } + + public static unsafe void HighbdDcLeftPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcLeftPredictor(dst, stride, 4, above, left, bd); + } + + public static unsafe void HighbdDcLeftPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcLeftPredictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdDcLeftPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcLeftPredictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdDcLeftPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcLeftPredictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdDcLeftPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int i, r, expectedDc, sum = 0; + + for (i = 0; i < bs; i++) + { + sum += left[i]; + } + + expectedDc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) + { + MemoryUtil.Fill(dst, (ushort)expectedDc, bs); + dst += stride; + } + } + + public static unsafe void HighbdDcTopPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + 
HighbdDcTopPredictor(dst, stride, 4, above, left, bd); + } + + public static unsafe void HighbdDcTopPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcTopPredictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdDcTopPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcTopPredictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdDcTopPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcTopPredictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdDcTopPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int i, r, expectedDc, sum = 0; + + for (i = 0; i < bs; i++) + { + sum += above[i]; + } + + expectedDc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) + { + MemoryUtil.Fill(dst, (ushort)expectedDc, bs); + dst += stride; + } + } + + public static unsafe void HighbdDcPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcPredictor(dst, stride, 4, above, left, bd); + } + + public static unsafe void HighbdDcPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcPredictor(dst, stride, 8, above, left, bd); + } + + public static unsafe void HighbdDcPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcPredictor(dst, stride, 16, above, left, bd); + } + + public static unsafe void HighbdDcPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + HighbdDcPredictor(dst, stride, 32, above, left, bd); + } + + private static unsafe void HighbdDcPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + { + int i, r, expectedDc, sum = 0; + int count = 2 * bs; + + for (i = 0; i < bs; i++) + { + sum += above[i]; + sum += left[i]; + } + + expectedDc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; 
r++) + { + MemoryUtil.Fill(dst, (ushort)expectedDc, bs); + dst += stride; + } + } + + public static unsafe void HighbdD207Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + ushort I = left[0]; + ushort j = left[1]; + ushort k = left[2]; + ushort l = left[3]; + Dst(dst, stride, 0, 0) = Avg2(I, j); + Dst(dst, stride, 2, 0) = Dst(dst, stride, 0, 1) = Avg2(j, k); + Dst(dst, stride, 2, 1) = Dst(dst, stride, 0, 2) = Avg2(k, l); + Dst(dst, stride, 1, 0) = Avg3(I, j, k); + Dst(dst, stride, 3, 0) = Dst(dst, stride, 1, 1) = Avg3(j, k, l); + Dst(dst, stride, 3, 1) = Dst(dst, stride, 1, 2) = Avg3(k, l, l); + Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) = Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l; + } + + public static unsafe void HighbdD63Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + ushort a = above[0]; + ushort b = above[1]; + ushort c = above[2]; + ushort d = above[3]; + ushort e = above[4]; + ushort f = above[5]; + ushort g = above[6]; + Dst(dst, stride, 0, 0) = Avg2(a, b); + Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 2) = Avg2(b, c); + Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 2) = Avg2(c, d); + Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 2) = Avg2(d, e); + Dst(dst, stride, 3, 2) = Avg2(e, f); // Differs from vp8 + + Dst(dst, stride, 0, 1) = Avg3(a, b, c); + Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 3) = Avg3(b, c, d); + Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 3) = Avg3(c, d, e); + Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 3) = Avg3(d, e, f); + Dst(dst, stride, 3, 3) = Avg3(e, f, g); // Differs from vp8 + } + + public static unsafe void HighbdD45Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + ushort a = above[0]; + ushort b = above[1]; + ushort c = above[2]; + ushort d = above[3]; + ushort e = above[4]; + ushort f = above[5]; + ushort g = above[6]; + ushort h = above[7]; + Dst(dst, stride, 0, 0) = 
Avg3(a, b, c); + Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(b, c, d); + Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(c, d, e); + Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f); + Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(e, f, g); + Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(f, g, h); + Dst(dst, stride, 3, 3) = h; // Differs from vp8 + } + + public static unsafe void HighbdD117Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + ushort I = left[0]; + ushort j = left[1]; + ushort k = left[2]; + ushort x = above[-1]; + ushort a = above[0]; + ushort b = above[1]; + ushort c = above[2]; + ushort d = above[3]; + Dst(dst, stride, 0, 0) = Dst(dst, stride, 1, 2) = Avg2(x, a); + Dst(dst, stride, 1, 0) = Dst(dst, stride, 2, 2) = Avg2(a, b); + Dst(dst, stride, 2, 0) = Dst(dst, stride, 3, 2) = Avg2(b, c); + Dst(dst, stride, 3, 0) = Avg2(c, d); + + Dst(dst, stride, 0, 3) = Avg3(k, j, I); + Dst(dst, stride, 0, 2) = Avg3(j, I, x); + Dst(dst, stride, 0, 1) = Dst(dst, stride, 1, 3) = Avg3(I, x, a); + Dst(dst, stride, 1, 1) = Dst(dst, stride, 2, 3) = Avg3(x, a, b); + Dst(dst, stride, 2, 1) = Dst(dst, stride, 3, 3) = Avg3(a, b, c); + Dst(dst, stride, 3, 1) = Avg3(b, c, d); + } + + public static unsafe void HighbdD135Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + ushort I = left[0]; + ushort j = left[1]; + ushort k = left[2]; + ushort l = left[3]; + ushort x = above[-1]; + ushort a = above[0]; + ushort b = above[1]; + ushort c = above[2]; + ushort d = above[3]; + Dst(dst, stride, 0, 3) = Avg3(j, k, l); + Dst(dst, stride, 1, 3) = Dst(dst, stride, 0, 2) = Avg3(I, j, k); + Dst(dst, stride, 2, 3) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 1) = Avg3(x, I, j); + Dst(dst, stride, 3, 3) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = 
Avg3(a, x, I); + Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 0) = Avg3(b, a, x); + Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 0) = Avg3(c, b, a); + Dst(dst, stride, 3, 0) = Avg3(d, c, b); + } + + public static unsafe void HighbdD153Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + { + ushort I = left[0]; + ushort j = left[1]; + ushort k = left[2]; + ushort l = left[3]; + ushort x = above[-1]; + ushort a = above[0]; + ushort b = above[1]; + ushort c = above[2]; + + Dst(dst, stride, 0, 0) = Dst(dst, stride, 2, 1) = Avg2(I, x); + Dst(dst, stride, 0, 1) = Dst(dst, stride, 2, 2) = Avg2(j, I); + Dst(dst, stride, 0, 2) = Dst(dst, stride, 2, 3) = Avg2(k, j); + Dst(dst, stride, 0, 3) = Avg2(l, k); + + Dst(dst, stride, 3, 0) = Avg3(a, b, c); + Dst(dst, stride, 2, 0) = Avg3(x, a, b); + Dst(dst, stride, 1, 0) = Dst(dst, stride, 3, 1) = Avg3(I, x, a); + Dst(dst, stride, 1, 1) = Dst(dst, stride, 3, 2) = Avg3(j, I, x); + Dst(dst, stride, 1, 2) = Dst(dst, stride, 3, 3) = Avg3(k, j, I); + Dst(dst, stride, 1, 3) = Avg3(l, k, j); + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs new file mode 100644 index 00000000..3fc3c72a --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs @@ -0,0 +1,2917 @@ +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp +{ + internal static class InvTxfm + { + // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse + // transform amplify bits + 1 bit for contingency in rounding and quantizing + private const int HighbdValidTxfmMagnitudeRange = (1 << 25); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int DetectInvalidHighbdInput(ReadOnlySpan<int> input, int size) + { + int i; + for (i = 0; i < size; ++i) + { + 
if (Math.Abs(input[i]) >= HighbdValidTxfmMagnitudeRange) + { + return 1; + } + } + + return 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static long CheckRange(long input) + { + // For valid VP9 input streams, intermediate stage coefficients should always + // stay within the range of a signed 16 bit integer. Coefficients can go out + // of this range for invalid/corrupt VP9 streams. + Debug.Assert(short.MinValue <= input); + Debug.Assert(input <= short.MaxValue); + return input; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static long HighbdCheckRange(long input, int bd) + { + // For valid highbitdepth VP9 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer + int intMax = (1 << (7 + bd)) - 1; + int intMin = -intMax - 1; + Debug.Assert(intMin <= input); + Debug.Assert(input <= intMax); + + return input; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int WrapLow(long x) + { + return (short)CheckRange(x); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int HighbdWrapLow(long x, int bd) + { + return ((int)HighbdCheckRange(x, bd) << (24 - bd)) >> (24 - bd); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static byte ClipPixelAdd(byte dest, long trans) + { + trans = WrapLow(trans); + return BitUtils.ClipPixel(dest + (int)trans); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ushort HighbdClipPixelAdd(ushort dest, long trans, int bd) + { + trans = HighbdWrapLow(trans, bd); + return BitUtils.ClipPixelHighbd(dest + (int)trans, bd); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static long DctConstRoundShift(long input) + { + long rv = BitUtils.RoundPowerOfTwo(input, DctConstBits); + return rv; + } + + [SkipLocalsInit] + public static void 
Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + Span<int> output = stackalloc int[16]; + long a1, b1, c1, d1, e1; + ReadOnlySpan<int> ip = input; + Span<int> op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] >> UnitQuantShift; + c1 = ip[1] >> UnitQuantShift; + d1 = ip[2] >> UnitQuantShift; + b1 = ip[3] >> UnitQuantShift; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = WrapLow(a1); + op[1] = WrapLow(b1); + op[2] = WrapLow(c1); + op[3] = WrapLow(d1); + ip = ip.Slice(4); + op = op.Slice(4); + } + + Span<int> ip2 = output; + for (i = 0; i < 4; i++) + { + a1 = ip2[4 * 0]; + c1 = ip2[4 * 1]; + d1 = ip2[4 * 2]; + b1 = ip2[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = ClipPixelAdd(dest[stride * 0], WrapLow(a1)); + dest[stride * 1] = ClipPixelAdd(dest[stride * 1], WrapLow(b1)); + dest[stride * 2] = ClipPixelAdd(dest[stride * 2], WrapLow(c1)); + dest[stride * 3] = ClipPixelAdd(dest[stride * 3], WrapLow(d1)); + + ip2 = ip2.Slice(1); + dest = dest.Slice(1); + } + } + + [SkipLocalsInit] + public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i; + long a1, e1; + Span<int> tmp = stackalloc int[4]; + ReadOnlySpan<int> ip = input; + Span<int> op = tmp; + + a1 = ip[0] >> UnitQuantShift; + e1 = a1 >> 1; + a1 -= e1; + op[0] = WrapLow(a1); + op[1] = op[2] = op[3] = WrapLow(e1); + + Span<int> ip2 = tmp; + for (i = 0; i < 4; i++) + { + e1 = ip2[0] >> 1; + a1 = ip2[0] - e1; + dest[stride * 0] = ClipPixelAdd(dest[stride * 0], a1); + dest[stride * 1] = ClipPixelAdd(dest[stride * 1], e1); + dest[stride * 2] = ClipPixelAdd(dest[stride * 2], e1); + dest[stride * 3] = ClipPixelAdd(dest[stride * 3], e1); + ip2 = ip2.Slice(1); + dest = dest.Slice(1); + } + } + + public 
static void Iadst4(ReadOnlySpan<int> input, Span<int> output) + { + long s0, s1, s2, s3, s4, s5, s6, s7; + int x0 = input[0]; + int x1 = input[1]; + int x2 = input[2]; + int x3 = input[3]; + + if ((x0 | x1 | x2 | x3) == 0) + { + output.Slice(0, 4).Fill(0); + return; + } + + // 32-bit result is enough for the following multiplications. + s0 = SinPi1_9 * x0; + s1 = SinPi2_9 * x0; + s2 = SinPi3_9 * x1; + s3 = SinPi4_9 * x2; + s4 = SinPi1_9 * x2; + s5 = SinPi2_9 * x3; + s6 = SinPi4_9 * x3; + s7 = WrapLow(x0 - x2 + x3); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = SinPi3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WrapLow(DctConstRoundShift(s0 + s3)); + output[1] = WrapLow(DctConstRoundShift(s1 + s3)); + output[2] = WrapLow(DctConstRoundShift(s2)); + output[3] = WrapLow(DctConstRoundShift(s0 + s1 - s3)); + } + + [SkipLocalsInit] + public static void Idct4(ReadOnlySpan<int> input, Span<int> output) + { + Span<short> step = stackalloc short[4]; + long temp1, temp2; + + // stage 1 + temp1 = ((short)input[0] + (short)input[2]) * CosPi16_64; + temp2 = ((short)input[0] - (short)input[2]) * CosPi16_64; + step[0] = (short)WrapLow(DctConstRoundShift(temp1)); + step[1] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = (short)input[1] * CosPi24_64 - (short)input[3] * CosPi8_64; + temp2 = (short)input[1] * CosPi8_64 + (short)input[3] * CosPi24_64; + step[2] = (short)WrapLow(DctConstRoundShift(temp1)); + step[3] = (short)WrapLow(DctConstRoundShift(temp2)); + + // stage 2 + output[0] = WrapLow(step[0] + step[3]); + output[1] = WrapLow(step[1] + step[2]); + output[2] = WrapLow(step[1] - step[2]); + output[3] = WrapLow(step[0] - step[3]); + } + + [SkipLocalsInit] + public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + Span<int> output = stackalloc 
int[4 * 4]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[4]; + Span<int> tempOut = stackalloc int[4]; + + // Rows + for (i = 0; i < 4; ++i) + { + Idct4(input, outptr); + input = input.Slice(4); + outptr = outptr.Slice(4); + } + + // Columns + for (i = 0; i < 4; ++i) + { + for (j = 0; j < 4; ++j) + { + tempIn[j] = output[j * 4 + i]; + } + + Idct4(tempIn, tempOut); + for (j = 0; j < 4; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4)); + } + } + } + + public static void Idct4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i; + long a1; + int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); + + output = WrapLow(DctConstRoundShift(output * CosPi16_64)); + a1 = BitUtils.RoundPowerOfTwo(output, 4); + + for (i = 0; i < 4; i++) + { + dest[0] = ClipPixelAdd(dest[0], a1); + dest[1] = ClipPixelAdd(dest[1], a1); + dest[2] = ClipPixelAdd(dest[2], a1); + dest[3] = ClipPixelAdd(dest[3], a1); + dest = dest.Slice(stride); + } + } + + public static void Iadst8(ReadOnlySpan<int> input, Span<int> output) + { + int s0, s1, s2, s3, s4, s5, s6, s7; + long x0 = input[7]; + long x1 = input[0]; + long x2 = input[5]; + long x3 = input[2]; + long x4 = input[3]; + long x5 = input[4]; + long x6 = input[1]; + long x7 = input[6]; + + if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0) + { + output.Slice(0, 8).Fill(0); + return; + } + + // stage 1 + s0 = (int)(CosPi2_64 * x0 + CosPi30_64 * x1); + s1 = (int)(CosPi30_64 * x0 - CosPi2_64 * x1); + s2 = (int)(CosPi10_64 * x2 + CosPi22_64 * x3); + s3 = (int)(CosPi22_64 * x2 - CosPi10_64 * x3); + s4 = (int)(CosPi18_64 * x4 + CosPi14_64 * x5); + s5 = (int)(CosPi14_64 * x4 - CosPi18_64 * x5); + s6 = (int)(CosPi26_64 * x6 + CosPi6_64 * x7); + s7 = (int)(CosPi6_64 * x6 - CosPi26_64 * x7); + + x0 = WrapLow(DctConstRoundShift(s0 + s4)); + x1 = WrapLow(DctConstRoundShift(s1 + s5)); + x2 = WrapLow(DctConstRoundShift(s2 + s6)); + x3 = 
WrapLow(DctConstRoundShift(s3 + s7)); + x4 = WrapLow(DctConstRoundShift(s0 - s4)); + x5 = WrapLow(DctConstRoundShift(s1 - s5)); + x6 = WrapLow(DctConstRoundShift(s2 - s6)); + x7 = WrapLow(DctConstRoundShift(s3 - s7)); + + // stage 2 + s0 = (int)x0; + s1 = (int)x1; + s2 = (int)x2; + s3 = (int)x3; + s4 = (int)(CosPi8_64 * x4 + CosPi24_64 * x5); + s5 = (int)(CosPi24_64 * x4 - CosPi8_64 * x5); + s6 = (int)(-CosPi24_64 * x6 + CosPi8_64 * x7); + s7 = (int)(CosPi8_64 * x6 + CosPi24_64 * x7); + + x0 = WrapLow(s0 + s2); + x1 = WrapLow(s1 + s3); + x2 = WrapLow(s0 - s2); + x3 = WrapLow(s1 - s3); + x4 = WrapLow(DctConstRoundShift(s4 + s6)); + x5 = WrapLow(DctConstRoundShift(s5 + s7)); + x6 = WrapLow(DctConstRoundShift(s4 - s6)); + x7 = WrapLow(DctConstRoundShift(s5 - s7)); + + // stage 3 + s2 = (int)(CosPi16_64 * (x2 + x3)); + s3 = (int)(CosPi16_64 * (x2 - x3)); + s6 = (int)(CosPi16_64 * (x6 + x7)); + s7 = (int)(CosPi16_64 * (x6 - x7)); + + x2 = WrapLow(DctConstRoundShift(s2)); + x3 = WrapLow(DctConstRoundShift(s3)); + x6 = WrapLow(DctConstRoundShift(s6)); + x7 = WrapLow(DctConstRoundShift(s7)); + + output[0] = WrapLow(x0); + output[1] = WrapLow(-x4); + output[2] = WrapLow(x6); + output[3] = WrapLow(-x2); + output[4] = WrapLow(x3); + output[5] = WrapLow(-x7); + output[6] = WrapLow(x5); + output[7] = WrapLow(-x1); + } + + [SkipLocalsInit] + public static void Idct8(ReadOnlySpan<int> input, Span<int> output) + { + Span<short> step1 = stackalloc short[8]; + Span<short> step2 = stackalloc short[8]; + long temp1, temp2; + + // stage 1 + step1[0] = (short)input[0]; + step1[2] = (short)input[4]; + step1[1] = (short)input[2]; + step1[3] = (short)input[6]; + temp1 = (short)input[1] * CosPi28_64 - (short)input[7] * CosPi4_64; + temp2 = (short)input[1] * CosPi4_64 + (short)input[7] * CosPi28_64; + step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = (short)input[5] * CosPi12_64 - (short)input[3] * CosPi20_64; + temp2 = 
(short)input[5] * CosPi20_64 + (short)input[3] * CosPi12_64; + step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); + + // stage 2 + temp1 = (step1[0] + step1[2]) * CosPi16_64; + temp2 = (step1[0] - step1[2]) * CosPi16_64; + step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = step1[1] * CosPi24_64 - step1[3] * CosPi8_64; + temp2 = step1[1] * CosPi8_64 + step1[3] * CosPi24_64; + step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); + step2[4] = (short)WrapLow(step1[4] + step1[5]); + step2[5] = (short)WrapLow(step1[4] - step1[5]); + step2[6] = (short)WrapLow(-step1[6] + step1[7]); + step2[7] = (short)WrapLow(step1[6] + step1[7]); + + // stage 3 + step1[0] = (short)WrapLow(step2[0] + step2[3]); + step1[1] = (short)WrapLow(step2[1] + step2[2]); + step1[2] = (short)WrapLow(step2[1] - step2[2]); + step1[3] = (short)WrapLow(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * CosPi16_64; + temp2 = (step2[5] + step2[6]) * CosPi16_64; + step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WrapLow(step1[0] + step1[7]); + output[1] = WrapLow(step1[1] + step1[6]); + output[2] = WrapLow(step1[2] + step1[5]); + output[3] = WrapLow(step1[3] + step1[4]); + output[4] = WrapLow(step1[3] - step1[4]); + output[5] = WrapLow(step1[2] - step1[5]); + output[6] = WrapLow(step1[1] - step1[6]); + output[7] = WrapLow(step1[0] - step1[7]); + } + + [SkipLocalsInit] + public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + Span<int> output = stackalloc int[8 * 8]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[8]; + Span<int> tempOut = stackalloc int[8]; + + // First transform rows + for (i = 0; i < 8; ++i) + { + 
Idct8(input, outptr); + input = input.Slice(8); + outptr = outptr.Slice(8); + } + + // Then transform columns + for (i = 0; i < 8; ++i) + { + for (j = 0; j < 8; ++j) + { + tempIn[j] = output[j * 8 + i]; + } + + Idct8(tempIn, tempOut); + for (j = 0; j < 8; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 5)); + } + } + } + + [SkipLocalsInit] + public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + Span<int> output = stackalloc int[8 * 8]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[8]; + Span<int> tempOut = stackalloc int[8]; + + output.Fill(0); + + // First transform rows + // Only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) + { + Idct8(input, outptr); + input = input.Slice(8); + outptr = outptr.Slice(8); + } + + // Then transform columns + for (i = 0; i < 8; ++i) + { + for (j = 0; j < 8; ++j) + { + tempIn[j] = output[j * 8 + i]; + } + + Idct8(tempIn, tempOut); + for (j = 0; j < 8; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5)); + } + } + } + + public static void Idct8x81Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + long a1; + int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); + + output = WrapLow(DctConstRoundShift(output * CosPi16_64)); + a1 = BitUtils.RoundPowerOfTwo(output, 5); + for (j = 0; j < 8; ++j) + { + for (i = 0; i < 8; ++i) + { + dest[i] = ClipPixelAdd(dest[i], a1); + } + + dest = dest.Slice(stride); + } + } + + public static void Iadst16(ReadOnlySpan<int> input, Span<int> output) + { + long s0, s1, s2, s3, s4, s5, s6, s7, s8; + long s9, s10, s11, s12, s13, s14, s15; + long x0 = input[15]; + long x1 = input[0]; + long x2 = input[13]; + long x3 = input[2]; + long x4 = input[11]; + long x5 = input[4]; + long x6 = input[9]; + long x7 = input[6]; + long x8 = input[7]; + long x9 = input[8]; + long x10 = 
input[5]; + long x11 = input[10]; + long x12 = input[3]; + long x13 = input[12]; + long x14 = input[1]; + long x15 = input[14]; + + if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0) + { + output.Slice(0, 16).Fill(0); + return; + } + + // stage 1 + s0 = x0 * CosPi1_64 + x1 * CosPi31_64; + s1 = x0 * CosPi31_64 - x1 * CosPi1_64; + s2 = x2 * CosPi5_64 + x3 * CosPi27_64; + s3 = x2 * CosPi27_64 - x3 * CosPi5_64; + s4 = x4 * CosPi9_64 + x5 * CosPi23_64; + s5 = x4 * CosPi23_64 - x5 * CosPi9_64; + s6 = x6 * CosPi13_64 + x7 * CosPi19_64; + s7 = x6 * CosPi19_64 - x7 * CosPi13_64; + s8 = x8 * CosPi17_64 + x9 * CosPi15_64; + s9 = x8 * CosPi15_64 - x9 * CosPi17_64; + s10 = x10 * CosPi21_64 + x11 * CosPi11_64; + s11 = x10 * CosPi11_64 - x11 * CosPi21_64; + s12 = x12 * CosPi25_64 + x13 * CosPi7_64; + s13 = x12 * CosPi7_64 - x13 * CosPi25_64; + s14 = x14 * CosPi29_64 + x15 * CosPi3_64; + s15 = x14 * CosPi3_64 - x15 * CosPi29_64; + + x0 = WrapLow(DctConstRoundShift(s0 + s8)); + x1 = WrapLow(DctConstRoundShift(s1 + s9)); + x2 = WrapLow(DctConstRoundShift(s2 + s10)); + x3 = WrapLow(DctConstRoundShift(s3 + s11)); + x4 = WrapLow(DctConstRoundShift(s4 + s12)); + x5 = WrapLow(DctConstRoundShift(s5 + s13)); + x6 = WrapLow(DctConstRoundShift(s6 + s14)); + x7 = WrapLow(DctConstRoundShift(s7 + s15)); + x8 = WrapLow(DctConstRoundShift(s0 - s8)); + x9 = WrapLow(DctConstRoundShift(s1 - s9)); + x10 = WrapLow(DctConstRoundShift(s2 - s10)); + x11 = WrapLow(DctConstRoundShift(s3 - s11)); + x12 = WrapLow(DctConstRoundShift(s4 - s12)); + x13 = WrapLow(DctConstRoundShift(s5 - s13)); + x14 = WrapLow(DctConstRoundShift(s6 - s14)); + x15 = WrapLow(DctConstRoundShift(s7 - s15)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * CosPi4_64 + x9 * CosPi28_64; + s9 = x8 * CosPi28_64 - x9 * CosPi4_64; + s10 = x10 * CosPi20_64 + x11 * CosPi12_64; + s11 = x10 * CosPi12_64 - x11 * CosPi20_64; + s12 = -x12 
* CosPi28_64 + x13 * CosPi4_64; + s13 = x12 * CosPi4_64 + x13 * CosPi28_64; + s14 = -x14 * CosPi12_64 + x15 * CosPi20_64; + s15 = x14 * CosPi20_64 + x15 * CosPi12_64; + + x0 = WrapLow(s0 + s4); + x1 = WrapLow(s1 + s5); + x2 = WrapLow(s2 + s6); + x3 = WrapLow(s3 + s7); + x4 = WrapLow(s0 - s4); + x5 = WrapLow(s1 - s5); + x6 = WrapLow(s2 - s6); + x7 = WrapLow(s3 - s7); + x8 = WrapLow(DctConstRoundShift(s8 + s12)); + x9 = WrapLow(DctConstRoundShift(s9 + s13)); + x10 = WrapLow(DctConstRoundShift(s10 + s14)); + x11 = WrapLow(DctConstRoundShift(s11 + s15)); + x12 = WrapLow(DctConstRoundShift(s8 - s12)); + x13 = WrapLow(DctConstRoundShift(s9 - s13)); + x14 = WrapLow(DctConstRoundShift(s10 - s14)); + x15 = WrapLow(DctConstRoundShift(s11 - s15)); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * CosPi8_64 + x5 * CosPi24_64; + s5 = x4 * CosPi24_64 - x5 * CosPi8_64; + s6 = -x6 * CosPi24_64 + x7 * CosPi8_64; + s7 = x6 * CosPi8_64 + x7 * CosPi24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * CosPi8_64 + x13 * CosPi24_64; + s13 = x12 * CosPi24_64 - x13 * CosPi8_64; + s14 = -x14 * CosPi24_64 + x15 * CosPi8_64; + s15 = x14 * CosPi8_64 + x15 * CosPi24_64; + + x0 = WrapLow(s0 + s2); + x1 = WrapLow(s1 + s3); + x2 = WrapLow(s0 - s2); + x3 = WrapLow(s1 - s3); + x4 = WrapLow(DctConstRoundShift(s4 + s6)); + x5 = WrapLow(DctConstRoundShift(s5 + s7)); + x6 = WrapLow(DctConstRoundShift(s4 - s6)); + x7 = WrapLow(DctConstRoundShift(s5 - s7)); + x8 = WrapLow(s8 + s10); + x9 = WrapLow(s9 + s11); + x10 = WrapLow(s8 - s10); + x11 = WrapLow(s9 - s11); + x12 = WrapLow(DctConstRoundShift(s12 + s14)); + x13 = WrapLow(DctConstRoundShift(s13 + s15)); + x14 = WrapLow(DctConstRoundShift(s12 - s14)); + x15 = WrapLow(DctConstRoundShift(s13 - s15)); + + // stage 4 + s2 = (-CosPi16_64) * (x2 + x3); + s3 = CosPi16_64 * (x2 - x3); + s6 = CosPi16_64 * (x6 + x7); + s7 = CosPi16_64 * (-x6 + x7); + s10 = CosPi16_64 * (x10 + x11); + s11 = CosPi16_64 * (-x10 + x11); + s14 = 
(-CosPi16_64) * (x14 + x15); + s15 = CosPi16_64 * (x14 - x15); + + x2 = WrapLow(DctConstRoundShift(s2)); + x3 = WrapLow(DctConstRoundShift(s3)); + x6 = WrapLow(DctConstRoundShift(s6)); + x7 = WrapLow(DctConstRoundShift(s7)); + x10 = WrapLow(DctConstRoundShift(s10)); + x11 = WrapLow(DctConstRoundShift(s11)); + x14 = WrapLow(DctConstRoundShift(s14)); + x15 = WrapLow(DctConstRoundShift(s15)); + + output[0] = WrapLow(x0); + output[1] = WrapLow(-x8); + output[2] = WrapLow(x12); + output[3] = WrapLow(-x4); + output[4] = WrapLow(x6); + output[5] = WrapLow(x14); + output[6] = WrapLow(x10); + output[7] = WrapLow(x2); + output[8] = WrapLow(x3); + output[9] = WrapLow(x11); + output[10] = WrapLow(x15); + output[11] = WrapLow(x7); + output[12] = WrapLow(x5); + output[13] = WrapLow(-x13); + output[14] = WrapLow(x9); + output[15] = WrapLow(-x1); + } + + [SkipLocalsInit] + public static void Idct16(ReadOnlySpan<int> input, Span<int> output) + { + Span<short> step1 = stackalloc short[16]; + Span<short> step2 = stackalloc short[16]; + long temp1, temp2; + + // stage 1 + step1[0] = (short)input[0 / 2]; + step1[1] = (short)input[16 / 2]; + step1[2] = (short)input[8 / 2]; + step1[3] = (short)input[24 / 2]; + step1[4] = (short)input[4 / 2]; + step1[5] = (short)input[20 / 2]; + step1[6] = (short)input[12 / 2]; + step1[7] = (short)input[28 / 2]; + step1[8] = (short)input[2 / 2]; + step1[9] = (short)input[18 / 2]; + step1[10] = (short)input[10 / 2]; + step1[11] = (short)input[26 / 2]; + step1[12] = (short)input[6 / 2]; + step1[13] = (short)input[22 / 2]; + step1[14] = (short)input[14 / 2]; + step1[15] = (short)input[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64; + temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64; + step2[8] = (short)WrapLow(DctConstRoundShift(temp1)); 
+ step2[15] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64; + temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64; + step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64; + temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64; + step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64; + temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64; + step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64; + temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64; + step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64; + temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64; + step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); + + step1[8] = (short)WrapLow(step2[8] + step2[9]); + step1[9] = (short)WrapLow(step2[8] - step2[9]); + step1[10] = (short)WrapLow(-step2[10] + step2[11]); + step1[11] = (short)WrapLow(step2[10] + step2[11]); + step1[12] = (short)WrapLow(step2[12] + step2[13]); + step1[13] = (short)WrapLow(step2[12] - step2[13]); + step1[14] = (short)WrapLow(-step2[14] + step2[15]); + step1[15] = (short)WrapLow(step2[14] + step2[15]); + + // stage 4 + temp1 = (step1[0] + step1[1]) * CosPi16_64; + temp2 = (step1[0] - step1[1]) * CosPi16_64; + step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = step1[2] * CosPi24_64 - step1[3] * 
CosPi8_64; + temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64; + step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); + step2[4] = (short)WrapLow(step1[4] + step1[5]); + step2[5] = (short)WrapLow(step1[4] - step1[5]); + step2[6] = (short)WrapLow(-step1[6] + step1[7]); + step2[7] = (short)WrapLow(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64; + temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64; + step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64; + temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64; + step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = (short)WrapLow(step2[0] + step2[3]); + step1[1] = (short)WrapLow(step2[1] + step2[2]); + step1[2] = (short)WrapLow(step2[1] - step2[2]); + step1[3] = (short)WrapLow(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * CosPi16_64; + temp2 = (step2[5] + step2[6]) * CosPi16_64; + step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); + step1[7] = step2[7]; + + step1[8] = (short)WrapLow(step2[8] + step2[11]); + step1[9] = (short)WrapLow(step2[9] + step2[10]); + step1[10] = (short)WrapLow(step2[9] - step2[10]); + step1[11] = (short)WrapLow(step2[8] - step2[11]); + step1[12] = (short)WrapLow(-step2[12] + step2[15]); + step1[13] = (short)WrapLow(-step2[13] + step2[14]); + step1[14] = (short)WrapLow(step2[13] + step2[14]); + step1[15] = (short)WrapLow(step2[12] + step2[15]); + + // stage 6 + step2[0] = (short)WrapLow(step1[0] + step1[7]); + step2[1] = (short)WrapLow(step1[1] + step1[6]); + step2[2] = (short)WrapLow(step1[2] + step1[5]); + 
step2[3] = (short)WrapLow(step1[3] + step1[4]); + step2[4] = (short)WrapLow(step1[3] - step1[4]); + step2[5] = (short)WrapLow(step1[2] - step1[5]); + step2[6] = (short)WrapLow(step1[1] - step1[6]); + step2[7] = (short)WrapLow(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * CosPi16_64; + temp2 = (step1[10] + step1[13]) * CosPi16_64; + step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = (-step1[11] + step1[12]) * CosPi16_64; + temp2 = (step1[11] + step1[12]) * CosPi16_64; + step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = WrapLow(step2[0] + step2[15]); + output[1] = WrapLow(step2[1] + step2[14]); + output[2] = WrapLow(step2[2] + step2[13]); + output[3] = WrapLow(step2[3] + step2[12]); + output[4] = WrapLow(step2[4] + step2[11]); + output[5] = WrapLow(step2[5] + step2[10]); + output[6] = WrapLow(step2[6] + step2[9]); + output[7] = WrapLow(step2[7] + step2[8]); + output[8] = WrapLow(step2[7] - step2[8]); + output[9] = WrapLow(step2[6] - step2[9]); + output[10] = WrapLow(step2[5] - step2[10]); + output[11] = WrapLow(step2[4] - step2[11]); + output[12] = WrapLow(step2[3] - step2[12]); + output[13] = WrapLow(step2[2] - step2[13]); + output[14] = WrapLow(step2[1] - step2[14]); + output[15] = WrapLow(step2[0] - step2[15]); + } + + [SkipLocalsInit] + public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + Span<int> output = stackalloc int[16 * 16]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[16]; + Span<int> tempOut = stackalloc int[16]; + + // First transform rows + for (i = 0; i < 16; ++i) + { + Idct16(input, outptr); + input = input.Slice(16); + outptr = outptr.Slice(16); + } + + // Then transform columns + for (i = 0; i < 16; ++i) + { + 
for (j = 0; j < 16; ++j) + { + tempIn[j] = output[j * 16 + i]; + } + + Idct16(tempIn, tempOut); + for (j = 0; j < 16; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + } + } + } + + [SkipLocalsInit] + public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + Span<int> output = stackalloc int[16 * 16]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[16]; + Span<int> tempOut = stackalloc int[16]; + + output.Fill(0); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) + { + Idct16(input, outptr); + input = input.Slice(16); + outptr = outptr.Slice(16); + } + + // Then transform columns + for (i = 0; i < 16; ++i) + { + for (j = 0; j < 16; ++j) + { + tempIn[j] = output[j * 16 + i]; + } + + Idct16(tempIn, tempOut); + for (j = 0; j < 16; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + } + } + } + + [SkipLocalsInit] + public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + Span<int> output = stackalloc int[16 * 16]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[16]; + Span<int> tempOut = stackalloc int[16]; + + output.Fill(0); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. 
+ for (i = 0; i < 4; ++i) + { + Idct16(input, outptr); + input = input.Slice(16); + outptr = outptr.Slice(16); + } + + // Then transform columns + for (i = 0; i < 16; ++i) + { + for (j = 0; j < 16; ++j) + { + tempIn[j] = output[j * 16 + i]; + } + + Idct16(tempIn, tempOut); + for (j = 0; j < 16; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + } + } + } + + public static void Idct16x161Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + long a1; + int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); + + output = WrapLow(DctConstRoundShift(output * CosPi16_64)); + a1 = BitUtils.RoundPowerOfTwo(output, 6); + for (j = 0; j < 16; ++j) + { + for (i = 0; i < 16; ++i) + { + dest[i] = ClipPixelAdd(dest[i], a1); + } + + dest = dest.Slice(stride); + } + } + + [SkipLocalsInit] + public static void Idct32(ReadOnlySpan<int> input, Span<int> output) + { + Span<short> step1 = stackalloc short[32]; + Span<short> step2 = stackalloc short[32]; + long temp1, temp2; + + // stage 1 + step1[0] = (short)input[0]; + step1[1] = (short)input[16]; + step1[2] = (short)input[8]; + step1[3] = (short)input[24]; + step1[4] = (short)input[4]; + step1[5] = (short)input[20]; + step1[6] = (short)input[12]; + step1[7] = (short)input[28]; + step1[8] = (short)input[2]; + step1[9] = (short)input[18]; + step1[10] = (short)input[10]; + step1[11] = (short)input[26]; + step1[12] = (short)input[6]; + step1[13] = (short)input[22]; + step1[14] = (short)input[14]; + step1[15] = (short)input[30]; + + temp1 = (short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64; + temp2 = (short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64; + step1[16] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[31] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = (short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64; + temp2 = (short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64; + step1[17] = 
(short)WrapLow(DctConstRoundShift(temp1)); + step1[30] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = (short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64; + temp2 = (short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64; + step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = (short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64; + temp2 = (short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64; + step1[19] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[28] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = (short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64; + temp2 = (short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64; + step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = (short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64; + temp2 = (short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64; + step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = (short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64; + temp2 = (short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64; + step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = (short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64; + temp2 = (short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64; + step1[23] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[24] = (short)WrapLow(DctConstRoundShift(temp2)); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64; + temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64; + step2[8] = 
(short)WrapLow(DctConstRoundShift(temp1)); + step2[15] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64; + temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64; + step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64; + temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64; + step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); + + temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64; + temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64; + step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); + + step2[16] = (short)WrapLow(step1[16] + step1[17]); + step2[17] = (short)WrapLow(step1[16] - step1[17]); + step2[18] = (short)WrapLow(-step1[18] + step1[19]); + step2[19] = (short)WrapLow(step1[18] + step1[19]); + step2[20] = (short)WrapLow(step1[20] + step1[21]); + step2[21] = (short)WrapLow(step1[20] - step1[21]); + step2[22] = (short)WrapLow(-step1[22] + step1[23]); + step2[23] = (short)WrapLow(step1[22] + step1[23]); + step2[24] = (short)WrapLow(step1[24] + step1[25]); + step2[25] = (short)WrapLow(step1[24] - step1[25]); + step2[26] = (short)WrapLow(-step1[26] + step1[27]); + step2[27] = (short)WrapLow(step1[26] + step1[27]); + step2[28] = (short)WrapLow(step1[28] + step1[29]); + step2[29] = (short)WrapLow(step1[28] - step1[29]); + step2[30] = (short)WrapLow(-step1[30] + step1[31]); + step2[31] = (short)WrapLow(step1[30] + step1[31]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64; + temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64; + step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = 
step2[5] * CosPi12_64 - step2[6] * CosPi20_64; + temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64; + step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); + + step1[8] = (short)WrapLow(step2[8] + step2[9]); + step1[9] = (short)WrapLow(step2[8] - step2[9]); + step1[10] = (short)WrapLow(-step2[10] + step2[11]); + step1[11] = (short)WrapLow(step2[10] + step2[11]); + step1[12] = (short)WrapLow(step2[12] + step2[13]); + step1[13] = (short)WrapLow(step2[12] - step2[13]); + step1[14] = (short)WrapLow(-step2[14] + step2[15]); + step1[15] = (short)WrapLow(step2[14] + step2[15]); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * CosPi4_64 + step2[30] * CosPi28_64; + temp2 = step2[17] * CosPi28_64 + step2[30] * CosPi4_64; + step1[17] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[30] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = -step2[18] * CosPi28_64 - step2[29] * CosPi4_64; + temp2 = -step2[18] * CosPi4_64 + step2[29] * CosPi28_64; + step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * CosPi20_64 + step2[26] * CosPi12_64; + temp2 = step2[21] * CosPi12_64 + step2[26] * CosPi20_64; + step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = -step2[22] * CosPi12_64 - step2[25] * CosPi20_64; + temp2 = -step2[22] * CosPi20_64 + step2[25] * CosPi12_64; + step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * CosPi16_64; + temp2 = (step1[0] - step1[1]) * CosPi16_64; + step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = 
step1[2] * CosPi24_64 - step1[3] * CosPi8_64; + temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64; + step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); + step2[4] = (short)WrapLow(step1[4] + step1[5]); + step2[5] = (short)WrapLow(step1[4] - step1[5]); + step2[6] = (short)WrapLow(-step1[6] + step1[7]); + step2[7] = (short)WrapLow(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64; + temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64; + step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64; + temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64; + step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = (short)WrapLow(step1[16] + step1[19]); + step2[17] = (short)WrapLow(step1[17] + step1[18]); + step2[18] = (short)WrapLow(step1[17] - step1[18]); + step2[19] = (short)WrapLow(step1[16] - step1[19]); + step2[20] = (short)WrapLow(-step1[20] + step1[23]); + step2[21] = (short)WrapLow(-step1[21] + step1[22]); + step2[22] = (short)WrapLow(step1[21] + step1[22]); + step2[23] = (short)WrapLow(step1[20] + step1[23]); + + step2[24] = (short)WrapLow(step1[24] + step1[27]); + step2[25] = (short)WrapLow(step1[25] + step1[26]); + step2[26] = (short)WrapLow(step1[25] - step1[26]); + step2[27] = (short)WrapLow(step1[24] - step1[27]); + step2[28] = (short)WrapLow(-step1[28] + step1[31]); + step2[29] = (short)WrapLow(-step1[29] + step1[30]); + step2[30] = (short)WrapLow(step1[29] + step1[30]); + step2[31] = (short)WrapLow(step1[28] + step1[31]); + + // stage 5 + step1[0] = (short)WrapLow(step2[0] + step2[3]); + step1[1] = (short)WrapLow(step2[1] + step2[2]); + step1[2] = (short)WrapLow(step2[1] - step2[2]); + 
step1[3] = (short)WrapLow(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * CosPi16_64; + temp2 = (step2[5] + step2[6]) * CosPi16_64; + step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); + step1[7] = step2[7]; + + step1[8] = (short)WrapLow(step2[8] + step2[11]); + step1[9] = (short)WrapLow(step2[9] + step2[10]); + step1[10] = (short)WrapLow(step2[9] - step2[10]); + step1[11] = (short)WrapLow(step2[8] - step2[11]); + step1[12] = (short)WrapLow(-step2[12] + step2[15]); + step1[13] = (short)WrapLow(-step2[13] + step2[14]); + step1[14] = (short)WrapLow(step2[13] + step2[14]); + step1[15] = (short)WrapLow(step2[12] + step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * CosPi8_64 + step2[29] * CosPi24_64; + temp2 = step2[18] * CosPi24_64 + step2[29] * CosPi8_64; + step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = -step2[19] * CosPi8_64 + step2[28] * CosPi24_64; + temp2 = step2[19] * CosPi24_64 + step2[28] * CosPi8_64; + step1[19] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[28] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = -step2[20] * CosPi24_64 - step2[27] * CosPi8_64; + temp2 = -step2[20] * CosPi8_64 + step2[27] * CosPi24_64; + step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = -step2[21] * CosPi24_64 - step2[26] * CosPi8_64; + temp2 = -step2[21] * CosPi8_64 + step2[26] * CosPi24_64; + step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = (short)WrapLow(step1[0] + step1[7]); + step2[1] = (short)WrapLow(step1[1] + step1[6]); + step2[2] = (short)WrapLow(step1[2] 
+ step1[5]); + step2[3] = (short)WrapLow(step1[3] + step1[4]); + step2[4] = (short)WrapLow(step1[3] - step1[4]); + step2[5] = (short)WrapLow(step1[2] - step1[5]); + step2[6] = (short)WrapLow(step1[1] - step1[6]); + step2[7] = (short)WrapLow(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * CosPi16_64; + temp2 = (step1[10] + step1[13]) * CosPi16_64; + step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = (-step1[11] + step1[12]) * CosPi16_64; + temp2 = (step1[11] + step1[12]) * CosPi16_64; + step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); + step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = (short)WrapLow(step1[16] + step1[23]); + step2[17] = (short)WrapLow(step1[17] + step1[22]); + step2[18] = (short)WrapLow(step1[18] + step1[21]); + step2[19] = (short)WrapLow(step1[19] + step1[20]); + step2[20] = (short)WrapLow(step1[19] - step1[20]); + step2[21] = (short)WrapLow(step1[18] - step1[21]); + step2[22] = (short)WrapLow(step1[17] - step1[22]); + step2[23] = (short)WrapLow(step1[16] - step1[23]); + + step2[24] = (short)WrapLow(-step1[24] + step1[31]); + step2[25] = (short)WrapLow(-step1[25] + step1[30]); + step2[26] = (short)WrapLow(-step1[26] + step1[29]); + step2[27] = (short)WrapLow(-step1[27] + step1[28]); + step2[28] = (short)WrapLow(step1[27] + step1[28]); + step2[29] = (short)WrapLow(step1[26] + step1[29]); + step2[30] = (short)WrapLow(step1[25] + step1[30]); + step2[31] = (short)WrapLow(step1[24] + step1[31]); + + // stage 7 + step1[0] = (short)WrapLow(step2[0] + step2[15]); + step1[1] = (short)WrapLow(step2[1] + step2[14]); + step1[2] = (short)WrapLow(step2[2] + step2[13]); + step1[3] = (short)WrapLow(step2[3] + step2[12]); + step1[4] = (short)WrapLow(step2[4] + step2[11]); + step1[5] = (short)WrapLow(step2[5] + step2[10]); + step1[6] = (short)WrapLow(step2[6] + 
step2[9]); + step1[7] = (short)WrapLow(step2[7] + step2[8]); + step1[8] = (short)WrapLow(step2[7] - step2[8]); + step1[9] = (short)WrapLow(step2[6] - step2[9]); + step1[10] = (short)WrapLow(step2[5] - step2[10]); + step1[11] = (short)WrapLow(step2[4] - step2[11]); + step1[12] = (short)WrapLow(step2[3] - step2[12]); + step1[13] = (short)WrapLow(step2[2] - step2[13]); + step1[14] = (short)WrapLow(step2[1] - step2[14]); + step1[15] = (short)WrapLow(step2[0] - step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * CosPi16_64; + temp2 = (step2[20] + step2[27]) * CosPi16_64; + step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = (-step2[21] + step2[26]) * CosPi16_64; + temp2 = (step2[21] + step2[26]) * CosPi16_64; + step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = (-step2[22] + step2[25]) * CosPi16_64; + temp2 = (step2[22] + step2[25]) * CosPi16_64; + step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); + temp1 = (-step2[23] + step2[24]) * CosPi16_64; + temp2 = (step2[23] + step2[24]) * CosPi16_64; + step1[23] = (short)WrapLow(DctConstRoundShift(temp1)); + step1[24] = (short)WrapLow(DctConstRoundShift(temp2)); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = WrapLow(step1[0] + step1[31]); + output[1] = WrapLow(step1[1] + step1[30]); + output[2] = WrapLow(step1[2] + step1[29]); + output[3] = WrapLow(step1[3] + step1[28]); + output[4] = WrapLow(step1[4] + step1[27]); + output[5] = WrapLow(step1[5] + step1[26]); + output[6] = WrapLow(step1[6] + step1[25]); + output[7] = WrapLow(step1[7] + step1[24]); + output[8] = WrapLow(step1[8] + step1[23]); + output[9] = WrapLow(step1[9] + step1[22]); + 
output[10] = WrapLow(step1[10] + step1[21]); + output[11] = WrapLow(step1[11] + step1[20]); + output[12] = WrapLow(step1[12] + step1[19]); + output[13] = WrapLow(step1[13] + step1[18]); + output[14] = WrapLow(step1[14] + step1[17]); + output[15] = WrapLow(step1[15] + step1[16]); + output[16] = WrapLow(step1[15] - step1[16]); + output[17] = WrapLow(step1[14] - step1[17]); + output[18] = WrapLow(step1[13] - step1[18]); + output[19] = WrapLow(step1[12] - step1[19]); + output[20] = WrapLow(step1[11] - step1[20]); + output[21] = WrapLow(step1[10] - step1[21]); + output[22] = WrapLow(step1[9] - step1[22]); + output[23] = WrapLow(step1[8] - step1[23]); + output[24] = WrapLow(step1[7] - step1[24]); + output[25] = WrapLow(step1[6] - step1[25]); + output[26] = WrapLow(step1[5] - step1[26]); + output[27] = WrapLow(step1[4] - step1[27]); + output[28] = WrapLow(step1[3] - step1[28]); + output[29] = WrapLow(step1[2] - step1[29]); + output[30] = WrapLow(step1[1] - step1[30]); + output[31] = WrapLow(step1[0] - step1[31]); + } + + [SkipLocalsInit] + public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) + { + int i, j; + Span<int> output = stackalloc int[32 * 32]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[32]; + Span<int> tempOut = stackalloc int[32]; + + // Rows + for (i = 0; i < 32; ++i) + { + short zeroCoeff = 0; + for (j = 0; j < 32; ++j) + { + zeroCoeff |= (short)input[j]; + } + + if (zeroCoeff != 0) + { + Idct32(input, outptr); + } + else + { + outptr.Slice(0, 32).Fill(0); + } + + input = input.Slice(32); + outptr = outptr.Slice(32); + } + + // Columns + for (i = 0; i < 32; ++i) + { + for (j = 0; j < 32; ++j) + { + tempIn[j] = output[j * 32 + i]; + } + + Idct32(tempIn, tempOut); + for (j = 0; j < 32; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + } + } + } + + [SkipLocalsInit] + public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, 
/// <summary>
/// 32x32 inverse transform-and-add that runs <c>Idct32</c> on the first 8 rows only, leaves
/// the remaining intermediate rows at zero, and then transforms every column.
/// </summary>
/// <param name="input">Coefficients in row-major order, 32 values per row; the first 8 rows are read.</param>
/// <param name="dest">Pixel buffer updated in place through <c>ClipPixelAdd</c>.</param>
/// <param name="stride">Distance, in elements, between vertically adjacent entries of <paramref name="dest"/>.</param>
[SkipLocalsInit]
public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
{
    int i, j;
    Span<int> output = stackalloc int[32 * 32];
    Span<int> outptr = output;
    Span<int> tempIn = stackalloc int[32];
    Span<int> tempOut = stackalloc int[32];

    // [SkipLocalsInit] leaves the stack buffer uninitialised, so it is cleared explicitly.
    output.Fill(0);

    // Rows
    // Only the upper-left 8x8 has non-zero coefficients
    for (i = 0; i < 8; ++i)
    {
        Idct32(input, outptr);
        input = input.Slice(32);
        outptr = outptr.Slice(32);
    }

    // Columns
    for (i = 0; i < 32; ++i)
    {
        for (j = 0; j < 32; ++j)
        {
            tempIn[j] = output[j * 32 + i];
        }

        Idct32(tempIn, tempOut);
        for (j = 0; j < 32; ++j)
        {
            dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
        }
    }
}

/// <summary>
/// 32x32 inverse transform-and-add that reads only <c>input[0]</c>: the value goes twice through
/// the <c>CosPi16_64</c> multiply / <c>DctConstRoundShift</c> / <c>WrapLow</c> sequence, is rounded
/// by 6 bits, and the result is added to every pixel of the 32x32 area.
/// </summary>
/// <param name="input">Coefficient buffer; only element 0 is read (truncated to 16 bits first).</param>
/// <param name="dest">Pixel buffer updated in place; must hold at least <c>31 * stride + 32</c> elements.</param>
/// <param name="stride">Distance, in elements, between vertically adjacent entries of <paramref name="dest"/>.</param>
public static void Idct32x321Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
{
    int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));

    output = WrapLow(DctConstRoundShift(output * CosPi16_64));
    long a1 = BitUtils.RoundPowerOfTwo(output, 6);

    for (int j = 0; j < 32; ++j)
    {
        // Slice each row from the start of the buffer instead of advancing the span after
        // every row: advancing past the last row required a full extra stride of storage
        // and threw when the buffer ended right after the final pixel row.
        Span<byte> row = dest.Slice(j * stride, 32);
        for (int i = 0; i < row.Length; ++i)
        {
            row[i] = ClipPixelAdd(row[i], a1);
        }
    }
}
/// <summary>
/// 4x4 add variant that reads only <c>input[0]</c>. The value is shifted right by
/// <c>UnitQuantShift</c> and split into two parts for the first pass; each of the four
/// resulting values is split again and added down one column of <paramref name="dest"/>.
/// </summary>
/// <param name="input">Coefficient buffer; only element 0 is read.</param>
/// <param name="dest">Pixel buffer updated in place through <c>HighbdClipPixelAdd</c>.</param>
/// <param name="stride">Distance, in elements, between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
[SkipLocalsInit]
public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
{
    Span<int> firstPass = stackalloc int[4];

    // First pass: element 0 gets the remainder, elements 1..3 all get the halved value.
    long whole = input[0] >> UnitQuantShift;
    long half = whole >> 1;
    whole -= half;
    firstPass[0] = HighbdWrapLow(whole, bd);
    firstPass.Slice(1).Fill(HighbdWrapLow(half, bd));

    // Second pass: one destination column per first-pass value.
    Span<ushort> column = dest;
    for (int n = 0; n < 4; n++)
    {
        int value = firstPass[n];
        long lower = value >> 1;
        long upper = value - lower;

        column[stride * 0] = HighbdClipPixelAdd(column[stride * 0], upper, bd);
        column[stride * 1] = HighbdClipPixelAdd(column[stride * 1], lower, bd);
        column[stride * 2] = HighbdClipPixelAdd(column[stride * 2], lower, bd);
        column[stride * 3] = HighbdClipPixelAdd(column[stride * 3], lower, bd);

        column = column.Slice(1);
    }
}
/// <summary>
/// 4-point transform used by the high-bit-depth paths: two rotations by <c>CosPi16_64</c> and
/// <c>CosPi24_64</c>/<c>CosPi8_64</c> followed by one add/subtract stage.
/// </summary>
/// <param name="input">Source values; the first four elements are read.</param>
/// <param name="output">Receives four results; zero-filled when the input is rejected.</param>
/// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c>.</param>
[SkipLocalsInit]
public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
{
    if (DetectInvalidHighbdInput(input, 4) != 0)
    {
        Debug.Assert(false, "invalid highbd txfm input");
        output.Slice(0, 4).Fill(0);
        return;
    }

    // Every input is captured before any output is stored, so callers may pass the same
    // span for input and output.
    int in0 = input[0];
    int in1 = input[1];
    int in2 = input[2];
    int in3 = input[3];

    // Stage 1
    int even0 = HighbdWrapLow(DctConstRoundShift((in0 + in2) * (long)CosPi16_64), bd);
    int even1 = HighbdWrapLow(DctConstRoundShift((in0 - in2) * (long)CosPi16_64), bd);
    int odd0 = HighbdWrapLow(DctConstRoundShift(in1 * (long)CosPi24_64 - in3 * (long)CosPi8_64), bd);
    int odd1 = HighbdWrapLow(DctConstRoundShift(in1 * (long)CosPi8_64 + in3 * (long)CosPi24_64), bd);

    // Stage 2
    output[0] = HighbdWrapLow(even0 + odd1, bd);
    output[1] = HighbdWrapLow(even1 + odd0, bd);
    output[2] = HighbdWrapLow(even1 - odd0, bd);
    output[3] = HighbdWrapLow(even0 - odd1, bd);
}
/// <summary>
/// 4x4 inverse transform-and-add that reads only <c>input[0]</c>: the value goes twice through
/// the <c>CosPi16_64</c> multiply / <c>DctConstRoundShift</c> / <c>HighbdWrapLow</c> sequence, is
/// rounded by 4 bits, and the result is added to every pixel of the 4x4 area.
/// </summary>
/// <param name="input">Coefficient buffer; only element 0 is read.</param>
/// <param name="dest">Pixel buffer updated in place; must hold at least <c>3 * stride + 4</c> elements.</param>
/// <param name="stride">Distance, in elements, between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
public static void HighbdIdct4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
{
    int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);

    output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
    long a1 = BitUtils.RoundPowerOfTwo(output, 4);

    for (int i = 0; i < 4; i++)
    {
        // Slice each row from the start of the buffer instead of advancing the span after
        // every row: the advance after the last row threw when fewer than `stride`
        // elements followed it, even though no further pixel was going to be touched.
        Span<ushort> row = dest.Slice(i * stride, 4);
        row[0] = HighbdClipPixelAdd(row[0], a1, bd);
        row[1] = HighbdClipPixelAdd(row[1], a1, bd);
        row[2] = HighbdClipPixelAdd(row[2], a1, bd);
        row[3] = HighbdClipPixelAdd(row[3], a1, bd);
    }
}
/// <summary>
/// 8-point transform used by the high-bit-depth paths. The even-indexed inputs are handed to
/// <c>HighbdIdct4</c>; the odd-indexed inputs go through two rotations, an add/subtract stage
/// and a <c>CosPi16_64</c> rotation before the final combination.
/// </summary>
/// <param name="input">Source values; the first eight elements are read.</param>
/// <param name="output">Receives eight results; zero-filled when the input is rejected.</param>
/// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c> and <c>HighbdIdct4</c>.</param>
[SkipLocalsInit]
public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
{
    if (DetectInvalidHighbdInput(input, 8) != 0)
    {
        Debug.Assert(false, "invalid highbd txfm input");
        output.Slice(0, 8).Fill(0);
        return;
    }

    Span<int> work = stackalloc int[8];

    // Stage 1: even-indexed inputs are copied through, odd-indexed inputs are rotated.
    work[0] = input[0];
    work[1] = input[2];
    work[2] = input[4];
    work[3] = input[6];
    work[4] = HighbdWrapLow(DctConstRoundShift(input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64), bd);
    work[7] = HighbdWrapLow(DctConstRoundShift(input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64), bd);
    work[5] = HighbdWrapLow(DctConstRoundShift(input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64), bd);
    work[6] = HighbdWrapLow(DctConstRoundShift(input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64), bd);

    // Stages 2 and 3, even half: transformed in place over work[0..3].
    HighbdIdct4(work, work, bd);

    // Stage 2, odd half.
    int sumLow = HighbdWrapLow(work[4] + work[5], bd);
    int diffLow = HighbdWrapLow(work[4] - work[5], bd);
    int diffHigh = HighbdWrapLow(-work[6] + work[7], bd);
    int sumHigh = HighbdWrapLow(work[6] + work[7], bd);

    // Stage 3, odd half: only the two middle terms are rotated.
    int rot5 = HighbdWrapLow(DctConstRoundShift((diffHigh - diffLow) * (long)CosPi16_64), bd);
    int rot6 = HighbdWrapLow(DctConstRoundShift((diffLow + diffHigh) * (long)CosPi16_64), bd);

    // Stage 4: mirror-combine even and odd halves.
    output[0] = HighbdWrapLow(work[0] + sumHigh, bd);
    output[1] = HighbdWrapLow(work[1] + rot6, bd);
    output[2] = HighbdWrapLow(work[2] + rot5, bd);
    output[3] = HighbdWrapLow(work[3] + sumLow, bd);
    output[4] = HighbdWrapLow(work[3] - sumLow, bd);
    output[5] = HighbdWrapLow(work[2] - rot5, bd);
    output[6] = HighbdWrapLow(work[1] - rot6, bd);
    output[7] = HighbdWrapLow(work[0] - sumHigh, bd);
}
/// <summary>
/// 8x8 inverse transform-and-add that runs <c>HighbdIdct8</c> on the first 4 rows only, leaves
/// the remaining intermediate rows at zero, and then transforms every column.
/// </summary>
/// <param name="input">Coefficients in row-major order, 8 values per row; the first 4 rows are read.</param>
/// <param name="dest">Pixel buffer updated in place through <c>HighbdClipPixelAdd</c>.</param>
/// <param name="stride">Distance, in elements, between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="bd">Bit depth forwarded to the transform and clipping helpers.</param>
[SkipLocalsInit]
public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
{
    int i, j;
    Span<int> output = stackalloc int[8 * 8];
    Span<int> outptr = output;
    Span<int> tempIn = stackalloc int[8];
    Span<int> tempOut = stackalloc int[8];

    // [SkipLocalsInit] leaves the stack buffer uninitialised, so it is cleared explicitly.
    output.Fill(0);

    // First transform rows
    // Only the first 4 rows have non-zero coefficients
    for (i = 0; i < 4; ++i)
    {
        HighbdIdct8(input, outptr, bd);
        input = input.Slice(8);
        outptr = outptr.Slice(8);
    }

    // Then transform columns
    for (i = 0; i < 8; ++i)
    {
        for (j = 0; j < 8; ++j)
        {
            tempIn[j] = output[j * 8 + i];
        }

        HighbdIdct8(tempIn, tempOut, bd);
        for (j = 0; j < 8; ++j)
        {
            dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
        }
    }
}

/// <summary>
/// 8x8 inverse transform-and-add that reads only <c>input[0]</c>: the value goes twice through
/// the <c>CosPi16_64</c> multiply / <c>DctConstRoundShift</c> / <c>HighbdWrapLow</c> sequence, is
/// rounded by 5 bits, and the result is added to every pixel of the 8x8 area.
/// </summary>
/// <remarks>
/// The name does not follow the PascalCase pattern of the neighbouring methods; it is kept
/// unchanged so existing call sites keep compiling.
/// </remarks>
/// <param name="input">Coefficient buffer; only element 0 is read.</param>
/// <param name="dest">Pixel buffer updated in place; must hold at least <c>7 * stride + 8</c> elements.</param>
/// <param name="stride">Distance, in elements, between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
public static void vpx_Highbdidct8x8_1_add_c(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
{
    int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);

    output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
    long a1 = BitUtils.RoundPowerOfTwo(output, 5);

    for (int j = 0; j < 8; ++j)
    {
        // Slice each row from the start of the buffer instead of advancing the span after
        // every row: the advance after the last row threw when fewer than `stride`
        // elements followed it, even though no further pixel was going to be touched.
        Span<ushort> row = dest.Slice(j * stride, 8);
        for (int i = 0; i < row.Length; ++i)
        {
            row[i] = HighbdClipPixelAdd(row[i], a1, bd);
        }
    }
}
= input[1]; + int x15 = input[14]; + if (DetectInvalidHighbdInput(input, 16) != 0) + { + Debug.Assert(false, "invalid highbd txfm input"); + output.Slice(0, 16).Fill(0); + return; + } + + if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0) + { + output.Slice(0, 16).Fill(0); + return; + } + + // stage 1 + s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64; + s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64; + s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64; + s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64; + s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64; + s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64; + s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64; + s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64; + s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64; + s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64; + s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64; + s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64; + s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64; + s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64; + s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64; + s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64; + + x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd); + x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd); + x2 = HighbdWrapLow(DctConstRoundShift(s2 + s10), bd); + x3 = HighbdWrapLow(DctConstRoundShift(s3 + s11), bd); + x4 = HighbdWrapLow(DctConstRoundShift(s4 + s12), bd); + x5 = HighbdWrapLow(DctConstRoundShift(s5 + s13), bd); + x6 = HighbdWrapLow(DctConstRoundShift(s6 + s14), bd); + x7 = HighbdWrapLow(DctConstRoundShift(s7 + s15), bd); + x8 = HighbdWrapLow(DctConstRoundShift(s0 - s8), bd); + x9 = HighbdWrapLow(DctConstRoundShift(s1 - s9), bd); + x10 = HighbdWrapLow(DctConstRoundShift(s2 - s10), bd); + x11 = HighbdWrapLow(DctConstRoundShift(s3 - s11), bd); + x12 = HighbdWrapLow(DctConstRoundShift(s4 - s12), bd); + x13 = HighbdWrapLow(DctConstRoundShift(s5 - 
s13), bd); + x14 = HighbdWrapLow(DctConstRoundShift(s6 - s14), bd); + x15 = HighbdWrapLow(DctConstRoundShift(s7 - s15), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64; + s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64; + s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64; + s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64; + s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64; + s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64; + s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64; + s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64; + + x0 = HighbdWrapLow(s0 + s4, bd); + x1 = HighbdWrapLow(s1 + s5, bd); + x2 = HighbdWrapLow(s2 + s6, bd); + x3 = HighbdWrapLow(s3 + s7, bd); + x4 = HighbdWrapLow(s0 - s4, bd); + x5 = HighbdWrapLow(s1 - s5, bd); + x6 = HighbdWrapLow(s2 - s6, bd); + x7 = HighbdWrapLow(s3 - s7, bd); + x8 = HighbdWrapLow(DctConstRoundShift(s8 + s12), bd); + x9 = HighbdWrapLow(DctConstRoundShift(s9 + s13), bd); + x10 = HighbdWrapLow(DctConstRoundShift(s10 + s14), bd); + x11 = HighbdWrapLow(DctConstRoundShift(s11 + s15), bd); + x12 = HighbdWrapLow(DctConstRoundShift(s8 - s12), bd); + x13 = HighbdWrapLow(DctConstRoundShift(s9 - s13), bd); + x14 = HighbdWrapLow(DctConstRoundShift(s10 - s14), bd); + x15 = HighbdWrapLow(DctConstRoundShift(s11 - s15), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64; + s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64; + s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64; + s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64; + s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64; + s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64; + s15 = x14 * (long)CosPi8_64 + x15 * (long)CosPi24_64; + + x0 = HighbdWrapLow(s0 + s2, bd); + 
x1 = HighbdWrapLow(s1 + s3, bd); + x2 = HighbdWrapLow(s0 - s2, bd); + x3 = HighbdWrapLow(s1 - s3, bd); + x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd); + x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd); + x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd); + x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd); + x8 = HighbdWrapLow(s8 + s10, bd); + x9 = HighbdWrapLow(s9 + s11, bd); + x10 = HighbdWrapLow(s8 - s10, bd); + x11 = HighbdWrapLow(s9 - s11, bd); + x12 = HighbdWrapLow(DctConstRoundShift(s12 + s14), bd); + x13 = HighbdWrapLow(DctConstRoundShift(s13 + s15), bd); + x14 = HighbdWrapLow(DctConstRoundShift(s12 - s14), bd); + x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd); + + // stage 4 + s2 = (long)(-CosPi16_64) * (x2 + x3); + s3 = (long)CosPi16_64 * (x2 - x3); + s6 = (long)CosPi16_64 * (x6 + x7); + s7 = (long)CosPi16_64 * (-x6 + x7); + s10 = (long)CosPi16_64 * (x10 + x11); + s11 = (long)CosPi16_64 * (-x10 + x11); + s14 = (long)(-CosPi16_64) * (x14 + x15); + s15 = (long)CosPi16_64 * (x14 - x15); + + x2 = HighbdWrapLow(DctConstRoundShift(s2), bd); + x3 = HighbdWrapLow(DctConstRoundShift(s3), bd); + x6 = HighbdWrapLow(DctConstRoundShift(s6), bd); + x7 = HighbdWrapLow(DctConstRoundShift(s7), bd); + x10 = HighbdWrapLow(DctConstRoundShift(s10), bd); + x11 = HighbdWrapLow(DctConstRoundShift(s11), bd); + x14 = HighbdWrapLow(DctConstRoundShift(s14), bd); + x15 = HighbdWrapLow(DctConstRoundShift(s15), bd); + + output[0] = HighbdWrapLow(x0, bd); + output[1] = HighbdWrapLow(-x8, bd); + output[2] = HighbdWrapLow(x12, bd); + output[3] = HighbdWrapLow(-x4, bd); + output[4] = HighbdWrapLow(x6, bd); + output[5] = HighbdWrapLow(x14, bd); + output[6] = HighbdWrapLow(x10, bd); + output[7] = HighbdWrapLow(x2, bd); + output[8] = HighbdWrapLow(x3, bd); + output[9] = HighbdWrapLow(x11, bd); + output[10] = HighbdWrapLow(x15, bd); + output[11] = HighbdWrapLow(x7, bd); + output[12] = HighbdWrapLow(x5, bd); + output[13] = HighbdWrapLow(-x13, bd); + output[14] = 
/// <summary>
/// 16-point transform used by the high-bit-depth paths, computed in seven stages that
/// ping-pong between two 16-element scratch buffers.
/// </summary>
/// <param name="input">Source values; the first sixteen elements are read.</param>
/// <param name="output">Receives sixteen results; zero-filled when the input is rejected.</param>
/// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c>.</param>
[SkipLocalsInit]
public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
{
    // Both scratch buffers are fully written before they are read (stage 1 fills step1,
    // stage 2 fills step2), so skipping their zero-initialisation is safe.
    Span<int> step1 = stackalloc int[16];
    Span<int> step2 = stackalloc int[16];
    long temp1, temp2;

    // Rejected input: assert in debug builds, otherwise emit an all-zero result.
    if (DetectInvalidHighbdInput(input, 16) != 0)
    {
        Debug.Assert(false, "invalid highbd txfm input");
        output.Slice(0, 16).Fill(0);
        return;
    }

    // stage 1
    // The index expressions are constant integer divisions; input is read at
    // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15.
    step1[0] = input[0 / 2];
    step1[1] = input[16 / 2];
    step1[2] = input[8 / 2];
    step1[3] = input[24 / 2];
    step1[4] = input[4 / 2];
    step1[5] = input[20 / 2];
    step1[6] = input[12 / 2];
    step1[7] = input[28 / 2];
    step1[8] = input[2 / 2];
    step1[9] = input[18 / 2];
    step1[10] = input[10 / 2];
    step1[11] = input[26 / 2];
    step1[12] = input[6 / 2];
    step1[13] = input[22 / 2];
    step1[14] = input[14 / 2];
    step1[15] = input[30 / 2];

    // stage 2
    // Elements 0..7 pass through; elements 8..15 are rotated in mirrored pairs
    // (8,15), (9,14), (10,13), (11,12).
    step2[0] = step1[0];
    step2[1] = step1[1];
    step2[2] = step1[2];
    step2[3] = step1[3];
    step2[4] = step1[4];
    step2[5] = step1[5];
    step2[6] = step1[6];
    step2[7] = step1[7];

    temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
    temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
    step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);

    temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
    temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
    step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);

    temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
    temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
    step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);

    temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
    temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
    step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);

    // stage 3
    // Elements 0..3 pass through, 4..7 are rotated, 8..15 are combined by add/subtract.
    step1[0] = step2[0];
    step1[1] = step2[1];
    step1[2] = step2[2];
    step1[3] = step2[3];

    temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
    temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
    step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
    temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
    temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
    step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);

    step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
    step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
    step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
    step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
    step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
    step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
    step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
    step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);

    // stage 4
    temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
    temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
    step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
    temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
    temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
    step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
    step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
    step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
    step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
    step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);

    // Elements 8, 11, 12 and 15 pass through; (9,14) and (10,13) are rotated.
    step2[8] = step1[8];
    step2[15] = step1[15];
    temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
    temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
    step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
    temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
    temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
    step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
    step2[11] = step1[11];
    step2[12] = step1[12];

    // stage 5
    step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
    step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
    step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
    step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
    step1[4] = step2[4];
    temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
    temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
    step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
    step1[7] = step2[7];

    step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
    step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
    step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
    step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
    step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
    step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
    step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
    step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);

    // stage 6
    step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
    step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
    step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
    step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
    step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
    step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
    step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
    step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
    step2[8] = step1[8];
    step2[9] = step1[9];
    temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
    temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
    step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
    temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
    temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
    step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
    step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
    step2[14] = step1[14];
    step2[15] = step1[15];

    // stage 7
    // Mirror-combine: output[k] and output[15 - k] are the sum and difference of
    // step2[k] and step2[15 - k].
    output[0] = HighbdWrapLow(step2[0] + step2[15], bd);
    output[1] = HighbdWrapLow(step2[1] + step2[14], bd);
    output[2] = HighbdWrapLow(step2[2] + step2[13], bd);
    output[3] = HighbdWrapLow(step2[3] + step2[12], bd);
    output[4] = HighbdWrapLow(step2[4] + step2[11], bd);
    output[5] = HighbdWrapLow(step2[5] + step2[10], bd);
    output[6] = HighbdWrapLow(step2[6] + step2[9], bd);
    output[7] = HighbdWrapLow(step2[7] + step2[8], bd);
    output[8] = HighbdWrapLow(step2[7] - step2[8], bd);
    output[9] = HighbdWrapLow(step2[6] - step2[9], bd);
    output[10] = HighbdWrapLow(step2[5] - step2[10], bd);
    output[11] = HighbdWrapLow(step2[4] - step2[11], bd);
    output[12] = HighbdWrapLow(step2[3] - step2[12], bd);
    output[13] = HighbdWrapLow(step2[2] - step2[13], bd);
    output[14] = HighbdWrapLow(step2[1] - step2[14], bd);
    output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
}
/// <summary>
/// 16x16 inverse transform-and-add that runs <c>HighbdIdct16</c> on the first 8 rows only,
/// leaves the remaining intermediate rows at zero, and then transforms every column.
/// </summary>
/// <param name="input">Coefficients in row-major order, 16 values per row; the first 8 rows are read.</param>
/// <param name="dest">Pixel buffer updated in place; must hold at least <c>15 * stride + 16</c> elements.</param>
/// <param name="stride">Distance, in elements, between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="bd">Bit depth forwarded to the transform and clipping helpers.</param>
[SkipLocalsInit]
public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
{
    int i, j;
    Span<int> output = stackalloc int[16 * 16];
    Span<int> outptr = output;
    Span<int> tempIn = stackalloc int[16];
    Span<int> tempOut = stackalloc int[16];

    // [SkipLocalsInit] leaves the stack buffer uninitialised, so it is cleared explicitly.
    output.Fill(0);

    // First transform rows. Since all non-zero dct coefficients are in the
    // upper-left 8x8 area, only the first 8 rows need to be calculated here.
    for (i = 0; i < 8; ++i)
    {
        HighbdIdct16(input, outptr, bd);
        input = input.Slice(16);
        outptr = outptr.Slice(16);
    }

    // Then transform columns
    for (i = 0; i < 16; ++i)
    {
        for (j = 0; j < 16; ++j)
        {
            tempIn[j] = output[j * 16 + i];
        }

        HighbdIdct16(tempIn, tempOut, bd);
        for (j = 0; j < 16; ++j)
        {
            // Index the destination directly, as HighbdIdct16x16256Add and
            // HighbdIdct16x1610Add do. Advancing a span by `stride` after each row also
            // advanced it after the last row and threw when the buffer ended right after
            // the final pixel row.
            dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
        }
    }
}
/// <summary>
/// 16x16 inverse transform-and-add that reads only <c>input[0]</c>: the value goes twice
/// through the <c>CosPi16_64</c> multiply / <c>DctConstRoundShift</c> / <c>HighbdWrapLow</c>
/// sequence, is rounded by 6 bits, and the result is added to every pixel of the 16x16 area.
/// </summary>
/// <param name="input">Coefficient buffer; only element 0 is read.</param>
/// <param name="dest">Pixel buffer updated in place; must hold at least <c>15 * stride + 16</c> elements.</param>
/// <param name="stride">Distance, in elements, between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
public static void HighbdIdct16x161Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
{
    int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);

    output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
    long a1 = BitUtils.RoundPowerOfTwo(output, 6);

    for (int j = 0; j < 16; ++j)
    {
        // Slice each row from the start of the buffer instead of advancing the span after
        // every row: the advance after the last row threw when fewer than `stride`
        // elements followed it, even though no further pixel was going to be touched.
        Span<ushort> row = dest.Slice(j * stride, 16);
        for (int i = 0; i < row.Length; ++i)
        {
            row[i] = HighbdClipPixelAdd(row[i], a1, bd);
        }
    }
}
(long)CosPi15_64 - input[15] * (long)CosPi17_64; + temp2 = input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64; + step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64; + temp2 = input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64; + step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64; + temp2 = input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64; + step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64; + temp2 = input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64; + step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64; + temp2 = input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64; + step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64; + temp2 = input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64; + step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64; + temp2 = input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64; + step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = 
step1[7]; + + temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64; + temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64; + step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64; + temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64; + step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64; + temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64; + step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64; + temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64; + step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + step2[16] = HighbdWrapLow(step1[16] + step1[17], bd); + step2[17] = HighbdWrapLow(step1[16] - step1[17], bd); + step2[18] = HighbdWrapLow(-step1[18] + step1[19], bd); + step2[19] = HighbdWrapLow(step1[18] + step1[19], bd); + step2[20] = HighbdWrapLow(step1[20] + step1[21], bd); + step2[21] = HighbdWrapLow(step1[20] - step1[21], bd); + step2[22] = HighbdWrapLow(-step1[22] + step1[23], bd); + step2[23] = HighbdWrapLow(step1[22] + step1[23], bd); + step2[24] = HighbdWrapLow(step1[24] + step1[25], bd); + step2[25] = HighbdWrapLow(step1[24] - step1[25], bd); + step2[26] = HighbdWrapLow(-step1[26] + step1[27], bd); + step2[27] = HighbdWrapLow(step1[26] + step1[27], bd); + step2[28] = HighbdWrapLow(step1[28] + step1[29], bd); + step2[29] = HighbdWrapLow(step1[28] - step1[29], bd); + step2[30] = HighbdWrapLow(-step1[30] + step1[31], bd); + step2[31] = HighbdWrapLow(step1[30] + step1[31], bd); + + // stage 3 + step1[0] = step2[0]; + 
step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64; + temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64; + step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64; + temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64; + step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + + step1[8] = HighbdWrapLow(step2[8] + step2[9], bd); + step1[9] = HighbdWrapLow(step2[8] - step2[9], bd); + step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd); + step1[11] = HighbdWrapLow(step2[10] + step2[11], bd); + step1[12] = HighbdWrapLow(step2[12] + step2[13], bd); + step1[13] = HighbdWrapLow(step2[12] - step2[13], bd); + step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd); + step1[15] = HighbdWrapLow(step2[14] + step2[15], bd); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * (long)CosPi4_64 + step2[30] * (long)CosPi28_64; + temp2 = step2[17] * (long)CosPi28_64 + step2[30] * (long)CosPi4_64; + step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = -step2[18] * (long)CosPi28_64 - step2[29] * (long)CosPi4_64; + temp2 = -step2[18] * (long)CosPi4_64 + step2[29] * (long)CosPi28_64; + step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * (long)CosPi20_64 + step2[26] * (long)CosPi12_64; + temp2 = step2[21] * (long)CosPi12_64 + step2[26] * (long)CosPi20_64; + step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = -step2[22] * (long)CosPi12_64 - step2[25] * (long)CosPi20_64; + temp2 = 
-step2[22] * (long)CosPi20_64 + step2[25] * (long)CosPi12_64; + step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * (long)CosPi16_64; + temp2 = (step1[0] - step1[1]) * (long)CosPi16_64; + step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64; + temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64; + step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + step2[4] = HighbdWrapLow(step1[4] + step1[5], bd); + step2[5] = HighbdWrapLow(step1[4] - step1[5], bd); + step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd); + step2[7] = HighbdWrapLow(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64; + temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64; + step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64; + temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64; + step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = HighbdWrapLow(step1[16] + step1[19], bd); + step2[17] = HighbdWrapLow(step1[17] + step1[18], bd); + step2[18] = HighbdWrapLow(step1[17] - step1[18], bd); + step2[19] = HighbdWrapLow(step1[16] - step1[19], bd); + step2[20] = HighbdWrapLow(-step1[20] + step1[23], bd); + step2[21] = HighbdWrapLow(-step1[21] + step1[22], bd); + step2[22] = HighbdWrapLow(step1[21] + step1[22], 
bd); + step2[23] = HighbdWrapLow(step1[20] + step1[23], bd); + + step2[24] = HighbdWrapLow(step1[24] + step1[27], bd); + step2[25] = HighbdWrapLow(step1[25] + step1[26], bd); + step2[26] = HighbdWrapLow(step1[25] - step1[26], bd); + step2[27] = HighbdWrapLow(step1[24] - step1[27], bd); + step2[28] = HighbdWrapLow(-step1[28] + step1[31], bd); + step2[29] = HighbdWrapLow(-step1[29] + step1[30], bd); + step2[30] = HighbdWrapLow(step1[29] + step1[30], bd); + step2[31] = HighbdWrapLow(step1[28] + step1[31], bd); + + // stage 5 + step1[0] = HighbdWrapLow(step2[0] + step2[3], bd); + step1[1] = HighbdWrapLow(step2[1] + step2[2], bd); + step1[2] = HighbdWrapLow(step2[1] - step2[2], bd); + step1[3] = HighbdWrapLow(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * (long)CosPi16_64; + temp2 = (step2[5] + step2[6]) * (long)CosPi16_64; + step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + step1[7] = step2[7]; + + step1[8] = HighbdWrapLow(step2[8] + step2[11], bd); + step1[9] = HighbdWrapLow(step2[9] + step2[10], bd); + step1[10] = HighbdWrapLow(step2[9] - step2[10], bd); + step1[11] = HighbdWrapLow(step2[8] - step2[11], bd); + step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd); + step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd); + step1[14] = HighbdWrapLow(step2[13] + step2[14], bd); + step1[15] = HighbdWrapLow(step2[12] + step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * (long)CosPi8_64 + step2[29] * (long)CosPi24_64; + temp2 = step2[18] * (long)CosPi24_64 + step2[29] * (long)CosPi8_64; + step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = -step2[19] * (long)CosPi8_64 + step2[28] * (long)CosPi24_64; + temp2 = step2[19] * (long)CosPi24_64 + step2[28] * (long)CosPi8_64; + step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[28] = 
HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = -step2[20] * (long)CosPi24_64 - step2[27] * (long)CosPi8_64; + temp2 = -step2[20] * (long)CosPi8_64 + step2[27] * (long)CosPi24_64; + step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = -step2[21] * (long)CosPi24_64 - step2[26] * (long)CosPi8_64; + temp2 = -step2[21] * (long)CosPi8_64 + step2[26] * (long)CosPi24_64; + step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = HighbdWrapLow(step1[0] + step1[7], bd); + step2[1] = HighbdWrapLow(step1[1] + step1[6], bd); + step2[2] = HighbdWrapLow(step1[2] + step1[5], bd); + step2[3] = HighbdWrapLow(step1[3] + step1[4], bd); + step2[4] = HighbdWrapLow(step1[3] - step1[4], bd); + step2[5] = HighbdWrapLow(step1[2] - step1[5], bd); + step2[6] = HighbdWrapLow(step1[1] - step1[6], bd); + step2[7] = HighbdWrapLow(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64; + temp2 = (step1[10] + step1[13]) * (long)CosPi16_64; + step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64; + temp2 = (step1[11] + step1[12]) * (long)CosPi16_64; + step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = HighbdWrapLow(step1[16] + step1[23], bd); + step2[17] = HighbdWrapLow(step1[17] + step1[22], bd); + step2[18] = HighbdWrapLow(step1[18] + step1[21], bd); + step2[19] = HighbdWrapLow(step1[19] + step1[20], bd); + step2[20] = HighbdWrapLow(step1[19] - step1[20], bd); + 
step2[21] = HighbdWrapLow(step1[18] - step1[21], bd); + step2[22] = HighbdWrapLow(step1[17] - step1[22], bd); + step2[23] = HighbdWrapLow(step1[16] - step1[23], bd); + + step2[24] = HighbdWrapLow(-step1[24] + step1[31], bd); + step2[25] = HighbdWrapLow(-step1[25] + step1[30], bd); + step2[26] = HighbdWrapLow(-step1[26] + step1[29], bd); + step2[27] = HighbdWrapLow(-step1[27] + step1[28], bd); + step2[28] = HighbdWrapLow(step1[27] + step1[28], bd); + step2[29] = HighbdWrapLow(step1[26] + step1[29], bd); + step2[30] = HighbdWrapLow(step1[25] + step1[30], bd); + step2[31] = HighbdWrapLow(step1[24] + step1[31], bd); + + // stage 7 + step1[0] = HighbdWrapLow(step2[0] + step2[15], bd); + step1[1] = HighbdWrapLow(step2[1] + step2[14], bd); + step1[2] = HighbdWrapLow(step2[2] + step2[13], bd); + step1[3] = HighbdWrapLow(step2[3] + step2[12], bd); + step1[4] = HighbdWrapLow(step2[4] + step2[11], bd); + step1[5] = HighbdWrapLow(step2[5] + step2[10], bd); + step1[6] = HighbdWrapLow(step2[6] + step2[9], bd); + step1[7] = HighbdWrapLow(step2[7] + step2[8], bd); + step1[8] = HighbdWrapLow(step2[7] - step2[8], bd); + step1[9] = HighbdWrapLow(step2[6] - step2[9], bd); + step1[10] = HighbdWrapLow(step2[5] - step2[10], bd); + step1[11] = HighbdWrapLow(step2[4] - step2[11], bd); + step1[12] = HighbdWrapLow(step2[3] - step2[12], bd); + step1[13] = HighbdWrapLow(step2[2] - step2[13], bd); + step1[14] = HighbdWrapLow(step2[1] - step2[14], bd); + step1[15] = HighbdWrapLow(step2[0] - step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * (long)CosPi16_64; + temp2 = (step2[20] + step2[27]) * (long)CosPi16_64; + step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = (-step2[21] + step2[26]) * (long)CosPi16_64; + temp2 = (step2[21] + step2[26]) * (long)CosPi16_64; + step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), 
bd); + step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = (-step2[22] + step2[25]) * (long)CosPi16_64; + temp2 = (step2[22] + step2[25]) * (long)CosPi16_64; + step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + temp1 = (-step2[23] + step2[24]) * (long)CosPi16_64; + temp2 = (step2[23] + step2[24]) * (long)CosPi16_64; + step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd); + step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = HighbdWrapLow(step1[0] + step1[31], bd); + output[1] = HighbdWrapLow(step1[1] + step1[30], bd); + output[2] = HighbdWrapLow(step1[2] + step1[29], bd); + output[3] = HighbdWrapLow(step1[3] + step1[28], bd); + output[4] = HighbdWrapLow(step1[4] + step1[27], bd); + output[5] = HighbdWrapLow(step1[5] + step1[26], bd); + output[6] = HighbdWrapLow(step1[6] + step1[25], bd); + output[7] = HighbdWrapLow(step1[7] + step1[24], bd); + output[8] = HighbdWrapLow(step1[8] + step1[23], bd); + output[9] = HighbdWrapLow(step1[9] + step1[22], bd); + output[10] = HighbdWrapLow(step1[10] + step1[21], bd); + output[11] = HighbdWrapLow(step1[11] + step1[20], bd); + output[12] = HighbdWrapLow(step1[12] + step1[19], bd); + output[13] = HighbdWrapLow(step1[13] + step1[18], bd); + output[14] = HighbdWrapLow(step1[14] + step1[17], bd); + output[15] = HighbdWrapLow(step1[15] + step1[16], bd); + output[16] = HighbdWrapLow(step1[15] - step1[16], bd); + output[17] = HighbdWrapLow(step1[14] - step1[17], bd); + output[18] = HighbdWrapLow(step1[13] - step1[18], bd); + output[19] = HighbdWrapLow(step1[12] - step1[19], bd); + output[20] = HighbdWrapLow(step1[11] - step1[20], bd); + output[21] = HighbdWrapLow(step1[10] - step1[21], bd); + output[22] = HighbdWrapLow(step1[9] - step1[22], bd); + output[23] = HighbdWrapLow(step1[8] - step1[23], bd); + 
output[24] = HighbdWrapLow(step1[7] - step1[24], bd); + output[25] = HighbdWrapLow(step1[6] - step1[25], bd); + output[26] = HighbdWrapLow(step1[5] - step1[26], bd); + output[27] = HighbdWrapLow(step1[4] - step1[27], bd); + output[28] = HighbdWrapLow(step1[3] - step1[28], bd); + output[29] = HighbdWrapLow(step1[2] - step1[29], bd); + output[30] = HighbdWrapLow(step1[1] - step1[30], bd); + output[31] = HighbdWrapLow(step1[0] - step1[31], bd); + } + + [SkipLocalsInit] + public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) + { + int i, j; + Span<int> output = stackalloc int[32 * 32]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[32]; + Span<int> tempOut = stackalloc int[32]; + + // Rows + for (i = 0; i < 32; ++i) + { + int zeroCoeff = 0; + for (j = 0; j < 32; ++j) + { + zeroCoeff |= input[j]; + } + + if (zeroCoeff != 0) + { + HighbdIdct32(input, outptr, bd); + } + else + { + outptr.Slice(0, 32).Fill(0); + } + + input = input.Slice(32); + outptr = outptr.Slice(32); + } + + // Columns + for (i = 0; i < 32; ++i) + { + for (j = 0; j < 32; ++j) + { + tempIn[j] = output[j * 32 + i]; + } + + HighbdIdct32(tempIn, tempOut, bd); + for (j = 0; j < 32; ++j) + { + dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + } + } + } + + [SkipLocalsInit] + public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) + { + int i, j; + Span<int> output = stackalloc int[32 * 32]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[32]; + Span<int> tempOut = stackalloc int[32]; + + output.Fill(0); + + // Rows + // Only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) + { + HighbdIdct32(input, outptr, bd); + input = input.Slice(32); + outptr = outptr.Slice(32); + } + + // Columns + for (i = 0; i < 32; ++i) + { + Span<ushort> destT = dest; + for (j = 0; j < 32; ++j) + { + tempIn[j] = output[j * 
32 + i]; + } + + HighbdIdct32(tempIn, tempOut, bd); + for (j = 0; j < 32; ++j) + { + destT[i] = HighbdClipPixelAdd(destT[i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + destT = destT.Slice(stride); + } + } + } + + [SkipLocalsInit] + public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) + { + int i, j; + Span<int> output = stackalloc int[32 * 32]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[32]; + Span<int> tempOut = stackalloc int[32]; + + output.Fill(0); + + // Rows + // Only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) + { + HighbdIdct32(input, outptr, bd); + input = input.Slice(32); + outptr = outptr.Slice(32); + } + + // Columns + for (i = 0; i < 32; ++i) + { + for (j = 0; j < 32; ++j) + { + tempIn[j] = output[j * 32 + i]; + } + + HighbdIdct32(tempIn, tempOut, bd); + for (j = 0; j < 32; ++j) + { + dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + } + } + } + + public static void HighbdIdct32x321Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) + { + int i, j; + int a1; + int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); + + output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); + a1 = BitUtils.RoundPowerOfTwo(output, 6); + + for (j = 0; j < 32; ++j) + { + for (i = 0; i < 32; ++i) + { + dest[i] = HighbdClipPixelAdd(dest[i], a1, bd); + } + + dest = dest.Slice(stride); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs new file mode 100644 index 00000000..0d5e8b6e --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs @@ -0,0 +1,73 @@ +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using System; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp +{ + internal static class Prob + { + public const int MaxProb = 255; + + private static byte GetProb(uint num, uint 
den) + { + Debug.Assert(den != 0); + { + int p = (int)(((ulong)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 1 : p; + int clippedProb = p | ((255 - p) >> 23) | (p == 0 ? 1 : 0); + return (byte)clippedProb; + } + } + + /* This function assumes prob1 and prob2 are already within [1,255] range. */ + public static byte WeightedProb(int prob1, int prob2, int factor) + { + return (byte)BitUtils.RoundPowerOfTwo(prob1 * (256 - factor) + prob2 * factor, 8); + } + + // MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; + private static readonly uint[] CountToUpdateFactor = new uint[] + { + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, + 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 + }; + + private const int ModeMvCountSat = 20; + + public static byte ModeMvMergeProbs(byte preProb, uint ct0, uint ct1) + { + uint den = ct0 + ct1; + if (den == 0) + { + return preProb; + } + else + { + uint count = Math.Min(den, ModeMvCountSat); + uint factor = CountToUpdateFactor[(int)count]; + byte prob = GetProb(ct0, den); + return WeightedProb(preProb, prob, (int)factor); + } + } + + private static uint TreeMergeProbsImpl( + uint i, + sbyte[] tree, + ReadOnlySpan<byte> preProbs, + ReadOnlySpan<uint> counts, + Span<byte> probs) + { + int l = tree[i]; + uint leftCount = (l <= 0) ? counts[-l] : TreeMergeProbsImpl((uint)l, tree, preProbs, counts, probs); + int r = tree[i + 1]; + uint rightCount = (r <= 0) ? 
counts[-r] : TreeMergeProbsImpl((uint)r, tree, preProbs, counts, probs); + probs[(int)(i >> 1)] = ModeMvMergeProbs(preProbs[(int)(i >> 1)], leftCount, rightCount); + return leftCount + rightCount; + } + + public static void TreeMergeProbs(sbyte[] tree, ReadOnlySpan<byte> preProbs, ReadOnlySpan<uint> counts, Span<byte> probs) + { + TreeMergeProbsImpl(0, tree, preProbs, counts, probs); + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs new file mode 100644 index 00000000..05095121 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs @@ -0,0 +1,237 @@ +using Ryujinx.Common.Memory; +using System; +using System.Buffers.Binary; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp +{ + internal struct Reader + { + private static readonly byte[] Norm = new byte[] + { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + private const int BdValueSize = sizeof(ulong) * 8; + + // This is meant to be a large, positive constant that can still be efficiently + // loaded as an immediate (on platforms like ARM, for example). + // Even relatively modest values like 100 would work fine. 
+ private const int LotsOfBits = 0x40000000; + + public ulong Value; + public uint Range; + public int Count; + private ArrayPtr<byte> _buffer; + + public bool Init(ArrayPtr<byte> buffer, int size) + { + if (size != 0 && buffer.IsNull) + { + return true; + } + else + { + _buffer = new ArrayPtr<byte>(ref buffer[0], size); + Value = 0; + Count = -8; + Range = 255; + Fill(); + return ReadBit() != 0; // Marker bit + } + } + + private void Fill() + { + ReadOnlySpan<byte> buffer = _buffer.AsSpan(); + ReadOnlySpan<byte> bufferStart = buffer; + ulong value = Value; + int count = Count; + ulong bytesLeft = (ulong)buffer.Length; + ulong bitsLeft = bytesLeft * 8; + int shift = BdValueSize - 8 - (count + 8); + + if (bitsLeft > BdValueSize) + { + int bits = (shift & unchecked((int)0xfffffff8)) + 8; + ulong nv; + ulong bigEndianValues = BinaryPrimitives.ReadUInt64BigEndian(buffer); + nv = bigEndianValues >> (BdValueSize - bits); + count += bits; + buffer = buffer.Slice(bits >> 3); + value = Value | (nv << (shift & 0x7)); + } + else + { + int bitsOver = shift + 8 - (int)bitsLeft; + int loopEnd = 0; + if (bitsOver >= 0) + { + count += LotsOfBits; + loopEnd = bitsOver; + } + + if (bitsOver < 0 || bitsLeft != 0) + { + while (shift >= loopEnd) + { + count += 8; + value |= (ulong)buffer[0] << shift; + buffer = buffer.Slice(1); + shift -= 8; + } + } + } + + // NOTE: Variable 'buffer' may not relate to '_buffer' after decryption, + // so we increase '_buffer' by the amount that 'buffer' moved, rather than + // assign 'buffer' to '_buffer'. + _buffer = _buffer.Slice(bufferStart.Length - buffer.Length); + Value = value; + Count = count; + } + + public bool HasError() + { + // Check if we have reached the end of the buffer. + // + // Variable 'count' stores the number of bits in the 'value' buffer, minus + // 8. The top byte is part of the algorithm, and the remainder is buffered + // to be shifted into it. 
So if count == 8, the top 16 bits of 'value' are + // occupied, 8 for the algorithm and 8 in the buffer. + // + // When reading a byte from the user's buffer, count is filled with 8 and + // one byte is filled into the value buffer. When we reach the end of the + // data, count is additionally filled with LotsOfBits. So when + // count == LotsOfBits - 1, the user's data has been exhausted. + // + // 1 if we have tried to decode bits after the end of stream was encountered. + // 0 No error. + return Count > BdValueSize && Count < LotsOfBits; + } + + public int Read(int prob) + { + uint bit = 0; + ulong value; + ulong bigsplit; + int count; + uint range; + uint split = (Range * (uint)prob + (256 - (uint)prob)) >> 8; + + if (Count < 0) + { + Fill(); + } + + value = Value; + count = Count; + + bigsplit = (ulong)split << (BdValueSize - 8); + + range = split; + + if (value >= bigsplit) + { + range = Range - split; + value -= bigsplit; + bit = 1; + } + + { + int shift = Norm[range]; + range <<= shift; + value <<= shift; + count -= shift; + } + Value = value; + Count = count; + Range = range; + + return (int)bit; + } + + public int ReadBit() + { + return Read(128); // vpx_prob_half + } + + public int ReadLiteral(int bits) + { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) + { + literal |= ReadBit() << bit; + } + + return literal; + } + + public int ReadTree(ReadOnlySpan<sbyte> tree, ReadOnlySpan<byte> probs) + { + sbyte i = 0; + + while ((i = tree[i + Read(probs[i >> 1])]) > 0) + { + continue; + } + + return -i; + } + + public int ReadBool(int prob, ref ulong value, ref int count, ref uint range) + { + uint split = (range * (uint)prob + (256 - (uint)prob)) >> 8; + ulong bigsplit = (ulong)split << (BdValueSize - 8); + + if (count < 0) + { + Value = value; + Count = count; + Fill(); + value = Value; + count = Count; + } + + if (value >= bigsplit) + { + range = range - split; + value = value - bigsplit; + { + int shift = Norm[range]; + range <<= shift; + 
value <<= shift; + count -= shift; + } + return 1; + } + range = split; + { + int shift = Norm[range]; + range <<= shift; + value <<= shift; + count -= shift; + } + return 0; + } + + public ArrayPtr<byte> FindEnd() + { + // Find the end of the coded buffer + while (Count > 8 && Count < BdValueSize) + { + Count -= 8; + _buffer = _buffer.Slice(-1); + } + return _buffer; + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs new file mode 100644 index 00000000..e041f2e0 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs @@ -0,0 +1,54 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp +{ + internal static class TxfmCommon + { + // Constants used by all idct/dct functions + public const int DctConstBits = 14; + public const int DctConstRounding = 1 << (DctConstBits - 1); + + public const int UnitQuantShift = 2; + public const int UnitQuantFactor = 1 << UnitQuantShift; + + // Constants: + // for (int i = 1; i < 32; ++i) + // Console.WriteLine("public const short CosPi{0}_64 = {1};", i, MathF.Round(16384 * MathF.Cos(i * MathF.PI / 64))); + // Note: sin(k * Pi / 64) = cos((32 - k) * Pi / 64) + public const short CosPi1_64 = 16364; + public const short CosPi2_64 = 16305; + public const short CosPi3_64 = 16207; + public const short CosPi4_64 = 16069; + public const short CosPi5_64 = 15893; + public const short CosPi6_64 = 15679; + public const short CosPi7_64 = 15426; + public const short CosPi8_64 = 15137; + public const short CosPi9_64 = 14811; + public const short CosPi10_64 = 14449; + public const short CosPi11_64 = 14053; + public const short CosPi12_64 = 13623; + public const short CosPi13_64 = 13160; + public const short CosPi14_64 = 12665; + public const short CosPi15_64 = 12140; + public const short CosPi16_64 = 11585; + public const short CosPi17_64 = 11003; + public const short CosPi18_64 = 10394; + public const short CosPi19_64 = 9760; + public const short CosPi20_64 = 9102; + public 
const short CosPi21_64 = 8423; + public const short CosPi22_64 = 7723; + public const short CosPi23_64 = 7005; + public const short CosPi24_64 = 6270; + public const short CosPi25_64 = 5520; + public const short CosPi26_64 = 4756; + public const short CosPi27_64 = 3981; + public const short CosPi28_64 = 3196; + public const short CosPi29_64 = 2404; + public const short CosPi30_64 = 1606; + public const short CosPi31_64 = 804; + + // 16384 * sqrt(2) * sin(kPi / 9) * 2 / 3 + public const short SinPi1_9 = 5283; + public const short SinPi2_9 = 9929; + public const short SinPi3_9 = 13377; + public const short SinPi4_9 = 15212; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs new file mode 100644 index 00000000..9fa5842a --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs @@ -0,0 +1,536 @@ +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using System; +using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.InvTxfm; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class Idct + { + private delegate void Transform1D(ReadOnlySpan<int> input, Span<int> output); + private delegate void HighbdTransform1D(ReadOnlySpan<int> input, Span<int> output, int bd); + + private struct Transform2D + { + public Transform1D Cols, Rows; // Vertical and horizontal + + public Transform2D(Transform1D cols, Transform1D rows) + { + Cols = cols; + Rows = rows; + } + } + + private struct HighbdTransform2D + { + public HighbdTransform1D Cols, Rows; // Vertical and horizontal + + public HighbdTransform2D(HighbdTransform1D cols, HighbdTransform1D rows) + { + Cols = cols; + Rows = rows; + } + } + + private static readonly Transform2D[] Iht4 = new Transform2D[] + { + new Transform2D(Idct4, Idct4), // DCT_DCT = 0 + new Transform2D(Iadst4, Idct4), // ADST_DCT = 1 + new Transform2D(Idct4, Iadst4), // DCT_ADST = 2 + new Transform2D(Iadst4, Iadst4) // ADST_ADST = 3 + }; + + public static void 
Iht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType) + { + int i, j; + Span<int> output = stackalloc int[4 * 4]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[4]; + Span<int> tempOut = stackalloc int[4]; + + // Inverse transform row vectors + for (i = 0; i < 4; ++i) + { + Iht4[txType].Rows(input, outptr); + input = input.Slice(4); + outptr = outptr.Slice(4); + } + + // Inverse transform column vectors + for (i = 0; i < 4; ++i) + { + for (j = 0; j < 4; ++j) + { + tempIn[j] = output[j * 4 + i]; + } + + Iht4[txType].Cols(tempIn, tempOut); + for (j = 0; j < 4; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4)); + } + } + } + + private static readonly Transform2D[] Iht8 = new Transform2D[] + { + new Transform2D(Idct8, Idct8), // DCT_DCT = 0 + new Transform2D(Iadst8, Idct8), // ADST_DCT = 1 + new Transform2D(Idct8, Iadst8), // DCT_ADST = 2 + new Transform2D(Iadst8, Iadst8) // ADST_ADST = 3 + }; + + public static void Iht8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType) + { + int i, j; + Span<int> output = stackalloc int[8 * 8]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[8]; + Span<int> tempOut = stackalloc int[8]; + Transform2D ht = Iht8[txType]; + + // Inverse transform row vectors + for (i = 0; i < 8; ++i) + { + ht.Rows(input, outptr); + input = input.Slice(8); + outptr = outptr.Slice(8); + } + + // Inverse transform column vectors + for (i = 0; i < 8; ++i) + { + for (j = 0; j < 8; ++j) + { + tempIn[j] = output[j * 8 + i]; + } + + ht.Cols(tempIn, tempOut); + for (j = 0; j < 8; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5)); + } + } + } + + private static readonly Transform2D[] Iht16 = new Transform2D[] + { + new Transform2D(Idct16, Idct16), // DCT_DCT = 0 + new Transform2D(Iadst16, Idct16), // ADST_DCT = 1 + new Transform2D(Idct16, Iadst16), // DCT_ADST = 2 + 
new Transform2D(Iadst16, Iadst16) // ADST_ADST = 3 + }; + + public static void Iht16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType) + { + int i, j; + Span<int> output = stackalloc int[16 * 16]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[16]; + Span<int> tempOut = stackalloc int[16]; + Transform2D ht = Iht16[txType]; + + // Rows + for (i = 0; i < 16; ++i) + { + ht.Rows(input, outptr); + input = input.Slice(16); + outptr = outptr.Slice(16); + } + + // Columns + for (i = 0; i < 16; ++i) + { + for (j = 0; j < 16; ++j) + { + tempIn[j] = output[j * 16 + i]; + } + + ht.Cols(tempIn, tempOut); + for (j = 0; j < 16; ++j) + { + dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + } + } + } + + // Idct + public static void Idct4x4Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob) + { + if (eob > 1) + { + Idct4x416Add(input, dest, stride); + } + else + { + Idct4x41Add(input, dest, stride); + } + } + + public static void Iwht4x4Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob) + { + if (eob > 1) + { + Iwht4x416Add(input, dest, stride); + } + else + { + Iwht4x41Add(input, dest, stride); + } + } + + public static void Idct8x8Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob) + { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. 
+ if (eob == 1) + { + // DC only DCT coefficient + Idct8x81Add(input, dest, stride); + } + else if (eob <= 12) + { + Idct8x812Add(input, dest, stride); + } + else + { + Idct8x864Add(input, dest, stride); + } + } + + public static void Idct16x16Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob) + { + /* The calculation can be simplified if there are not many non-zero dct + * coefficients. Use eobs to separate different cases. */ + if (eob == 1) /* DC only DCT coefficient. */ + { + Idct16x161Add(input, dest, stride); + } + else if (eob <= 10) + { + Idct16x1610Add(input, dest, stride); + } + else if (eob <= 38) + { + Idct16x1638Add(input, dest, stride); + } + else + { + Idct16x16256Add(input, dest, stride); + } + } + + public static void Idct32x32Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob) + { + if (eob == 1) + { + Idct32x321Add(input, dest, stride); + } + else if (eob <= 34) + { + // Non-zero coeff only in upper-left 8x8 + Idct32x3234Add(input, dest, stride); + } + else if (eob <= 135) + { + // Non-zero coeff only in upper-left 16x16 + Idct32x32135Add(input, dest, stride); + } + else + { + Idct32x321024Add(input, dest, stride); + } + } + + // Iht + public static void Iht4x4Add(TxType txType, ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob) + { + if (txType == TxType.DctDct) + { + Idct4x4Add(input, dest, stride, eob); + } + else + { + Iht4x416Add(input, dest, stride, (int)txType); + } + } + + public static void Iht8x8Add(TxType txType, ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob) + { + if (txType == TxType.DctDct) + { + Idct8x8Add(input, dest, stride, eob); + } + else + { + Iht8x864Add(input, dest, stride, (int)txType); + } + } + + public static void Iht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<byte> dest, + int stride, int eob) + { + if (txType == TxType.DctDct) + { + Idct16x16Add(input, dest, stride, eob); + } + else + { + Iht16x16256Add(input, dest, stride, (int)txType); + } + } + + 
private static readonly HighbdTransform2D[] HighbdIht4 = new HighbdTransform2D[] + { + new HighbdTransform2D(HighbdIdct4, HighbdIdct4), // DCT_DCT = 0 + new HighbdTransform2D(HighbdIadst4, HighbdIdct4), // ADST_DCT = 1 + new HighbdTransform2D(HighbdIdct4, HighbdIadst4), // DCT_ADST = 2 + new HighbdTransform2D(HighbdIadst4, HighbdIadst4) // ADST_ADST = 3 + }; + + public static void HighbdIht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd) + { + int i, j; + Span<int> output = stackalloc int[4 * 4]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[4]; + Span<int> tempOut = stackalloc int[4]; + + // Inverse transform row vectors. + for (i = 0; i < 4; ++i) + { + HighbdIht4[txType].Rows(input, outptr, bd); + input = input.Slice(4); + outptr = outptr.Slice(4); + } + + // Inverse transform column vectors. + for (i = 0; i < 4; ++i) + { + for (j = 0; j < 4; ++j) + { + tempIn[j] = output[j * 4 + i]; + } + + HighbdIht4[txType].Cols(tempIn, tempOut, bd); + for (j = 0; j < 4; ++j) + { + dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd); + } + } + } + + private static readonly HighbdTransform2D[] HighIht8 = new HighbdTransform2D[] + { + new HighbdTransform2D(HighbdIdct8, HighbdIdct8), // DCT_DCT = 0 + new HighbdTransform2D(HighbdIadst8, HighbdIdct8), // ADST_DCT = 1 + new HighbdTransform2D(HighbdIdct8, HighbdIadst8), // DCT_ADST = 2 + new HighbdTransform2D(HighbdIadst8, HighbdIadst8) // ADST_ADST = 3 + }; + + public static void HighbdIht8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd) + { + int i, j; + Span<int> output = stackalloc int[8 * 8]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[8]; + Span<int> tempOut = stackalloc int[8]; + HighbdTransform2D ht = HighIht8[txType]; + + // Inverse transform row vectors. 
+ for (i = 0; i < 8; ++i) + { + ht.Rows(input, outptr, bd); + input = input.Slice(8); + // Advance relative to outptr, as Iht8x864Add does. Slicing output here would pin outptr to + // row 1 after the first pass, so rows 2..7 of the scratch buffer would never be written. + outptr = outptr.Slice(8); + } + + // Inverse transform column vectors. + for (i = 0; i < 8; ++i) + { + for (j = 0; j < 8; ++j) + { + tempIn[j] = output[j * 8 + i]; + } + + ht.Cols(tempIn, tempOut, bd); + for (j = 0; j < 8; ++j) + { + dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); + } + } + } + + private static readonly HighbdTransform2D[] HighIht16 = new HighbdTransform2D[] + { + new HighbdTransform2D(HighbdIdct16, HighbdIdct16), // DCT_DCT = 0 + new HighbdTransform2D(HighbdIadst16, HighbdIdct16), // ADST_DCT = 1 + new HighbdTransform2D(HighbdIdct16, HighbdIadst16), // DCT_ADST = 2 + new HighbdTransform2D(HighbdIadst16, HighbdIadst16) // ADST_ADST = 3 + }; + + // High bit depth 16x16 inverse hybrid transform. txType indexes HighIht16 to select the 1-D + // column/row transforms. Each of the 16 input rows is transformed into a 16x16 scratch buffer, + // then each column is transformed, rounded by 6 bits and combined with dest through + // HighbdClipPixelAdd. bd is forwarded unchanged to the transforms and to HighbdClipPixelAdd. + public static void HighbdIht16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd) + { + int i, j; + Span<int> output = stackalloc int[16 * 16]; + Span<int> outptr = output; + Span<int> tempIn = stackalloc int[16]; + Span<int> tempOut = stackalloc int[16]; + HighbdTransform2D ht = HighIht16[txType]; + + // Rows + for (i = 0; i < 16; ++i) + { + ht.Rows(input, outptr, bd); + input = input.Slice(16); + // Advance relative to outptr, as Iht16x16256Add does. Slicing output here would pin outptr to + // row 1 after the first pass, so rows 2..15 of the scratch buffer would never be written. + outptr = outptr.Slice(16); + } + + // Columns + for (i = 0; i < 16; ++i) + { + for (j = 0; j < 16; ++j) + { + tempIn[j] = output[j * 16 + i]; + } + + ht.Cols(tempIn, tempOut, bd); + for (j = 0; j < 16; ++j) + { + dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + } + } + } + + // Idct + // Dispatches on eob: HighbdIdct4x416Add when eob > 1, otherwise HighbdIdct4x41Add. + public static void HighbdIdct4x4Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd) + { + if (eob > 1) + { + HighbdIdct4x416Add(input, dest, stride, bd); + } + else + { + HighbdIdct4x41Add(input, dest, stride, bd); + } + } + + public static void HighbdIwht4x4Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd) + { + if (eob > 1) + { +
HighbdIwht4x416Add(input, dest, stride, bd); + } + else + { + HighbdIwht4x41Add(input, dest, stride, bd); + } + } + + public static void HighbdIdct8x8Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd) + { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. + // DC only DCT coefficient + if (eob == 1) + { + vpx_Highbdidct8x8_1_add_c(input, dest, stride, bd); + } + else if (eob <= 12) + { + HighbdIdct8x812Add(input, dest, stride, bd); + } + else + { + HighbdIdct8x864Add(input, dest, stride, bd); + } + } + + public static void HighbdIdct16x16Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd) + { + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to separate different cases. + // DC only DCT coefficient. 
+ if (eob == 1) + { + HighbdIdct16x161Add(input, dest, stride, bd); + } + else if (eob <= 10) + { + HighbdIdct16x1610Add(input, dest, stride, bd); + } + else if (eob <= 38) + { + HighbdIdct16x1638Add(input, dest, stride, bd); + } + else + { + HighbdIdct16x16256Add(input, dest, stride, bd); + } + } + + // Picks a 32x32 inverse DCT variant from eob; the thresholds and branch comments mirror Idct32x32Add. + public static void HighbdIdct32x32Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd) + { + if (eob == 1) + { + // DC only DCT coefficient + HighbdIdct32x321Add(input, dest, stride, bd); + } + else if (eob <= 34) + { + // Non-zero coeff only in upper-left 8x8 + HighbdIdct32x3234Add(input, dest, stride, bd); + } + else if (eob <= 135) + { + // Non-zero coeff only in upper-left 16x16 + HighbdIdct32x32135Add(input, dest, stride, bd); + } + else + { + HighbdIdct32x321024Add(input, dest, stride, bd); + } + } + + // Iht + // For each size below: TxType.DctDct goes through the eob-based Idct dispatcher above, + // every other transform type always runs the full hybrid transform (eob is not used). + public static void HighbdIht4x4Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd) + { + if (txType == TxType.DctDct) + { + HighbdIdct4x4Add(input, dest, stride, eob, bd); + } + else + { + HighbdIht4x416Add(input, dest, stride, (int)txType, bd); + } + } + + public static void HighbdIht8x8Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd) + { + if (txType == TxType.DctDct) + { + HighbdIdct8x8Add(input, dest, stride, eob, bd); + } + else + { + HighbdIht8x864Add(input, dest, stride, (int)txType, bd); + } + } + + public static void HighbdIht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd) + { + if (txType == TxType.DctDct) + { + HighbdIdct16x16Add(input, dest, stride, eob, bd); + } + else + { + HighbdIht16x16256Add(input, dest, stride, (int)txType, bd); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs new file mode 100644 index 00000000..baa0ab99 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs @@ -0,0 +1,15 @@ +using System; + +namespace
Ryujinx.Graphics.Nvdec.Vp9 +{ + class InternalErrorException : Exception + { + public InternalErrorException(string message) : base(message) + { + } + + public InternalErrorException(string message, Exception innerException) : base(message, innerException) + { + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs new file mode 100644 index 00000000..68e9cb4b --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs @@ -0,0 +1,14 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal struct InternalErrorInfo + { + public CodecErr ErrorCode; + + public void InternalError(CodecErr error, string message) + { + ErrorCode = error; + + throw new InternalErrorException(message); + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs new file mode 100644 index 00000000..9ecccc64 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs @@ -0,0 +1,418 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using System; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class LoopFilter + { + public const int MaxLoopFilter = 63; + + public const int MaxRefLfDeltas = 4; + public const int MaxModeLfDeltas = 2; + + // 64 bit masks for left transform size. Each 1 represents a position where + // we should apply a loop filter across the left border of an 8x8 block + // boundary. + // + // In the case of TX_16X16 -> ( in low order byte first we end up with + // a mask that looks like this + // + // 10101010 + // 10101010 + // 10101010 + // 10101010 + // 10101010 + // 10101010 + // 10101010 + // 10101010 + // + // A loopfilter should be applied to every other 8x8 horizontally. 
+ private static readonly ulong[] Left64X64TxformMask = new ulong[] + { + 0xffffffffffffffffUL, // TX_4X4 + 0xffffffffffffffffUL, // TX_8x8 + 0x5555555555555555UL, // TX_16x16 + 0x1111111111111111UL, // TX_32x32 + }; + + // 64 bit masks for above transform size. Each 1 represents a position where + // we should apply a loop filter across the top border of an 8x8 block + // boundary. + // + // In the case of TX_32x32 (low order byte first) we end up with + // a mask that looks like this: + // + // 11111111 + // 00000000 + // 00000000 + // 00000000 + // 11111111 + // 00000000 + // 00000000 + // 00000000 + // + // A loopfilter should be applied to every fourth row vertically. + private static readonly ulong[] Above64X64TxformMask = new ulong[] + { + 0xffffffffffffffffUL, // TX_4X4 + 0xffffffffffffffffUL, // TX_8x8 + 0x00ff00ff00ff00ffUL, // TX_16x16 + 0x000000ff000000ffUL, // TX_32x32 + }; + + // 64 bit masks for prediction sizes (left). Each 1 represents the position + // of a left border of an 8x8 block. These are aligned to the right most + // appropriate bit, and then shifted into place. + // + // In the case of BLOCK_16X32 (low order byte first) we end up with + // a mask that looks like this: + // + // 10000000 + // 10000000 + // 10000000 + // 10000000 + // 00000000 + // 00000000 + // 00000000 + // 00000000 + private static readonly ulong[] LeftPredictionMask = new ulong[] + { + 0x0000000000000001UL, // BLOCK_4X4, + 0x0000000000000001UL, // BLOCK_4X8, + 0x0000000000000001UL, // BLOCK_8X4, + 0x0000000000000001UL, // BLOCK_8X8, + 0x0000000000000101UL, // BLOCK_8X16, + 0x0000000000000001UL, // BLOCK_16X8, + 0x0000000000000101UL, // BLOCK_16X16, + 0x0000000001010101UL, // BLOCK_16X32, + 0x0000000000000101UL, // BLOCK_32X16, + 0x0000000001010101UL, // BLOCK_32X32, + 0x0101010101010101UL, // BLOCK_32X64, + 0x0000000001010101UL, // BLOCK_64X32, + 0x0101010101010101UL, // BLOCK_64X64 + }; + + // 64 bit mask to shift and set for each prediction size.
+ private static readonly ulong[] AbovePredictionMask = new ulong[] + { + 0x0000000000000001UL, // BLOCK_4X4 + 0x0000000000000001UL, // BLOCK_4X8 + 0x0000000000000001UL, // BLOCK_8X4 + 0x0000000000000001UL, // BLOCK_8X8 + 0x0000000000000001UL, // BLOCK_8X16, + 0x0000000000000003UL, // BLOCK_16X8 + 0x0000000000000003UL, // BLOCK_16X16 + 0x0000000000000003UL, // BLOCK_16X32, + 0x000000000000000fUL, // BLOCK_32X16, + 0x000000000000000fUL, // BLOCK_32X32, + 0x000000000000000fUL, // BLOCK_32X64, + 0x00000000000000ffUL, // BLOCK_64X32, + 0x00000000000000ffUL, // BLOCK_64X64 + }; + + // 64 bit mask to shift and set for each prediction size. A bit is set for + // each 8x8 block that would be in the left most block of the given block + // size in the 64x64 block. + private static readonly ulong[] SizeMask = new ulong[] + { + 0x0000000000000001UL, // BLOCK_4X4 + 0x0000000000000001UL, // BLOCK_4X8 + 0x0000000000000001UL, // BLOCK_8X4 + 0x0000000000000001UL, // BLOCK_8X8 + 0x0000000000000101UL, // BLOCK_8X16, + 0x0000000000000003UL, // BLOCK_16X8 + 0x0000000000000303UL, // BLOCK_16X16 + 0x0000000003030303UL, // BLOCK_16X32, + 0x0000000000000f0fUL, // BLOCK_32X16, + 0x000000000f0f0f0fUL, // BLOCK_32X32, + 0x0f0f0f0f0f0f0f0fUL, // BLOCK_32X64, + 0x00000000ffffffffUL, // BLOCK_64X32, + 0xffffffffffffffffUL, // BLOCK_64X64 + }; + + // These are used for masking the left and above borders. + private const ulong LeftBorder = 0x1111111111111111UL; + private const ulong AboveBorder = 0x000000ff000000ffUL; + + // 16 bit masks for uv transform sizes. + private static readonly ushort[] Left64X64TxformMaskUv = new ushort[] + { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x5555, // TX_16x16 + 0x1111, // TX_32x32 + }; + + private static readonly ushort[] Above64X64TxformMaskUv = new ushort[] + { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x0f0f, // TX_16x16 + 0x000f, // TX_32x32 + }; + + // 16 bit left mask to shift and set for each uv prediction size. 
+ private static readonly ushort[] LeftPredictionMaskUv = new ushort[] + { + 0x0001, // BLOCK_4X4, + 0x0001, // BLOCK_4X8, + 0x0001, // BLOCK_8X4, + 0x0001, // BLOCK_8X8, + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8, + 0x0001, // BLOCK_16X16, + 0x0011, // BLOCK_16X32, + 0x0001, // BLOCK_32X16, + 0x0011, // BLOCK_32X32, + 0x1111, // BLOCK_32X64 + 0x0011, // BLOCK_64X32, + 0x1111, // BLOCK_64X64 + }; + + // 16 bit above mask to shift and set for each uv prediction size. + private static readonly ushort[] AbovePredictionMaskUv = new ushort[] + { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0001, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0003, // BLOCK_32X32, + 0x0003, // BLOCK_32X64, + 0x000f, // BLOCK_64X32, + 0x000f, // BLOCK_64X64 + }; + + // 16 bit mask to shift and set for each uv prediction size + private static readonly ushort[] SizeMaskUv = new ushort[] + { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0011, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0033, // BLOCK_32X32, + 0x3333, // BLOCK_32X64, + 0x00ff, // BLOCK_64X32, + 0xffff, // BLOCK_64X64 + }; + + private const ushort LeftBorderUv = 0x1111; + private const ushort AboveBorderUv = 0x000f; + + private static readonly int[] ModeLfLut = new int[] + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1 // INTER_MODES (ZEROMV == 0) + }; + + // Looks up the filter level stored for the block's segment id, its first reference frame + // and the mode class that ModeLfLut assigns to its prediction mode. + private static byte GetFilterLevel(ref LoopFilterInfoN lfiN, ref ModeInfo mi) + { + return lfiN.Lvl[mi.SegmentId][mi.RefFrame[0]][ModeLfLut[(int)mi.Mode]]; + } + + // Returns a reference to the LoopFilterMask entry at column (miCol >> 3) of row (miRow >> 3), + // with rows laid out LfmStride entries apart. + private static ref LoopFilterMask GetLfm(ref Types.LoopFilter lf, int miRow, int miCol) + { + return ref lf.Lfm[(miCol >> 3) + ((miRow >> 3) * lf.LfmStride)]; + } + + // 8x8 blocks in a superblock.
A "1" represents the first block in a 16x16 + // or greater area. + private static readonly byte[][] FirstBlockIn16x16 = new byte[][] + { + new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 }, + new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 }, + new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 }, + new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 } + }; + + // This function sets up the bit masks for a block represented + // by miRow, miCol in a 64x64 region. + public static void BuildMask(ref Vp9Common cm, ref ModeInfo mi, int miRow, int miCol, int bw, int bh) + { + BlockSize blockSize = mi.SbType; + TxSize txSizeY = mi.TxSize; + ref LoopFilterInfoN lfiN = ref cm.LfInfo; + int filterLevel = GetFilterLevel(ref lfiN, ref mi); + TxSize txSizeUv = Luts.UvTxsizeLookup[(int)blockSize][(int)txSizeY][1][1]; + ref LoopFilterMask lfm = ref GetLfm(ref cm.Lf, miRow, miCol); + ref ulong leftY = ref lfm.LeftY[(int)txSizeY]; + ref ulong aboveY = ref lfm.AboveY[(int)txSizeY]; + ref ulong int4X4Y = ref lfm.Int4x4Y; + ref ushort leftUv = ref lfm.LeftUv[(int)txSizeUv]; + ref ushort aboveUv = ref lfm.AboveUv[(int)txSizeUv]; + ref ushort int4X4Uv = ref lfm.Int4x4Uv; + int rowInSb = (miRow & 7); + int colInSb = (miCol & 7); + int shiftY = colInSb + (rowInSb << 3); + int shiftUv = (colInSb >> 1) + ((rowInSb >> 1) << 2); + int buildUv = FirstBlockIn16x16[rowInSb][colInSb]; + + if (filterLevel == 0) + { + return; + } + else + { + int index = shiftY; + int i; + for (i = 0; i < bh; i++) + { + MemoryMarshal.CreateSpan(ref lfm.LflY[index], 64 - index).Slice(0, bw).Fill((byte)filterLevel); + index += 8; + } + } + + // These set 1 in the current block size for the block size edges. 
+ // For instance if the block size is 32x16, we'll set: + // above = 1111 + // 0000 + // and + // left = 1000 + // = 1000 + // NOTE : In this example the low bit is left most ( 1000 ) is stored as + // 1, not 8... + // + // U and V set things on a 16 bit scale. + // + aboveY |= AbovePredictionMask[(int)blockSize] << shiftY; + leftY |= LeftPredictionMask[(int)blockSize] << shiftY; + + if (buildUv != 0) + { + aboveUv |= (ushort)(AbovePredictionMaskUv[(int)blockSize] << shiftUv); + leftUv |= (ushort)(LeftPredictionMaskUv[(int)blockSize] << shiftUv); + } + + // If the block has no coefficients and is not intra we skip applying + // the loop filter on block edges. + if (mi.Skip != 0 && mi.IsInterBlock()) + { + return; + } + + // Add a mask for the transform size. The transform size mask is set to + // be correct for a 64x64 prediction block size. Mask to match the size of + // the block we are working on and then shift it into place. + aboveY |= (SizeMask[(int)blockSize] & Above64X64TxformMask[(int)txSizeY]) << shiftY; + leftY |= (SizeMask[(int)blockSize] & Left64X64TxformMask[(int)txSizeY]) << shiftY; + + if (buildUv != 0) + { + aboveUv |= (ushort)((SizeMaskUv[(int)blockSize] & Above64X64TxformMaskUv[(int)txSizeUv]) << shiftUv); + leftUv |= (ushort)((SizeMaskUv[(int)blockSize] & Left64X64TxformMaskUv[(int)txSizeUv]) << shiftUv); + } + + // Try to determine what to do with the internal 4x4 block boundaries. These + // differ from the 4x4 boundaries on the outside edge of an 8x8 in that the + // internal ones can be skipped and don't depend on the prediction block size. 
+ if (txSizeY == TxSize.Tx4x4) + { + int4X4Y |= SizeMask[(int)blockSize] << shiftY; + } + + if (buildUv != 0 && txSizeUv == TxSize.Tx4x4) + { + int4X4Uv |= (ushort)((SizeMaskUv[(int)blockSize] & 0xffff) << shiftUv); + } + } + + // When the frame filter level is non-zero, overwrites the Lfm entries with default LoopFilterMask + // values; the entry count is the number of 8-row groups in MiRows (rounded up) times LfmStride. + public static unsafe void ResetLfm(ref Vp9Common cm) + { + if (cm.Lf.FilterLevel != 0) + { + MemoryUtil.Fill(cm.Lf.Lfm.ToPointer(), new LoopFilterMask(), ((cm.MiRows + (Constants.MiBlockSize - 1)) >> 3) * cm.Lf.LfmStride); + } + } + + // Recomputes the Lim and Mblim thresholds of every filter level for the given sharpness level. + private static void UpdateSharpness(ref LoopFilterInfoN lfi, int sharpnessLvl) + { + int lvl; + + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MaxLoopFilter; lvl++) + { + // Set loop filter parameters that control sharpness. + // The level is shifted right by 0, 1 or 2 bits depending on whether sharpnessLvl is 0, 1..4 or above 4. + int blockInsideLimit = lvl >> ((sharpnessLvl > 0 ? 1 : 0) + (sharpnessLvl > 4 ? 1 : 0)); + + if (sharpnessLvl > 0) + { + if (blockInsideLimit > (9 - sharpnessLvl)) + { + blockInsideLimit = (9 - sharpnessLvl); + } + } + + // The inside limit is never allowed to drop below 1. + if (blockInsideLimit < 1) + { + blockInsideLimit = 1; + } + + lfi.Lfthr[lvl].Lim.AsSpan().Fill((byte)blockInsideLimit); + lfi.Lfthr[lvl].Mblim.AsSpan().Fill((byte)(2 * (lvl + 2) + blockInsideLimit)); + } + } + + public static void LoopFilterFrameInit(ref Vp9Common cm, int defaultFiltLvl) + { + int segId; + // scale is the multiplier for lfDeltas: + // the multiplier is 1 when defaultFiltLvl is between 0 and 31; + // 2 when defaultFiltLvl is between 32 and 63 + int scale = 1 << (defaultFiltLvl >> 5); + ref LoopFilterInfoN lfi = ref cm.LfInfo; + ref Types.LoopFilter lf = ref cm.Lf; + ref Segmentation seg = ref cm.Seg; + + // Update limits if sharpness has changed + if (lf.LastSharpnessLevel != lf.SharpnessLevel) + { + UpdateSharpness(ref lfi, lf.SharpnessLevel); + lf.LastSharpnessLevel = lf.SharpnessLevel; + } + + for (segId = 0; segId < Constants.MaxSegments; segId++) + { + int lvlSeg = defaultFiltLvl; + if (seg.IsSegFeatureActive(segId, SegLvlFeatures.SegLvlAltLf) != 0) + { + int data = seg.GetSegData(segId, SegLvlFeatures.SegLvlAltLf); 
+ lvlSeg = Math.Clamp(seg.AbsDelta == Constants.SegmentAbsData ? data : defaultFiltLvl + data, 0, MaxLoopFilter); + } + + if (!lf.ModeRefDeltaEnabled) + { + // We could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas + MemoryMarshal.Cast<Array2<byte>, byte>(lfi.Lvl[segId].AsSpan()).Fill((byte)lvlSeg); + } + else + { + int refr, mode; + int intraLvl = lvlSeg + lf.RefDeltas[Constants.IntraFrame] * scale; + lfi.Lvl[segId][Constants.IntraFrame][0] = (byte)Math.Clamp(intraLvl, 0, MaxLoopFilter); + + for (refr = Constants.LastFrame; refr < Constants.MaxRefFrames; ++refr) + { + for (mode = 0; mode < MaxModeLfDeltas; ++mode) + { + int interLvl = lvlSeg + lf.RefDeltas[refr] * scale + lf.ModeDeltas[mode] * scale; + lfi.Lvl[segId][refr][mode] = (byte)Math.Clamp(interLvl, 0, MaxLoopFilter); + } + } + } + } + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs new file mode 100644 index 00000000..140181ef --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs @@ -0,0 +1,1610 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using System; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class Luts + { + public static ReadOnlySpan<byte> SizeGroupLookup => new byte[] { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; + + public static readonly BlockSize[][] SubsizeLookup = new BlockSize[][] + { + new BlockSize[] + { // PARTITION_NONE + BlockSize.Block4x4, BlockSize.Block4x8, BlockSize.Block8x4, BlockSize.Block8x8, BlockSize.Block8x16, BlockSize.Block16x8, + BlockSize.Block16x16, BlockSize.Block16x32, BlockSize.Block32x16, BlockSize.Block32x32, BlockSize.Block32x64, + BlockSize.Block64x32, BlockSize.Block64x64 + }, + new BlockSize[] + { // PARTITION_HORZ + BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block8x4, BlockSize.BlockInvalid, + BlockSize.BlockInvalid, BlockSize.Block16x8, BlockSize.BlockInvalid, 
BlockSize.BlockInvalid, BlockSize.Block32x16, + BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block64x32 + }, + new BlockSize[] + { // PARTITION_VERT + BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block4x8, BlockSize.BlockInvalid, + BlockSize.BlockInvalid, BlockSize.Block8x16, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block16x32, + BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x64 + }, + new BlockSize[] + { // PARTITION_SPLIT + BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block4x4, BlockSize.BlockInvalid, + BlockSize.BlockInvalid, BlockSize.Block8x8, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block16x16, + BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x32 + } + }; + + public static readonly TxSize[] MaxTxSizeLookup = new TxSize[] + { + TxSize.Tx4x4, TxSize.Tx4x4, TxSize.Tx4x4, TxSize.Tx8x8, TxSize.Tx8x8, TxSize.Tx8x8, TxSize.Tx16x16, + TxSize.Tx16x16, TxSize.Tx16x16, TxSize.Tx32x32, TxSize.Tx32x32, TxSize.Tx32x32, TxSize.Tx32x32 + }; + + public static readonly TxSize[] TxModeToBiggestTxSize = new TxSize[] + { + TxSize.Tx4x4, // ONLY_4X4 + TxSize.Tx8x8, // ALLOW_8X8 + TxSize.Tx16x16, // ALLOW_16X16 + TxSize.Tx32x32, // ALLOW_32X32 + TxSize.Tx32x32, // TX_MODE_SELECT + }; + + public static readonly BlockSize[][][] SsSizeLookup = new BlockSize[][][] + { + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + new BlockSize[][] { new BlockSize[] { BlockSize.Block4x4, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.BlockInvalid } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block4x8, BlockSize.Block4x4 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.BlockInvalid } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block8x4, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block4x4, BlockSize.BlockInvalid } }, + new BlockSize[][] { 
new BlockSize[] { BlockSize.Block8x8, BlockSize.Block8x4 }, new BlockSize[] { BlockSize.Block4x8, BlockSize.Block4x4 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block8x16, BlockSize.Block8x8 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block4x8 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block16x8, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block8x8, BlockSize.Block8x4 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block16x16, BlockSize.Block16x8 }, new BlockSize[] { BlockSize.Block8x16, BlockSize.Block8x8 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block16x32, BlockSize.Block16x16 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block8x16 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block32x16, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block16x16, BlockSize.Block16x8 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block32x32, BlockSize.Block32x16 }, new BlockSize[] { BlockSize.Block16x32, BlockSize.Block16x16 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block32x64, BlockSize.Block32x32 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block16x32 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block64x32, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block32x32, BlockSize.Block32x16 } }, + new BlockSize[][] { new BlockSize[] { BlockSize.Block64x64, BlockSize.Block64x32 }, new BlockSize[] { BlockSize.Block32x64, BlockSize.Block32x32 } }, + }; + + public static readonly TxSize[][][][] UvTxsizeLookup = new TxSize[][][][] + { + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + new TxSize[][][] + { + // BLOCK_4X4 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] 
{ TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + }, + new TxSize[][][] + { + // BLOCK_4X8 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + }, + new TxSize[][][] + { + // BLOCK_8X4 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + }, + new TxSize[][][] + { + // BLOCK_8X8 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + }, + new TxSize[][][] + { + // BLOCK_8X16 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] 
{ new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + }, + new TxSize[][][] + { + // BLOCK_16X8 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + }, + new TxSize[][][] + { + // BLOCK_16X16 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + }, + new TxSize[][][] + { + // BLOCK_16X32 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + }, + new TxSize[][][] + { + // BLOCK_32X16 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { 
TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 } }, + }, + new TxSize[][][] + { + // BLOCK_32X32 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, + }, + new TxSize[][][] + { + // BLOCK_32X64 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, + }, + new TxSize[][][] + { + // BLOCK_64X32 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 } }, + }, + new TxSize[][][] + { + // BLOCK_64X64 + new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, + new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, + new TxSize[][] 
{ new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 }, new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 } }, + }, + }; + + public struct PartitionContextPair + { + public sbyte Above; + public sbyte Left; + + public PartitionContextPair(sbyte above, sbyte left) + { + Above = above; + Left = left; + } + } + + // Generates 4 bit field in which each bit set to 1 represents + // a blocksize partition 1111 means we split 64x64, 32x32, 16x16 + // and 8x8. 1000 means we just split the 64x64 to 32x32 + public static readonly PartitionContextPair[] PartitionContextLookup = new PartitionContextPair[] + { + new PartitionContextPair(15, 15), // 4X4 - {0b1111, 0b1111} + new PartitionContextPair(15, 14), // 4X8 - {0b1111, 0b1110} + new PartitionContextPair(14, 15), // 8X4 - {0b1110, 0b1111} + new PartitionContextPair(14, 14), // 8X8 - {0b1110, 0b1110} + new PartitionContextPair(14, 12), // 8X16 - {0b1110, 0b1100} + new PartitionContextPair(12, 14), // 16X8 - {0b1100, 0b1110} + new PartitionContextPair(12, 12), // 16X16 - {0b1100, 0b1100} + new PartitionContextPair(12, 8), // 16X32 - {0b1100, 0b1000} + new PartitionContextPair(8, 12), // 32X16 - {0b1000, 0b1100} + new PartitionContextPair(8, 8), // 32X32 - {0b1000, 0b1000} + new PartitionContextPair(8, 0), // 32X64 - {0b1000, 0b0000} + new PartitionContextPair(0, 8), // 64X32 - {0b0000, 0b1000} + new PartitionContextPair(0, 0), // 64X64 - {0b0000, 0b0000} + }; + + // Filter + + private static readonly Array8<short>[] BilinearFilters = new Array8<short>[] + { + NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0), NewArray8Short(0, 0, 0, 120, 8, 0, 0, 0), + NewArray8Short(0, 0, 0, 112, 16, 0, 0, 0), NewArray8Short(0, 0, 0, 104, 24, 0, 0, 0), + NewArray8Short(0, 0, 0, 96, 32, 0, 0, 0), NewArray8Short(0, 0, 0, 88, 40, 0, 0, 0), + NewArray8Short(0, 0, 0, 80, 48, 0, 0, 0), NewArray8Short(0, 0, 0, 72, 56, 0, 0, 0), + NewArray8Short(0, 0, 0, 64, 64, 0, 0, 0), NewArray8Short(0, 0, 0, 56, 72, 0, 0, 0), + NewArray8Short(0, 0, 0, 48, 80, 0, 0, 0), 
NewArray8Short(0, 0, 0, 40, 88, 0, 0, 0), + NewArray8Short(0, 0, 0, 32, 96, 0, 0, 0), NewArray8Short(0, 0, 0, 24, 104, 0, 0, 0), + NewArray8Short(0, 0, 0, 16, 112, 0, 0, 0), NewArray8Short(0, 0, 0, 8, 120, 0, 0, 0) + }; + + // Lagrangian interpolation filter + private static readonly Array8<short>[] SubPelFilters8 = new Array8<short>[] + { + NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0), NewArray8Short(0, 1, -5, 126, 8, -3, 1, 0), + NewArray8Short(-1, 3, -10, 122, 18, -6, 2, 0), NewArray8Short(-1, 4, -13, 118, 27, -9, 3, -1), + NewArray8Short(-1, 4, -16, 112, 37, -11, 4, -1), NewArray8Short(-1, 5, -18, 105, 48, -14, 4, -1), + NewArray8Short(-1, 5, -19, 97, 58, -16, 5, -1), NewArray8Short(-1, 6, -19, 88, 68, -18, 5, -1), + NewArray8Short(-1, 6, -19, 78, 78, -19, 6, -1), NewArray8Short(-1, 5, -18, 68, 88, -19, 6, -1), + NewArray8Short(-1, 5, -16, 58, 97, -19, 5, -1), NewArray8Short(-1, 4, -14, 48, 105, -18, 5, -1), + NewArray8Short(-1, 4, -11, 37, 112, -16, 4, -1), NewArray8Short(-1, 3, -9, 27, 118, -13, 4, -1), + NewArray8Short(0, 2, -6, 18, 122, -10, 3, -1), NewArray8Short(0, 1, -3, 8, 126, -5, 1, 0) + }; + + // DCT based filter + private static readonly Array8<short>[] SubPelFilters8S = new Array8<short>[] + { + NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0), NewArray8Short(-1, 3, -7, 127, 8, -3, 1, 0), + NewArray8Short(-2, 5, -13, 125, 17, -6, 3, -1), NewArray8Short(-3, 7, -17, 121, 27, -10, 5, -2), + NewArray8Short(-4, 9, -20, 115, 37, -13, 6, -2), NewArray8Short(-4, 10, -23, 108, 48, -16, 8, -3), + NewArray8Short(-4, 10, -24, 100, 59, -19, 9, -3), NewArray8Short(-4, 11, -24, 90, 70, -21, 10, -4), + NewArray8Short(-4, 11, -23, 80, 80, -23, 11, -4), NewArray8Short(-4, 10, -21, 70, 90, -24, 11, -4), + NewArray8Short(-3, 9, -19, 59, 100, -24, 10, -4), NewArray8Short(-3, 8, -16, 48, 108, -23, 10, -4), + NewArray8Short(-2, 6, -13, 37, 115, -20, 9, -4), NewArray8Short(-2, 5, -10, 27, 121, -17, 7, -3), + NewArray8Short(-1, 3, -6, 17, 125, -13, 5, -2), NewArray8Short(0, 1, -3, 8, 
127, -7, 3, -1) + }; + + // freqmultiplier = 0.5 + private static readonly Array8<short>[] SubPelFilters8Lp = new Array8<short>[] + { + NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0), NewArray8Short(-3, -1, 32, 64, 38, 1, -3, 0), + NewArray8Short(-2, -2, 29, 63, 41, 2, -3, 0), NewArray8Short(-2, -2, 26, 63, 43, 4, -4, 0), + NewArray8Short(-2, -3, 24, 62, 46, 5, -4, 0), NewArray8Short(-2, -3, 21, 60, 49, 7, -4, 0), + NewArray8Short(-1, -4, 18, 59, 51, 9, -4, 0), NewArray8Short(-1, -4, 16, 57, 53, 12, -4, -1), + NewArray8Short(-1, -4, 14, 55, 55, 14, -4, -1), NewArray8Short(-1, -4, 12, 53, 57, 16, -4, -1), + NewArray8Short(0, -4, 9, 51, 59, 18, -4, -1), NewArray8Short(0, -4, 7, 49, 60, 21, -3, -2), + NewArray8Short(0, -4, 5, 46, 62, 24, -3, -2), NewArray8Short(0, -4, 4, 43, 63, 26, -2, -2), + NewArray8Short(0, -3, 2, 41, 63, 29, -2, -2), NewArray8Short(0, -3, 1, 38, 64, 32, -1, -3) + }; + + private static Array8<short> NewArray8Short(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7) + { + Array8<short> output = new Array8<short>(); + + output[0] = e0; + output[1] = e1; + output[2] = e2; + output[3] = e3; + output[4] = e4; + output[5] = e5; + output[6] = e6; + output[7] = e7; + + return output; + } + + public static readonly Array8<short>[][] Vp9FilterKernels = new Array8<short>[][] + { + SubPelFilters8, SubPelFilters8Lp, SubPelFilters8S, BilinearFilters + }; + + // Scan + + private static readonly short[] DefaultScan4X4 = new short[] + { + 0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15, + }; + + private static readonly short[] ColScan4X4 = new short[] + { + 0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15, + }; + + private static readonly short[] RowScan4X4 = new short[] + { + 0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15, + }; + + private static readonly short[] DefaultScan8X8 = new short[] + { + 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, + 33, 19, 40, 12, 34, 27, 5, 41, 20, 48, 13, 35, 42, 28, 21, 6, + 49, 56, 
36, 43, 29, 7, 14, 50, 57, 44, 22, 37, 15, 51, 58, 30, + 45, 23, 52, 59, 38, 31, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63, + }; + + private static readonly short[] ColScan8X8 = new short[] + { + 0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3, + 26, 41, 11, 56, 19, 34, 4, 49, 27, 42, 12, 35, 20, 57, 50, 28, + 5, 43, 13, 36, 58, 51, 21, 44, 6, 29, 59, 37, 14, 52, 22, 7, + 45, 60, 30, 15, 38, 53, 23, 46, 31, 61, 39, 54, 47, 62, 55, 63, + }; + + private static readonly short[] RowScan8X8 = new short[] + { + 0, 1, 2, 8, 9, 3, 16, 10, 4, 17, 11, 24, 5, 18, 25, 12, + 19, 26, 32, 6, 13, 20, 33, 27, 7, 34, 40, 21, 28, 41, 14, 35, + 48, 42, 29, 36, 49, 22, 43, 15, 56, 37, 50, 44, 30, 57, 23, 51, + 58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63, + }; + + private static readonly short[] DefaultScan16X16 = new short[] + { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, + 80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, + 21, 52, 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, + 129, 38, 69, 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, + 101, 131, 160, 146, 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, + 176, 162, 87, 56, 25, 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, + 178, 88, 57, 134, 149, 119, 26, 164, 73, 104, 193, 42, 179, 208, 11, + 135, 89, 165, 120, 150, 58, 194, 180, 27, 74, 209, 105, 151, 136, 43, + 90, 224, 166, 195, 181, 121, 210, 59, 12, 152, 106, 167, 196, 75, 137, + 225, 211, 240, 182, 122, 91, 28, 197, 13, 226, 168, 183, 153, 44, 212, + 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, 242, 76, 213, 154, 45, + 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, 77, 155, 30, 15, + 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, 230, 62, 216, + 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, 63, 232, + 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, 219, + 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251, + 190, 221, 175, 236, 237, 191, 
206, 252, 222, 253, 207, 238, 223, 254, 239, + 255, + }; + + private static readonly short[] ColScan16X16 = new short[] + { + 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, + 81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, + 129, 4, 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, + 68, 115, 21, 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, + 116, 193, 147, 85, 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, + 7, 148, 194, 86, 179, 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, + 195, 118, 149, 71, 180, 24, 87, 226, 134, 165, 211, 40, 103, 56, 72, + 150, 196, 242, 119, 9, 181, 227, 88, 166, 25, 135, 41, 104, 212, 57, + 151, 197, 120, 73, 243, 182, 136, 167, 213, 89, 10, 228, 105, 152, 198, + 26, 42, 121, 183, 244, 168, 58, 137, 229, 74, 214, 90, 153, 199, 184, + 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, 200, 138, 185, 246, 75, + 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, 60, 247, 232, 76, + 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, 233, 171, 61, + 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, 62, 172, + 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, 126, + 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236, + 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, + 255, + }; + + private static readonly short[] RowScan16X16 = new short[] + { + 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, + 20, 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, + 66, 52, 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, + 83, 97, 69, 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, + 41, 56, 114, 100, 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, + 116, 14, 87, 130, 102, 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, + 103, 132, 146, 118, 74, 160, 89, 133, 104, 29, 59, 147, 119, 44, 161, + 148, 90, 105, 134, 162, 120, 176, 75, 135, 149, 30, 60, 163, 177, 45, + 121, 91, 106, 164, 178, 150, 192, 136, 165, 179, 31, 151, 193, 76, 122, + 
61, 137, 194, 107, 152, 180, 208, 46, 166, 167, 195, 92, 181, 138, 209, + 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, 197, 62, 154, 225, 183, + 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, 124, 155, 199, 78, + 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, 156, 229, 243, + 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, 157, 245, + 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, 188, + 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175, + 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, + 255, + }; + + private static readonly short[] DefaultScan32X32 = new short[] + { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, + 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, + 162, 193, 68, 131, 37, 100, 225, 194, 256, 163, 69, 132, 6, + 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, + 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, 71, + 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, + 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, + 293, 41, 417, 199, 136, 262, 387, 448, 325, 356, 10, 73, 418, + 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, + 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451, 481, 358, + 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, + 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, + 453, 139, 44, 234, 484, 297, 360, 171, 76, 515, 545, 266, 329, + 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, + 172, 547, 45, 455, 267, 577, 486, 77, 204, 362, 608, 14, 299, + 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, + 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, + 238, 48, 143, 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, + 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, + 641, 548, 517, 424, 393, 300, 269, 176, 145, 52, 21, 704, 673, + 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 
208, 177, 146, + 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, + 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, + 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, + 241, 210, 179, 117, 86, 55, 738, 707, 614, 583, 490, 459, 366, + 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, + 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, + 304, 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, + 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, + 26, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, + 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, + 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493, + 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, + 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, + 743, 619, 495, 371, 247, 123, 896, 772, 648, 524, 400, 276, 152, + 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277, + 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681, 650, + 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92, + 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, + 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, + 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, + 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373, 342, 311, + 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, + 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, + 499, 375, 251, 127, 900, 776, 652, 528, 404, 280, 156, 932, 901, + 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, + 933, 902, 840, 809, 778, 716, 685, 654, 592, 561, 530, 468, 437, + 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841, + 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, + 407, 376, 345, 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, + 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, + 315, 
253, 222, 191, 998, 967, 874, 843, 750, 719, 626, 595, 502, + 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, + 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, + 440, 409, 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, + 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, + 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, 535, 504, + 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, + 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, + 971, 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, + 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, + 692, 661, 568, 537, 444, 413, 972, 941, 910, 848, 817, 786, 724, + 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, + 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, + 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, + 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, + 479, 1007, 883, 759, 635, 511, 912, 788, 664, 540, 944, 913, 820, + 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697, + 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791, 760, + 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, + 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, + 607, 1011, 887, 763, 639, 916, 792, 668, 948, 917, 824, 793, 700, + 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, + 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, 951, 889, + 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, + 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, + 1016, 985, 954, 923, 892, 861, 830, 799, 1017, 986, 955, 893, 862, + 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, + 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, + }; + + // Neighborhood 2-tuples for various scans and blocksizes, + // in {top, left} order for each position in 
corresponding scan order. + private static readonly short[] DefaultScan4X4Neighbors = new short[] + { + 0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, + 2, 2, 5, 9, 12, 6, 9, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0, + }; + + private static readonly short[] ColScan4X4Neighbors = new short[] + { + 0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, + 9, 2, 2, 6, 6, 2, 2, 3, 3, 10, 10, 7, 7, 11, 11, 0, 0, + }; + + private static readonly short[] RowScan4X4Neighbors = new short[] + { + 0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, + 8, 6, 6, 8, 8, 9, 9, 12, 12, 10, 10, 13, 13, 14, 14, 0, 0, + }; + + private static readonly short[] ColScan8X8Neighbors = new short[] + { + 0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, + 32, 17, 17, 2, 2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, + 48, 48, 11, 11, 26, 26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, + 12, 49, 49, 42, 42, 20, 20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, + 13, 13, 36, 36, 5, 5, 21, 21, 51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, + 6, 37, 37, 52, 52, 22, 22, 7, 7, 30, 30, 45, 45, 15, 15, 38, 38, 23, 23, + 53, 53, 31, 31, 46, 46, 39, 39, 54, 54, 47, 47, 55, 55, 0, 0, + }; + + private static readonly short[] RowScan8X8Neighbors = new short[] + { + 0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, + 16, 10, 10, 16, 16, 4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, + 5, 5, 12, 12, 19, 19, 32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, + 27, 40, 40, 13, 13, 34, 34, 40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, + 42, 42, 14, 14, 48, 48, 36, 36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, + 50, 57, 57, 44, 44, 37, 37, 51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, + 38, 38, 60, 60, 46, 46, 53, 53, 54, 54, 61, 61, 62, 62, 0, 0, + }; + + private static readonly short[] DefaultScan8X8Neighbors = new short[] + { + 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, + 2, 10, 17, 17, 24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, + 4, 11, 26, 33, 19, 
26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, + 41, 20, 27, 13, 20, 5, 5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, + 6, 13, 42, 49, 49, 56, 36, 43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, + 29, 37, 44, 15, 22, 44, 51, 51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, + 31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0, + }; + + private static readonly short[] ColScan16X16Neighbors = new short[] + { + 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, + 64, 17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, + 65, 65, 18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, + 128, 3, 3, 97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, + 113, 113, 3, 3, 51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, + 67, 20, 20, 83, 83, 114, 114, 36, 36, 176, 176, 4, 4, 145, 145, + 52, 52, 99, 99, 5, 5, 130, 130, 68, 68, 192, 192, 161, 161, 21, + 21, 115, 115, 84, 84, 37, 37, 146, 146, 208, 208, 53, 53, 5, 5, + 100, 100, 177, 177, 131, 131, 69, 69, 6, 6, 224, 224, 116, 116, 22, + 22, 162, 162, 85, 85, 147, 147, 38, 38, 193, 193, 101, 101, 54, 54, + 6, 6, 132, 132, 178, 178, 70, 70, 163, 163, 209, 209, 7, 7, 117, + 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194, 225, 225, 39, 39, + 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8, 71, 71, 210, + 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40, 56, 56, + 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211, 72, + 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41, + 135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, + 151, 197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, + 10, 10, 26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, + 121, 213, 213, 58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, + 10, 10, 90, 90, 229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, + 27, 199, 199, 43, 43, 184, 184, 122, 122, 169, 169, 230, 230, 59, 59, + 11, 11, 75, 75, 138, 138, 200, 200, 215, 215, 91, 91, 12, 12, 28, + 28, 185, 185, 107, 107, 
154, 154, 44, 44, 231, 231, 216, 216, 60, 60, + 123, 123, 12, 12, 76, 76, 201, 201, 170, 170, 232, 232, 139, 139, 92, + 92, 13, 13, 108, 108, 29, 29, 186, 186, 217, 217, 155, 155, 45, 45, + 13, 13, 61, 61, 124, 124, 14, 14, 233, 233, 77, 77, 14, 14, 171, + 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109, 46, 46, 156, 156, + 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78, 31, 31, 172, + 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63, 110, 110, + 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219, 142, + 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220, 220, + 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221, 175, + 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223, 223, + 239, 239, 0, 0, + }; + + private static readonly short[] RowScan16X16Neighbors = new short[] + { + 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, + 17, 16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, + 19, 19, 48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, + 7, 35, 35, 64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, + 65, 65, 51, 51, 22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, + 52, 23, 23, 81, 81, 67, 67, 80, 80, 38, 38, 10, 10, 53, 53, + 82, 82, 96, 96, 68, 68, 24, 24, 97, 97, 83, 83, 39, 39, 96, + 96, 54, 54, 11, 11, 69, 69, 98, 98, 112, 112, 84, 84, 25, 25, + 40, 40, 55, 55, 113, 113, 99, 99, 12, 12, 70, 70, 112, 112, 85, + 85, 26, 26, 114, 114, 100, 100, 128, 128, 41, 41, 56, 56, 71, 71, + 115, 115, 13, 13, 86, 86, 129, 129, 101, 101, 128, 128, 72, 72, 130, + 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87, 42, 42, 144, 144, + 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144, 88, 88, 132, + 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43, 160, 160, + 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160, 74, + 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44, + 120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, + 135, 164, 164, 178, 
178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, + 60, 60, 136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, + 45, 165, 165, 166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, + 122, 122, 152, 152, 208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, + 181, 224, 224, 107, 107, 196, 196, 61, 61, 153, 153, 224, 224, 182, 182, + 168, 168, 210, 210, 46, 46, 138, 138, 92, 92, 183, 183, 225, 225, 211, + 211, 240, 240, 197, 197, 169, 169, 123, 123, 154, 154, 198, 198, 77, 77, + 212, 212, 184, 184, 108, 108, 226, 226, 199, 199, 62, 62, 227, 227, 241, + 241, 139, 139, 213, 213, 170, 170, 185, 185, 155, 155, 228, 228, 242, 242, + 124, 124, 93, 93, 200, 200, 243, 243, 214, 214, 215, 215, 229, 229, 140, + 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109, 156, 156, 244, 244, + 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125, 202, 202, 246, + 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157, 157, 187, 187, + 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188, 203, 203, 142, + 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219, 219, 174, 174, + 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235, 206, 206, 236, + 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238, 238, 253, 253, + 254, 254, 0, 0, + }; + + private static readonly short[] DefaultScan16X16Neighbors = new short[] + { + 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, + 32, 2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, + 64, 64, 34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, + 80, 35, 50, 4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, + 5, 20, 36, 51, 82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, + 67, 112, 112, 37, 52, 6, 21, 83, 98, 98, 113, 68, 83, 6, 6, + 113, 128, 22, 37, 53, 68, 84, 99, 99, 114, 128, 128, 114, 129, 69, + 84, 38, 53, 7, 22, 7, 7, 129, 144, 23, 38, 54, 69, 100, 115, + 85, 100, 115, 130, 144, 144, 130, 145, 39, 54, 70, 85, 8, 23, 55, + 70, 116, 131, 101, 116, 145, 160, 24, 39, 8, 8, 86, 101, 131, 146, + 160, 160, 146, 
161, 71, 86, 40, 55, 9, 24, 117, 132, 102, 117, 161, + 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162, 9, 9, 176, 176, + 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118, 10, 25, 148, + 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192, 10, 10, + 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193, 164, + 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42, + 74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, + 58, 11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, + 209, 224, 195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, + 196, 12, 12, 210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, + 122, 137, 91, 106, 225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, + 183, 211, 226, 153, 168, 226, 241, 60, 75, 197, 212, 138, 153, 29, 44, + 76, 91, 13, 13, 183, 198, 123, 138, 45, 60, 212, 227, 198, 213, 154, + 169, 169, 184, 227, 242, 92, 107, 61, 76, 139, 154, 14, 29, 14, 14, + 184, 199, 213, 228, 108, 123, 199, 214, 228, 243, 77, 92, 30, 45, 170, + 185, 155, 170, 185, 200, 93, 108, 124, 139, 214, 229, 46, 61, 200, 215, + 229, 244, 15, 30, 109, 124, 62, 77, 140, 155, 215, 230, 31, 46, 171, + 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140, 47, 62, 216, 231, + 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217, 187, 202, 110, + 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141, 203, 218, + 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203, 234, + 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, 250, + 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205, 236, + 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223, 238, + 239, 254, 0, 0, + }; + + private static readonly short[] DefaultScan32X32Neighbors = new short[] + { + 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, + 33, 64, 2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, + 97, 128, 3, 34, 66, 97, 3, 3, 35, 66, 98, 129, 129, 160, + 
160, 160, 4, 35, 67, 98, 192, 192, 4, 4, 130, 161, 161, 192, + 36, 67, 99, 130, 5, 36, 68, 99, 193, 224, 162, 193, 224, 224, + 131, 162, 37, 68, 100, 131, 5, 5, 194, 225, 225, 256, 256, 256, + 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6, 195, 226, + 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289, + 227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, + 165, 196, 39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, + 352, 352, 197, 228, 134, 165, 71, 102, 8, 39, 322, 353, 291, 322, + 260, 291, 103, 134, 353, 384, 166, 197, 229, 260, 40, 71, 8, 8, + 384, 384, 135, 166, 354, 385, 323, 354, 198, 229, 292, 323, 72, 103, + 261, 292, 9, 40, 385, 416, 167, 198, 104, 135, 230, 261, 355, 386, + 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417, 199, 230, + 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262, + 10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, + 294, 325, 200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, + 74, 105, 419, 450, 449, 480, 326, 357, 232, 263, 295, 326, 169, 200, + 11, 42, 106, 137, 480, 480, 450, 481, 358, 389, 264, 295, 201, 232, + 138, 169, 389, 420, 43, 74, 420, 451, 327, 358, 11, 11, 481, 512, + 233, 264, 451, 482, 296, 327, 75, 106, 170, 201, 482, 513, 512, 512, + 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233, 452, 483, + 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265, + 297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, + 453, 484, 544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, + 140, 171, 515, 546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, + 45, 76, 172, 203, 330, 361, 576, 576, 13, 13, 267, 298, 546, 577, + 77, 108, 204, 235, 455, 486, 577, 608, 299, 330, 109, 140, 547, 578, + 14, 45, 14, 14, 141, 172, 578, 609, 331, 362, 46, 77, 173, 204, + 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46, 142, 173, + 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142, + 48, 79, 143, 174, 80, 
111, 175, 206, 17, 48, 17, 17, 207, 238, + 49, 80, 81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, + 51, 82, 83, 114, 608, 608, 484, 515, 360, 391, 236, 267, 112, 143, + 19, 19, 640, 640, 609, 640, 516, 547, 485, 516, 392, 423, 361, 392, + 268, 299, 237, 268, 144, 175, 113, 144, 20, 51, 20, 20, 672, 672, + 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455, 393, 424, + 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114, 145, + 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, + 580, 611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, + 363, 394, 332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, + 146, 177, 115, 146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, + 674, 705, 643, 674, 581, 612, 550, 581, 519, 550, 457, 488, 426, 457, + 395, 426, 333, 364, 302, 333, 271, 302, 209, 240, 178, 209, 147, 178, + 85, 116, 54, 85, 23, 54, 706, 737, 675, 706, 582, 613, 551, 582, + 458, 489, 427, 458, 334, 365, 303, 334, 210, 241, 179, 210, 86, 117, + 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242, 87, 118, + 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23, + 768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, + 365, 396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, + 800, 800, 769, 800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, + 521, 552, 490, 521, 428, 459, 397, 428, 366, 397, 304, 335, 273, 304, + 242, 273, 180, 211, 149, 180, 118, 149, 56, 87, 25, 56, 25, 25, + 832, 832, 801, 832, 770, 801, 739, 770, 708, 739, 677, 708, 646, 677, + 615, 646, 584, 615, 553, 584, 522, 553, 491, 522, 460, 491, 429, 460, + 398, 429, 367, 398, 336, 367, 305, 336, 274, 305, 243, 274, 212, 243, + 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26, 57, 26, 26, + 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585, 616, + 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337, + 275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, + 834, 
865, 803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, + 431, 462, 338, 369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, + 835, 866, 711, 742, 587, 618, 463, 494, 339, 370, 215, 246, 91, 122, + 864, 864, 740, 771, 616, 647, 492, 523, 368, 399, 244, 275, 120, 151, + 27, 27, 896, 896, 865, 896, 772, 803, 741, 772, 648, 679, 617, 648, + 524, 555, 493, 524, 400, 431, 369, 400, 276, 307, 245, 276, 152, 183, + 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866, 897, 804, 835, + 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587, 525, 556, + 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246, 277, + 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, + 929, 960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, + 712, 743, 681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, + 495, 526, 464, 495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, + 278, 309, 247, 278, 216, 247, 185, 216, 154, 185, 123, 154, 92, 123, + 61, 92, 30, 61, 30, 30, 961, 992, 930, 961, 899, 930, 837, 868, + 806, 837, 775, 806, 713, 744, 682, 713, 651, 682, 589, 620, 558, 589, + 527, 558, 465, 496, 434, 465, 403, 434, 341, 372, 310, 341, 279, 310, + 217, 248, 186, 217, 155, 186, 93, 124, 62, 93, 31, 62, 962, 993, + 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621, 559, 590, + 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94, 125, + 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, + 219, 250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, + 248, 279, 124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, + 621, 652, 528, 559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, + 156, 187, 125, 156, 932, 963, 901, 932, 870, 901, 808, 839, 777, 808, + 746, 777, 684, 715, 653, 684, 622, 653, 560, 591, 529, 560, 498, 529, + 436, 467, 405, 436, 374, 405, 312, 343, 281, 312, 250, 281, 188, 219, + 157, 188, 126, 157, 964, 995, 933, 964, 902, 933, 871, 902, 840, 871, + 809, 840, 778, 809, 
747, 778, 716, 747, 685, 716, 654, 685, 623, 654, + 592, 623, 561, 592, 530, 561, 499, 530, 468, 499, 437, 468, 406, 437, + 375, 406, 344, 375, 313, 344, 282, 313, 251, 282, 220, 251, 189, 220, + 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841, 872, 810, 841, + 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593, 531, 562, + 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221, 252, + 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749, + 687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, + 222, 253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, + 347, 378, 223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, + 252, 283, 904, 935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, + 532, 563, 501, 532, 408, 439, 377, 408, 284, 315, 253, 284, 936, 967, + 905, 936, 874, 905, 812, 843, 781, 812, 750, 781, 688, 719, 657, 688, + 626, 657, 564, 595, 533, 564, 502, 533, 440, 471, 409, 440, 378, 409, + 316, 347, 285, 316, 254, 285, 968, 999, 937, 968, 906, 937, 875, 906, + 844, 875, 813, 844, 782, 813, 751, 782, 720, 751, 689, 720, 658, 689, + 627, 658, 596, 627, 565, 596, 534, 565, 503, 534, 472, 503, 441, 472, + 410, 441, 379, 410, 348, 379, 317, 348, 286, 317, 255, 286, 969, 1000, + 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721, 752, 690, 721, + 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473, 411, 442, + 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815, 846, + 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381, + 319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, + 876, 907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, + 784, 815, 753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, + 381, 412, 940, 971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, + 692, 723, 661, 692, 630, 661, 568, 599, 537, 568, 506, 537, 444, 475, + 413, 444, 382, 413, 972, 1003, 941, 972, 910, 941, 879, 910, 848, 879, + 
817, 848, 786, 817, 755, 786, 724, 755, 693, 724, 662, 693, 631, 662, + 600, 631, 569, 600, 538, 569, 507, 538, 476, 507, 445, 476, 414, 445, + 383, 414, 973, 1004, 942, 973, 911, 942, 849, 880, 818, 849, 787, 818, + 725, 756, 694, 725, 663, 694, 601, 632, 570, 601, 539, 570, 477, 508, + 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881, 819, 850, 726, 757, + 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975, 1006, 851, 882, + 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663, 508, 539, + 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540, 571, + 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789, + 696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, + 945, 976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, + 728, 759, 697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, + 511, 542, 977, 1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, + 729, 760, 698, 729, 667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, + 947, 978, 854, 885, 823, 854, 730, 761, 699, 730, 606, 637, 575, 606, + 979, 1010, 855, 886, 731, 762, 607, 638, 884, 915, 760, 791, 636, 667, + 916, 947, 885, 916, 792, 823, 761, 792, 668, 699, 637, 668, 948, 979, + 917, 948, 886, 917, 824, 855, 793, 824, 762, 793, 700, 731, 669, 700, + 638, 669, 980, 1011, 949, 980, 918, 949, 887, 918, 856, 887, 825, 856, + 794, 825, 763, 794, 732, 763, 701, 732, 670, 701, 639, 670, 981, 1012, + 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733, 764, 702, 733, + 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765, 703, 734, + 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889, 920, + 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828, + 766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, + 798, 829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, + 799, 830, 986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, + 892, 923, 924, 955, 893, 924, 956, 987, 
925, 956, 894, 925, 988, 1019, + 957, 988, 926, 957, 895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, + 959, 990, 991, 1022, 0, 0, + }; + + private static readonly short[] Vp9DefaultIscan4X4 = new short[] + { + 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15, + }; + + private static readonly short[] Vp9ColIscan4X4 = new short[] + { + 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15, + }; + + private static readonly short[] Vp9RowIscan4X4 = new short[] + { + 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, + }; + + private static readonly short[] Vp9ColIscan8X8 = new short[] + { + 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51, + 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56, + 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60, + 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63, + }; + + private static readonly short[] Vp9RowIscan8X8 = new short[] + { + 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39, + 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52, + 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59, + 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63, + }; + + private static readonly short[] Vp9DefaultIscan8X8 = new short[] + { + 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44, + 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53, + 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60, + 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63, + }; + + private static readonly short[] Vp9ColIscan16X16 = new short[] + { + 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198, + 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212, + 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216, + 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218, + 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223, + 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 
166, 182, 200, 215, 228, + 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230, + 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235, + 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237, + 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240, + 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244, + 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247, + 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251, + 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253, + 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254, + 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255, + }; + + private static readonly short[] Vp9RowIscan16X16 = new short[] + { + 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, + 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, + 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, + 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, + 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, + 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96, + 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98, + 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102, + 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108, + 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114, + 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123, + 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129, + 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137, + 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149, + 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152, + 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253, + 158, 
173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254, + 255, + }; + + private static readonly short[] Vp9DefaultIscan16X16 = new short[] + { + 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, + 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, + 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, + 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, + 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, + 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106, + 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99, + 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95, + 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93, + 105, 118, 134, 150, 168, 191, 207, 223, 234, 50, 56, 63, 74, 83, 94, + 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97, + 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102, + 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111, + 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115, + 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121, + 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254, + 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253, + 255, + }; + + private static readonly short[] Vp9DefaultIscan32X32 = new short[] + { + 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, + 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356, + 377, 405, 455, 471, 495, 527, 1, 4, 8, 15, 22, 30, 45, + 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237, + 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3, + 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, + 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425, + 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73, + 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296, 
+ 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14, + 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, + 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, + 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105, + 123, 140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373, + 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33, + 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227, + 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, + 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, + 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437, + 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56, + 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340, + 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, + 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, + 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545, + 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89, + 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395, + 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764, + 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, + 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, + 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122, + 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486, + 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833, + 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349, + 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719, + 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, + 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, + 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117, + 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415, + 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774, + 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 
249, 263, 283, + 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691, + 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, + 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, + 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845, + 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363, + 387, 413, 432, 481, 509, 538, 559, 606, 631, 656, 674, 715, 737, + 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206, + 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571, + 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900, + 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, + 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, + 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302, + 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698, + 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944, + 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535, + 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854, + 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, + 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, + 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971, + 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625, + 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929, + 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476, + 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820, + 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001, + 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, + 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, + 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 500, 531, 554, + 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860, + 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448, + 457, 474, 499, 576, 584, 599, 621, 
688, 695, 708, 727, 784, 790, + 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984, + 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648, + 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, + 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, + 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835, + 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003, + 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724, + 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958, + 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023, + }; + + /* Groups the three lookup tables that are used together for one scan pattern. The constructor stores the array references it is given without copying them. */ + public class ScanOrder + { + public short[] Scan { get; } + public short[] IScan { get; } + public short[] Neighbors { get; } + + public ScanOrder(short[] scan, short[] iScan, short[] neighbors) + { + Scan = scan; + IScan = iScan; + Neighbors = neighbors; + } + } + + /* One ScanOrder per block size, in the order 4X4, 8X8, 16X16, 32X32, each built from the Default* tables. */ + public static readonly ScanOrder[] Vp9DefaultScanOrders = new ScanOrder[] + { + new ScanOrder(DefaultScan4X4, Vp9DefaultIscan4X4, DefaultScan4X4Neighbors), + new ScanOrder(DefaultScan8X8, Vp9DefaultIscan8X8, DefaultScan8X8Neighbors), + new ScanOrder(DefaultScan16X16, Vp9DefaultIscan16X16, DefaultScan16X16Neighbors), + new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors) + }; + + /* Outer index follows the TX_* comments (4X4, 8X8, 16X16, 32X32). Each inner array holds four entries in the order default, row, col, default; the 32X32 array uses the default tables for all four. NOTE(review): the inner index is presumably the transform type - confirm against callers. */ + public static readonly ScanOrder[][] Vp9ScanOrders = new ScanOrder[][] + { + new ScanOrder[] + { // TX_4X4 + new ScanOrder(DefaultScan4X4, Vp9DefaultIscan4X4, DefaultScan4X4Neighbors), + new ScanOrder(RowScan4X4, Vp9RowIscan4X4, RowScan4X4Neighbors), + new ScanOrder(ColScan4X4, Vp9ColIscan4X4, ColScan4X4Neighbors), + new ScanOrder(DefaultScan4X4, Vp9DefaultIscan4X4, DefaultScan4X4Neighbors) + }, + new ScanOrder[] + { // TX_8X8 + new ScanOrder(DefaultScan8X8, Vp9DefaultIscan8X8, DefaultScan8X8Neighbors), + new ScanOrder(RowScan8X8, Vp9RowIscan8X8, RowScan8X8Neighbors), + new ScanOrder(ColScan8X8, Vp9ColIscan8X8, ColScan8X8Neighbors), + new ScanOrder(DefaultScan8X8, 
Vp9DefaultIscan8X8, DefaultScan8X8Neighbors) + }, + new ScanOrder[] + { // TX_16X16 + new ScanOrder(DefaultScan16X16, Vp9DefaultIscan16X16, DefaultScan16X16Neighbors), + new ScanOrder(RowScan16X16, Vp9RowIscan16X16, RowScan16X16Neighbors), + new ScanOrder(ColScan16X16, Vp9ColIscan16X16, ColScan16X16Neighbors), + new ScanOrder(DefaultScan16X16, Vp9DefaultIscan16X16, DefaultScan16X16Neighbors) + }, + new ScanOrder[] + { // TX_32X32 + new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors), + new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors), + new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors), + new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors) + } + }; + + // Entropy MV + + /* Each entry is either a negated MvJointType value or a positive even literal. NOTE(review): presumably negated values are leaves and literals are offsets to the next node pair - confirm against the tree reader. */ + public static readonly sbyte[] Vp9MvJointTree = new sbyte[] + { + -(sbyte)MvJointType.MvJointZero, 2, -(sbyte)MvJointType.MvJointHnzvz, 4, -(sbyte)MvJointType.MvJointHzvnz, -(sbyte)MvJointType.MvJointHnzvnz + }; + + /* Twenty entries mixing negated MvClassType values (MvClass0 through MvClass10) with positive even literals. */ + public static readonly sbyte[] Vp9MvClassTree = new sbyte[] + { + -(sbyte)MvClassType.MvClass0, + 2, + -(sbyte)MvClassType.MvClass1, + 4, + 6, + 8, + -(sbyte)MvClassType.MvClass2, + -(sbyte)MvClassType.MvClass3, + 10, + 12, + -(sbyte)MvClassType.MvClass4, + -(sbyte)MvClassType.MvClass5, + -(sbyte)MvClassType.MvClass6, + 14, + 16, + 18, + -(sbyte)MvClassType.MvClass7, + -(sbyte)MvClassType.MvClass8, + -(sbyte)MvClassType.MvClass9, + -(sbyte)MvClassType.MvClass10, + }; + + /* Written with raw literals rather than enum names; note that -0 is simply 0. */ + public static ReadOnlySpan<sbyte> Vp9MvFPTree => new sbyte[] { -0, 2, -1, 4, -2, -3 }; + + // Entropy + + /* Vp9CatNProb holds N byte values for N = 1..5. */ + public static ReadOnlySpan<byte> Vp9Cat1Prob => new byte[] { 159 }; + public static ReadOnlySpan<byte> Vp9Cat2Prob => new byte[] { 165, 145 }; + public static ReadOnlySpan<byte> Vp9Cat3Prob => new byte[] { 173, 148, 140 }; + public static ReadOnlySpan<byte> Vp9Cat4Prob => new byte[] { 176, 155, 140, 135 }; + public static ReadOnlySpan<byte> Vp9Cat5Prob => new byte[] { 180, 157, 141, 134, 130 }; + 
public static ReadOnlySpan<byte> Vp9Cat6Prob => new byte[] { 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 }; + + /* 18-entry Cat6 table: four leading 255 entries followed by the same 14 values as Vp9Cat6Prob. */ + public static ReadOnlySpan<byte> Vp9Cat6ProbHigh12 => new byte[] + { + 255, 255, 255, 255, 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 + }; + + private static readonly byte[] Vp9CoefbandTrans8X8Plus = new byte[] + { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + // Beyond MAXBAND_INDEX+1 all values are filled as 5 + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + }; + + private static ReadOnlySpan<byte> Vp9CoefbandTrans4X4 => new byte[] + { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, + }; + + public static ReadOnlySpan<byte> get_band_translate(TxSize txSize) + { + return txSize == TxSize.Tx4x4 ? 
Vp9CoefbandTrans4X4 : Vp9CoefbandTrans8X8Plus; + } + + public static readonly byte[][] Vp9Pareto8Full = new byte[][] + { + new byte[] { 3, 86, 128, 6, 86, 23, 88, 29 }, + new byte[] { 6, 86, 128, 11, 87, 42, 91, 52 }, + new byte[] { 9, 86, 129, 17, 88, 61, 94, 76 }, + new byte[] { 12, 86, 129, 22, 88, 77, 97, 93 }, + new byte[] { 15, 87, 129, 28, 89, 93, 100, 110 }, + new byte[] { 17, 87, 129, 33, 90, 105, 103, 123 }, + new byte[] { 20, 88, 130, 38, 91, 118, 106, 136 }, + new byte[] { 23, 88, 130, 43, 91, 128, 108, 146 }, + new byte[] { 26, 89, 131, 48, 92, 139, 111, 156 }, + new byte[] { 28, 89, 131, 53, 93, 147, 114, 163 }, + new byte[] { 31, 90, 131, 58, 94, 156, 117, 171 }, + new byte[] { 34, 90, 131, 62, 94, 163, 119, 177 }, + new byte[] { 37, 90, 132, 66, 95, 171, 122, 184 }, + new byte[] { 39, 90, 132, 70, 96, 177, 124, 189 }, + new byte[] { 42, 91, 132, 75, 97, 183, 127, 194 }, + new byte[] { 44, 91, 132, 79, 97, 188, 129, 198 }, + new byte[] { 47, 92, 133, 83, 98, 193, 132, 202 }, + new byte[] { 49, 92, 133, 86, 99, 197, 134, 205 }, + new byte[] { 52, 93, 133, 90, 100, 201, 137, 208 }, + new byte[] { 54, 93, 133, 94, 100, 204, 139, 211 }, + new byte[] { 57, 94, 134, 98, 101, 208, 142, 214 }, + new byte[] { 59, 94, 134, 101, 102, 211, 144, 216 }, + new byte[] { 62, 94, 135, 105, 103, 214, 146, 218 }, + new byte[] { 64, 94, 135, 108, 103, 216, 148, 220 }, + new byte[] { 66, 95, 135, 111, 104, 219, 151, 222 }, + new byte[] { 68, 95, 135, 114, 105, 221, 153, 223 }, + new byte[] { 71, 96, 136, 117, 106, 224, 155, 225 }, + new byte[] { 73, 96, 136, 120, 106, 225, 157, 226 }, + new byte[] { 76, 97, 136, 123, 107, 227, 159, 228 }, + new byte[] { 78, 97, 136, 126, 108, 229, 160, 229 }, + new byte[] { 80, 98, 137, 129, 109, 231, 162, 231 }, + new byte[] { 82, 98, 137, 131, 109, 232, 164, 232 }, + new byte[] { 84, 98, 138, 134, 110, 234, 166, 233 }, + new byte[] { 86, 98, 138, 137, 111, 235, 168, 234 }, + new byte[] { 89, 99, 138, 140, 112, 236, 170, 235 }, + new 
byte[] { 91, 99, 138, 142, 112, 237, 171, 235 }, + new byte[] { 93, 100, 139, 145, 113, 238, 173, 236 }, + new byte[] { 95, 100, 139, 147, 114, 239, 174, 237 }, + new byte[] { 97, 101, 140, 149, 115, 240, 176, 238 }, + new byte[] { 99, 101, 140, 151, 115, 241, 177, 238 }, + new byte[] { 101, 102, 140, 154, 116, 242, 179, 239 }, + new byte[] { 103, 102, 140, 156, 117, 242, 180, 239 }, + new byte[] { 105, 103, 141, 158, 118, 243, 182, 240 }, + new byte[] { 107, 103, 141, 160, 118, 243, 183, 240 }, + new byte[] { 109, 104, 141, 162, 119, 244, 185, 241 }, + new byte[] { 111, 104, 141, 164, 119, 244, 186, 241 }, + new byte[] { 113, 104, 142, 166, 120, 245, 187, 242 }, + new byte[] { 114, 104, 142, 168, 121, 245, 188, 242 }, + new byte[] { 116, 105, 143, 170, 122, 246, 190, 243 }, + new byte[] { 118, 105, 143, 171, 122, 246, 191, 243 }, + new byte[] { 120, 106, 143, 173, 123, 247, 192, 244 }, + new byte[] { 121, 106, 143, 175, 124, 247, 193, 244 }, + new byte[] { 123, 107, 144, 177, 125, 248, 195, 244 }, + new byte[] { 125, 107, 144, 178, 125, 248, 196, 244 }, + new byte[] { 127, 108, 145, 180, 126, 249, 197, 245 }, + new byte[] { 128, 108, 145, 181, 127, 249, 198, 245 }, + new byte[] { 130, 109, 145, 183, 128, 249, 199, 245 }, + new byte[] { 132, 109, 145, 184, 128, 249, 200, 245 }, + new byte[] { 134, 110, 146, 186, 129, 250, 201, 246 }, + new byte[] { 135, 110, 146, 187, 130, 250, 202, 246 }, + new byte[] { 137, 111, 147, 189, 131, 251, 203, 246 }, + new byte[] { 138, 111, 147, 190, 131, 251, 204, 246 }, + new byte[] { 140, 112, 147, 192, 132, 251, 205, 247 }, + new byte[] { 141, 112, 147, 193, 132, 251, 206, 247 }, + new byte[] { 143, 113, 148, 194, 133, 251, 207, 247 }, + new byte[] { 144, 113, 148, 195, 134, 251, 207, 247 }, + new byte[] { 146, 114, 149, 197, 135, 252, 208, 248 }, + new byte[] { 147, 114, 149, 198, 135, 252, 209, 248 }, + new byte[] { 149, 115, 149, 199, 136, 252, 210, 248 }, + new byte[] { 150, 115, 149, 200, 137, 252, 210, 248 }, + new byte[] { 
152, 115, 150, 201, 138, 252, 211, 248 }, + new byte[] { 153, 115, 150, 202, 138, 252, 212, 248 }, + new byte[] { 155, 116, 151, 204, 139, 253, 213, 249 }, + new byte[] { 156, 116, 151, 205, 139, 253, 213, 249 }, + new byte[] { 158, 117, 151, 206, 140, 253, 214, 249 }, + new byte[] { 159, 117, 151, 207, 141, 253, 215, 249 }, + new byte[] { 161, 118, 152, 208, 142, 253, 216, 249 }, + new byte[] { 162, 118, 152, 209, 142, 253, 216, 249 }, + new byte[] { 163, 119, 153, 210, 143, 253, 217, 249 }, + new byte[] { 164, 119, 153, 211, 143, 253, 217, 249 }, + new byte[] { 166, 120, 153, 212, 144, 254, 218, 250 }, + new byte[] { 167, 120, 153, 212, 145, 254, 219, 250 }, + new byte[] { 168, 121, 154, 213, 146, 254, 220, 250 }, + new byte[] { 169, 121, 154, 214, 146, 254, 220, 250 }, + new byte[] { 171, 122, 155, 215, 147, 254, 221, 250 }, + new byte[] { 172, 122, 155, 216, 147, 254, 221, 250 }, + new byte[] { 173, 123, 155, 217, 148, 254, 222, 250 }, + new byte[] { 174, 123, 155, 217, 149, 254, 222, 250 }, + new byte[] { 176, 124, 156, 218, 150, 254, 223, 250 }, + new byte[] { 177, 124, 156, 219, 150, 254, 223, 250 }, + new byte[] { 178, 125, 157, 220, 151, 254, 224, 251 }, + new byte[] { 179, 125, 157, 220, 151, 254, 224, 251 }, + new byte[] { 180, 126, 157, 221, 152, 254, 225, 251 }, + new byte[] { 181, 126, 157, 221, 152, 254, 225, 251 }, + new byte[] { 183, 127, 158, 222, 153, 254, 226, 251 }, + new byte[] { 184, 127, 158, 223, 154, 254, 226, 251 }, + new byte[] { 185, 128, 159, 224, 155, 255, 227, 251 }, + new byte[] { 186, 128, 159, 224, 155, 255, 227, 251 }, + new byte[] { 187, 129, 160, 225, 156, 255, 228, 251 }, + new byte[] { 188, 130, 160, 225, 156, 255, 228, 251 }, + new byte[] { 189, 131, 160, 226, 157, 255, 228, 251 }, + new byte[] { 190, 131, 160, 226, 158, 255, 228, 251 }, + new byte[] { 191, 132, 161, 227, 159, 255, 229, 251 }, + new byte[] { 192, 132, 161, 227, 159, 255, 229, 251 }, + new byte[] { 193, 133, 162, 228, 160, 255, 230, 252 }, + new byte[] { 194, 
133, 162, 229, 160, 255, 230, 252 }, + new byte[] { 195, 134, 163, 230, 161, 255, 231, 252 }, + new byte[] { 196, 134, 163, 230, 161, 255, 231, 252 }, + new byte[] { 197, 135, 163, 231, 162, 255, 231, 252 }, + new byte[] { 198, 135, 163, 231, 162, 255, 231, 252 }, + new byte[] { 199, 136, 164, 232, 163, 255, 232, 252 }, + new byte[] { 200, 136, 164, 232, 164, 255, 232, 252 }, + new byte[] { 201, 137, 165, 233, 165, 255, 233, 252 }, + new byte[] { 201, 137, 165, 233, 165, 255, 233, 252 }, + new byte[] { 202, 138, 166, 233, 166, 255, 233, 252 }, + new byte[] { 203, 138, 166, 233, 166, 255, 233, 252 }, + new byte[] { 204, 139, 166, 234, 167, 255, 234, 252 }, + new byte[] { 205, 139, 166, 234, 167, 255, 234, 252 }, + new byte[] { 206, 140, 167, 235, 168, 255, 235, 252 }, + new byte[] { 206, 140, 167, 235, 168, 255, 235, 252 }, + new byte[] { 207, 141, 168, 236, 169, 255, 235, 252 }, + new byte[] { 208, 141, 168, 236, 170, 255, 235, 252 }, + new byte[] { 209, 142, 169, 237, 171, 255, 236, 252 }, + new byte[] { 209, 143, 169, 237, 171, 255, 236, 252 }, + new byte[] { 210, 144, 169, 237, 172, 255, 236, 252 }, + new byte[] { 211, 144, 169, 237, 172, 255, 236, 252 }, + new byte[] { 212, 145, 170, 238, 173, 255, 237, 252 }, + new byte[] { 213, 145, 170, 238, 173, 255, 237, 252 }, + new byte[] { 214, 146, 171, 239, 174, 255, 237, 253 }, + new byte[] { 214, 146, 171, 239, 174, 255, 237, 253 }, + new byte[] { 215, 147, 172, 240, 175, 255, 238, 253 }, + new byte[] { 215, 147, 172, 240, 175, 255, 238, 253 }, + new byte[] { 216, 148, 173, 240, 176, 255, 238, 253 }, + new byte[] { 217, 148, 173, 240, 176, 255, 238, 253 }, + new byte[] { 218, 149, 173, 241, 177, 255, 239, 253 }, + new byte[] { 218, 149, 173, 241, 178, 255, 239, 253 }, + new byte[] { 219, 150, 174, 241, 179, 255, 239, 253 }, + new byte[] { 219, 151, 174, 241, 179, 255, 239, 253 }, + new byte[] { 220, 152, 175, 242, 180, 255, 240, 253 }, + new byte[] { 221, 152, 175, 242, 180, 255, 240, 253 }, + new byte[] { 222, 153, 
176, 242, 181, 255, 240, 253 }, + new byte[] { 222, 153, 176, 242, 181, 255, 240, 253 }, + new byte[] { 223, 154, 177, 243, 182, 255, 240, 253 }, + new byte[] { 223, 154, 177, 243, 182, 255, 240, 253 }, + new byte[] { 224, 155, 178, 244, 183, 255, 241, 253 }, + new byte[] { 224, 155, 178, 244, 183, 255, 241, 253 }, + new byte[] { 225, 156, 178, 244, 184, 255, 241, 253 }, + new byte[] { 225, 157, 178, 244, 184, 255, 241, 253 }, + new byte[] { 226, 158, 179, 244, 185, 255, 242, 253 }, + new byte[] { 227, 158, 179, 244, 185, 255, 242, 253 }, + new byte[] { 228, 159, 180, 245, 186, 255, 242, 253 }, + new byte[] { 228, 159, 180, 245, 186, 255, 242, 253 }, + new byte[] { 229, 160, 181, 245, 187, 255, 242, 253 }, + new byte[] { 229, 160, 181, 245, 187, 255, 242, 253 }, + new byte[] { 230, 161, 182, 246, 188, 255, 243, 253 }, + new byte[] { 230, 162, 182, 246, 188, 255, 243, 253 }, + new byte[] { 231, 163, 183, 246, 189, 255, 243, 253 }, + new byte[] { 231, 163, 183, 246, 189, 255, 243, 253 }, + new byte[] { 232, 164, 184, 247, 190, 255, 243, 253 }, + new byte[] { 232, 164, 184, 247, 190, 255, 243, 253 }, + new byte[] { 233, 165, 185, 247, 191, 255, 244, 253 }, + new byte[] { 233, 165, 185, 247, 191, 255, 244, 253 }, + new byte[] { 234, 166, 185, 247, 192, 255, 244, 253 }, + new byte[] { 234, 167, 185, 247, 192, 255, 244, 253 }, + new byte[] { 235, 168, 186, 248, 193, 255, 244, 253 }, + new byte[] { 235, 168, 186, 248, 193, 255, 244, 253 }, + new byte[] { 236, 169, 187, 248, 194, 255, 244, 253 }, + new byte[] { 236, 169, 187, 248, 194, 255, 244, 253 }, + new byte[] { 236, 170, 188, 248, 195, 255, 245, 253 }, + new byte[] { 236, 170, 188, 248, 195, 255, 245, 253 }, + new byte[] { 237, 171, 189, 249, 196, 255, 245, 254 }, + new byte[] { 237, 172, 189, 249, 196, 255, 245, 254 }, + new byte[] { 238, 173, 190, 249, 197, 255, 245, 254 }, + new byte[] { 238, 173, 190, 249, 197, 255, 245, 254 }, + new byte[] { 239, 174, 191, 249, 198, 255, 245, 254 }, + new byte[] { 239, 174, 191, 
249, 198, 255, 245, 254 }, + new byte[] { 240, 175, 192, 249, 199, 255, 246, 254 }, + new byte[] { 240, 176, 192, 249, 199, 255, 246, 254 }, + new byte[] { 240, 177, 193, 250, 200, 255, 246, 254 }, + new byte[] { 240, 177, 193, 250, 200, 255, 246, 254 }, + new byte[] { 241, 178, 194, 250, 201, 255, 246, 254 }, + new byte[] { 241, 178, 194, 250, 201, 255, 246, 254 }, + new byte[] { 242, 179, 195, 250, 202, 255, 246, 254 }, + new byte[] { 242, 180, 195, 250, 202, 255, 246, 254 }, + new byte[] { 242, 181, 196, 250, 203, 255, 247, 254 }, + new byte[] { 242, 181, 196, 250, 203, 255, 247, 254 }, + new byte[] { 243, 182, 197, 251, 204, 255, 247, 254 }, + new byte[] { 243, 183, 197, 251, 204, 255, 247, 254 }, + new byte[] { 244, 184, 198, 251, 205, 255, 247, 254 }, + new byte[] { 244, 184, 198, 251, 205, 255, 247, 254 }, + new byte[] { 244, 185, 199, 251, 206, 255, 247, 254 }, + new byte[] { 244, 185, 199, 251, 206, 255, 247, 254 }, + new byte[] { 245, 186, 200, 251, 207, 255, 247, 254 }, + new byte[] { 245, 187, 200, 251, 207, 255, 247, 254 }, + new byte[] { 246, 188, 201, 252, 207, 255, 248, 254 }, + new byte[] { 246, 188, 201, 252, 207, 255, 248, 254 }, + new byte[] { 246, 189, 202, 252, 208, 255, 248, 254 }, + new byte[] { 246, 190, 202, 252, 208, 255, 248, 254 }, + new byte[] { 247, 191, 203, 252, 209, 255, 248, 254 }, + new byte[] { 247, 191, 203, 252, 209, 255, 248, 254 }, + new byte[] { 247, 192, 204, 252, 210, 255, 248, 254 }, + new byte[] { 247, 193, 204, 252, 210, 255, 248, 254 }, + new byte[] { 248, 194, 205, 252, 211, 255, 248, 254 }, + new byte[] { 248, 194, 205, 252, 211, 255, 248, 254 }, + new byte[] { 248, 195, 206, 252, 212, 255, 249, 254 }, + new byte[] { 248, 196, 206, 252, 212, 255, 249, 254 }, + new byte[] { 249, 197, 207, 253, 213, 255, 249, 254 }, + new byte[] { 249, 197, 207, 253, 213, 255, 249, 254 }, + new byte[] { 249, 198, 208, 253, 214, 255, 249, 254 }, + new byte[] { 249, 199, 209, 253, 214, 255, 249, 254 }, + new byte[] { 250, 200, 210, 253, 
215, 255, 249, 254 }, + new byte[] { 250, 200, 210, 253, 215, 255, 249, 254 }, + new byte[] { 250, 201, 211, 253, 215, 255, 249, 254 }, + new byte[] { 250, 202, 211, 253, 215, 255, 249, 254 }, + new byte[] { 250, 203, 212, 253, 216, 255, 249, 254 }, + new byte[] { 250, 203, 212, 253, 216, 255, 249, 254 }, + new byte[] { 251, 204, 213, 253, 217, 255, 250, 254 }, + new byte[] { 251, 205, 213, 253, 217, 255, 250, 254 }, + new byte[] { 251, 206, 214, 254, 218, 255, 250, 254 }, + new byte[] { 251, 206, 215, 254, 218, 255, 250, 254 }, + new byte[] { 252, 207, 216, 254, 219, 255, 250, 254 }, + new byte[] { 252, 208, 216, 254, 219, 255, 250, 254 }, + new byte[] { 252, 209, 217, 254, 220, 255, 250, 254 }, + new byte[] { 252, 210, 217, 254, 220, 255, 250, 254 }, + new byte[] { 252, 211, 218, 254, 221, 255, 250, 254 }, + new byte[] { 252, 212, 218, 254, 221, 255, 250, 254 }, + new byte[] { 253, 213, 219, 254, 222, 255, 250, 254 }, + new byte[] { 253, 213, 220, 254, 222, 255, 250, 254 }, + new byte[] { 253, 214, 221, 254, 223, 255, 250, 254 }, + new byte[] { 253, 215, 221, 254, 223, 255, 250, 254 }, + new byte[] { 253, 216, 222, 254, 224, 255, 251, 254 }, + new byte[] { 253, 217, 223, 254, 224, 255, 251, 254 }, + new byte[] { 253, 218, 224, 254, 225, 255, 251, 254 }, + new byte[] { 253, 219, 224, 254, 225, 255, 251, 254 }, + new byte[] { 254, 220, 225, 254, 225, 255, 251, 254 }, + new byte[] { 254, 221, 226, 254, 225, 255, 251, 254 }, + new byte[] { 254, 222, 227, 255, 226, 255, 251, 254 }, + new byte[] { 254, 223, 227, 255, 226, 255, 251, 254 }, + new byte[] { 254, 224, 228, 255, 227, 255, 251, 254 }, + new byte[] { 254, 225, 229, 255, 227, 255, 251, 254 }, + new byte[] { 254, 226, 230, 255, 228, 255, 251, 254 }, + new byte[] { 254, 227, 230, 255, 229, 255, 251, 254 }, + new byte[] { 255, 228, 231, 255, 230, 255, 251, 254 }, + new byte[] { 255, 229, 232, 255, 230, 255, 251, 254 }, + new byte[] { 255, 230, 233, 255, 231, 255, 252, 254 }, + new byte[] { 255, 231, 234, 255, 231, 
255, 252, 254 }, + new byte[] { 255, 232, 235, 255, 232, 255, 252, 254 }, + new byte[] { 255, 233, 236, 255, 232, 255, 252, 254 }, + new byte[] { 255, 235, 237, 255, 233, 255, 252, 254 }, + new byte[] { 255, 236, 238, 255, 234, 255, 252, 254 }, + new byte[] { 255, 238, 240, 255, 235, 255, 252, 255 }, + new byte[] { 255, 239, 241, 255, 235, 255, 252, 254 }, + new byte[] { 255, 241, 243, 255, 236, 255, 252, 254 }, + new byte[] { 255, 243, 245, 255, 237, 255, 252, 254 }, + new byte[] { 255, 246, 247, 255, 239, 255, 253, 255 }, + };

        /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
        // Intra prediction mode tree, stored flat as one pair of entries per node; the
        // trailing comment on each pair gives the node index and name.
        // NOTE(review): presumably negated entries are PredictionMode leaves and positive
        // entries are the array index of the next pair - confirm against the tree reader.
        public static readonly sbyte[] Vp9IntraModeTree = new sbyte[]
        {
            -(sbyte)PredictionMode.DcPred, 2, /* 0 = DC_NODE */
            -(sbyte)PredictionMode.TmPred, 4, /* 1 = TM_NODE */
            -(sbyte)PredictionMode.VPred, 6, /* 2 = V_NODE */
            8, 12, /* 3 = COM_NODE */
            -(sbyte)PredictionMode.HPred, 10, /* 4 = H_NODE */
            -(sbyte)PredictionMode.D135Pred, -(sbyte)PredictionMode.D117Pred, /* 5 = D135_NODE */
            -(sbyte)PredictionMode.D45Pred, 14, /* 6 = D45_NODE */
            -(sbyte)PredictionMode.D63Pred, 16, /* 7 = D63_NODE */
            -(sbyte)PredictionMode.D153Pred, -(sbyte)PredictionMode.D207Pred /* 8 = D153_NODE */
        };

        // Inter prediction mode tree. Every mode entry is stored relative to
        // PredictionMode.NearestMv (NearestMv is subtracted before the value is negated),
        // so the NearestMv entry itself is 0.
        public static readonly sbyte[] Vp9InterModeTree = new sbyte[]
        {
            -((sbyte)PredictionMode.ZeroMv - (sbyte)PredictionMode.NearestMv), 2,
            -((sbyte)PredictionMode.NearestMv - (sbyte)PredictionMode.NearestMv), 4,
            -((sbyte)PredictionMode.NearMv - (sbyte)PredictionMode.NearestMv),
            -((sbyte)PredictionMode.NewMv - (sbyte)PredictionMode.NearestMv)
        };

        // Partition type tree over the four PartitionType values, in the same flat
        // pair-per-node layout as the trees above.
        public static readonly sbyte[] Vp9PartitionTree = new sbyte[]
        {
            -(sbyte)PartitionType.PartitionNone, 2, -(sbyte)PartitionType.PartitionHorz, 4, -(sbyte)PartitionType.PartitionVert, -(sbyte)PartitionType.PartitionSplit
        };

        // Switchable interpolation filter tree over the three EightTap* constants.
        public static readonly sbyte[] Vp9SwitchableInterpTree = new sbyte[]
        {
            -Constants.EightTap, 2, -Constants.EightTapSmooth, -Constants.EightTapSharp
        };

        // Segment id tree.
        // NOTE(review): presumably the last eight entries (0, -1 ... -7) are the negated
        // segment ids 0..7 - confirm against the segment id reader.
        public static readonly sbyte[] Vp9SegmentTree = new sbyte[]
        {
            2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7
        };

        // MV Ref

        // This is used to figure out a context for the ref blocks. The code flattens
        // an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
        // adding 9 for each intra block, 3 for each zero mv and 1 for each new
        // motion vector. This single number is then converted into a context
        // with a single lookup ( CounterToContext ).
        // One entry per prediction mode, in the order named by the per-entry comments.
        public static readonly int[] Mode2Counter = new int[]
        {
            9, // DC_PRED
            9, // V_PRED
            9, // H_PRED
            9, // D45_PRED
            9, // D135_PRED
            9, // D117_PRED
            9, // D153_PRED
            9, // D207_PRED
            9, // D63_PRED
            9, // TM_PRED
            0, // NEARESTMV
            0, // NEARMV
            3, // ZEROMV
            1, // NEWMV
        };

        // There are 3^3 different combinations of 3 counts that can be either 0,1 or
        // 2. However the actual count can never be greater than 2 so the highest
        // counter we need is 18. 9 is an invalid counter that's never used.
        // Indexed by the summed Mode2Counter values; the trailing comment on each entry
        // is its index.
        public static readonly MotionVectorContext[] CounterToContext = new MotionVectorContext[]
        {
            MotionVectorContext.BothPredicted, // 0
            MotionVectorContext.NewPlusNonIntra, // 1
            MotionVectorContext.BothNew, // 2
            MotionVectorContext.ZeroPlusPredicted, // 3
            MotionVectorContext.NewPlusNonIntra, // 4
            MotionVectorContext.InvalidCase, // 5
            MotionVectorContext.BothZero, // 6
            MotionVectorContext.InvalidCase, // 7
            MotionVectorContext.InvalidCase, // 8
            MotionVectorContext.IntraPlusNonIntra, // 9
            MotionVectorContext.IntraPlusNonIntra, // 10
            MotionVectorContext.InvalidCase, // 11
            MotionVectorContext.IntraPlusNonIntra, // 12
            MotionVectorContext.InvalidCase, // 13
            MotionVectorContext.InvalidCase, // 14
            MotionVectorContext.InvalidCase, // 15
            MotionVectorContext.InvalidCase, // 16
            MotionVectorContext.InvalidCase, // 17
            MotionVectorContext.BothIntra // 18
        };

        // Eight candidate positions per block size; rows follow the block-size order
        // named by the row comments (4X4 ... 64X64).
        // NOTE(review): the meaning and order of the two Position arguments is not
        // visible here - confirm against the Position type before relying on it.
        public static readonly Position[][] MvRefBlocks = new Position[][]
        {
            // 4X4
            new Position[] { new Position(-1, 0), new Position(0, -1), new Position(-1, -1), new Position(-2, 0),
                             new Position(0, -2), new Position(-2, -1), new Position(-1, -2), new Position(-2, -2) },
            // 4X8
            new Position[] { new Position(-1, 0), new Position(0, -1), new Position(-1, -1), new Position(-2, 0),
                             new Position(0, -2), new Position(-2, -1), new Position(-1, -2), new Position(-2, -2) },
            // 8X4
            new Position[] { new Position(-1, 0), new Position(0, -1), new Position(-1, -1), new Position(-2, 0),
                             new Position(0, -2), new Position(-2, -1), new Position(-1, -2), new Position(-2, -2) },
            // 8X8
            new Position[] { new Position(-1, 0), new Position(0, -1), new Position(-1, -1), new Position(-2, 0),
                             new Position(0, -2), new Position(-2, -1), new Position(-1, -2), new Position(-2, -2) },
            // 8X16
            new Position[] { new Position(0, -1), new Position(-1, 0), new Position(1, -1), new Position(-1, -1),
                             new Position(0, -2), new Position(-2, 0), new Position(-2, -1), new Position(-1, -2) },
            // 16X8
            new Position[] { new Position(-1, 0), new Position(0, -1), new Position(-1, 1), new Position(-1, -1),
                             new Position(-2, 0), new Position(0, -2), new Position(-1, -2), new Position(-2, -1) },
            // 16X16
            new Position[] { new Position(-1, 0), new Position(0, -1), new Position(-1, 1), new Position(1, -1),
                             new Position(-1, -1), new Position(-3, 0), new Position(0, -3), new Position(-3, -3) },
            // 16X32
            new Position[] { new Position(0, -1), new Position(-1, 0), new Position(2, -1), new Position(-1, -1),
                             new Position(-1, 1), new Position(0, -3), new Position(-3, 0), new Position(-3, -3) },
            // 32X16
            new Position[] { new Position(-1, 0), new Position(0, -1), new Position(-1, 2), new Position(-1, -1),
                             new Position(1, -1), new Position(-3, 0), new Position(0, -3), new Position(-3, -3) },
            // 32X32
            new Position[] { new Position(-1, 1), new Position(1, -1), new Position(-1, 2), new Position(2, -1),
                             new Position(-1, -1), new Position(-3, 0), new Position(0, -3), new Position(-3, -3) },
            // 32X64
            new Position[] { new Position(0, -1), new Position(-1, 0), new Position(4, -1), new Position(-1, 2),
                             new Position(-1, -1), new Position(0, -3), new Position(-3, 0), new Position(2, -1) },
            // 64X32
            new Position[] { new Position(-1, 0), new Position(0, -1), new Position(-1, 4), new Position(2, -1),
                             new Position(-1, -1), new Position(-3, 0), new Position(0, -3), new Position(-1, 2) },
            // 64X64
            new Position[] { new Position(-1, 3), new Position(3, -1), new Position(-1, 4), new Position(4, -1),
                             new Position(-1, -1), new Position(-1, 0), new Position(0, -1), new Position(-1, 6) }
        };
    }
}
diff
--git a/src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs new file mode 100644 index 00000000..a9da1042 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs @@ -0,0 +1,389 @@
using Ryujinx.Graphics.Nvdec.Vp9.Types;
using System.Diagnostics;

namespace Ryujinx.Graphics.Nvdec.Vp9
{
    // Derives context indices from the mode info of the blocks directly above and to the
    // left of the current block (xd.AboveMi / xd.LeftMi).
    //
    // Note (applies to every method below):
    // The mode info data structure has a one element border above and to the
    // left of the entries corresponding to real macroblocks.
    // The prediction flags in these dummy entries are initialized to 0.
    internal static class PredCommon
    {
        // Maps a condition onto the 0/1 integer used by the context arithmetic.
        private static int ToInt(bool condition)
        {
            return condition ? 1 : 0;
        }

        // Returns the reference mode context (0..4), derived from whether each available
        // neighbour uses a second reference and whether it references cm.CompFixedRef.
        public static int GetReferenceModeContext(ref Vp9Common cm, ref MacroBlockD xd)
        {
            bool hasAbove = !xd.AboveMi.IsNull;
            bool hasLeft = !xd.LeftMi.IsNull;
            int ctx;

            if (hasAbove && hasLeft)
            {
                // Both edges available
                ref ModeInfo above = ref xd.AboveMi.Value;
                ref ModeInfo left = ref xd.LeftMi.Value;

                if (above.HasSecondRef())
                {
                    if (left.HasSecondRef())
                    {
                        // Both edges use comp pred (4)
                        ctx = 4;
                    }
                    else
                    {
                        // Only the above edge uses comp pred (2/3)
                        ctx = 2 + ToInt(left.RefFrame[0] == cm.CompFixedRef || !left.IsInterBlock());
                    }
                }
                else if (left.HasSecondRef())
                {
                    // Only the left edge uses comp pred (2/3)
                    ctx = 2 + ToInt(above.RefFrame[0] == cm.CompFixedRef || !above.IsInterBlock());
                }
                else
                {
                    // Neither edge uses comp pred (0/1)
                    ctx = ToInt(above.RefFrame[0] == cm.CompFixedRef) ^
                          ToInt(left.RefFrame[0] == cm.CompFixedRef);
                }
            }
            else if (hasAbove || hasLeft)
            {
                // One edge available
                ref ModeInfo edge = ref hasAbove ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;

                if (edge.HasSecondRef())
                {
                    // Edge uses comp pred (3)
                    ctx = 3;
                }
                else
                {
                    // Edge does not use comp pred (0/1)
                    ctx = ToInt(edge.RefFrame[0] == cm.CompFixedRef);
                }
            }
            else
            {
                // No edges available (1)
                ctx = 1;
            }

            Debug.Assert(ctx >= 0 && ctx < Constants.CompInterContexts);
            return ctx;
        }

        // Returns a context number for the given MB prediction signal
        public static int GetPredContextCompRefP(ref Vp9Common cm, ref MacroBlockD xd)
        {
            // Index of the variable reference inside RefFrame: the slot not selected by the
            // sign bias of the fixed reference.
            int varRefIdx = cm.RefFrameSignBias[cm.CompFixedRef] == 0 ? 1 : 0;
            var secondVarRef = cm.CompVarRef[1];

            bool hasAbove = !xd.AboveMi.IsNull;
            bool hasLeft = !xd.LeftMi.IsNull;
            int predContext;

            if (hasAbove && hasLeft)
            {
                // Both edges available
                ref ModeInfo above = ref xd.AboveMi.Value;
                ref ModeInfo left = ref xd.LeftMi.Value;
                bool aboveIntra = !above.IsInterBlock();
                bool leftIntra = !left.IsInterBlock();

                if (aboveIntra && leftIntra)
                {
                    // Intra/Intra (2)
                    predContext = 2;
                }
                else if (aboveIntra || leftIntra)
                {
                    // Intra/Inter: only the inter neighbour matters (1/3)
                    ref ModeInfo interEdge = ref aboveIntra ? ref left : ref above;
                    int refSlot = interEdge.HasSecondRef() ? varRefIdx : 0;

                    predContext = 1 + 2 * ToInt(interEdge.RefFrame[refSlot] != secondVarRef);
                }
                else
                {
                    // Inter/Inter
                    bool leftSingle = !left.HasSecondRef();
                    bool aboveSingle = !above.HasSecondRef();
                    sbyte aboveVarRef = above.RefFrame[aboveSingle ? 0 : varRefIdx];
                    sbyte leftVarRef = left.RefFrame[leftSingle ? 0 : varRefIdx];

                    if (aboveVarRef == leftVarRef && secondVarRef == aboveVarRef)
                    {
                        predContext = 0;
                    }
                    else if (leftSingle && aboveSingle)
                    {
                        // Single/Single
                        bool fixedPlusFirstVar =
                            (aboveVarRef == cm.CompFixedRef && leftVarRef == cm.CompVarRef[0]) ||
                            (leftVarRef == cm.CompFixedRef && aboveVarRef == cm.CompVarRef[0]);

                        if (fixedPlusFirstVar)
                        {
                            predContext = 4;
                        }
                        else
                        {
                            predContext = aboveVarRef == leftVarRef ? 3 : 1;
                        }
                    }
                    else if (leftSingle || aboveSingle)
                    {
                        // Single/Comp
                        sbyte compRef = leftSingle ? aboveVarRef : leftVarRef;
                        sbyte singleRef = aboveSingle ? aboveVarRef : leftVarRef;
                        bool compMatches = compRef == secondVarRef;
                        bool singleMatches = singleRef == secondVarRef;

                        if (compMatches && !singleMatches)
                        {
                            predContext = 1;
                        }
                        else if (singleMatches && !compMatches)
                        {
                            predContext = 2;
                        }
                        else
                        {
                            predContext = 4;
                        }
                    }
                    else
                    {
                        // Comp/Comp
                        predContext = aboveVarRef == leftVarRef ? 4 : 2;
                    }
                }
            }
            else if (hasAbove || hasLeft)
            {
                // One edge available
                ref ModeInfo edge = ref hasAbove ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;

                if (!edge.IsInterBlock())
                {
                    predContext = 2;
                }
                else if (edge.HasSecondRef())
                {
                    predContext = 4 * ToInt(edge.RefFrame[varRefIdx] != secondVarRef);
                }
                else
                {
                    predContext = 3 * ToInt(edge.RefFrame[0] != secondVarRef);
                }
            }
            else
            {
                // No edges available (2)
                predContext = 2;
            }

            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
            return predContext;
        }

        // Context contribution of a single inter-coded neighbour for GetPredContextSingleRefP1.
        private static int SingleRefP1FromInterEdge(ref ModeInfo edge)
        {
            if (edge.HasSecondRef())
            {
                return 1 + ToInt(edge.RefFrame[0] == Constants.LastFrame ||
                                 edge.RefFrame[1] == Constants.LastFrame);
            }

            return 4 * ToInt(edge.RefFrame[0] == Constants.LastFrame);
        }

        // Returns the first single-reference context (0..4), based on how the neighbours
        // use Constants.LastFrame.
        public static int GetPredContextSingleRefP1(ref MacroBlockD xd)
        {
            bool hasAbove = !xd.AboveMi.IsNull;
            bool hasLeft = !xd.LeftMi.IsNull;
            int predContext;

            if (hasAbove && hasLeft)
            {
                // Both edges available
                ref ModeInfo above = ref xd.AboveMi.Value;
                ref ModeInfo left = ref xd.LeftMi.Value;
                bool aboveIntra = !above.IsInterBlock();
                bool leftIntra = !left.IsInterBlock();

                if (aboveIntra && leftIntra)
                {
                    // Intra/Intra
                    predContext = 2;
                }
                else if (aboveIntra || leftIntra)
                {
                    // Intra/Inter or Inter/Intra
                    ref ModeInfo interEdge = ref aboveIntra ? ref left : ref above;
                    predContext = SingleRefP1FromInterEdge(ref interEdge);
                }
                else
                {
                    // Inter/Inter
                    bool aboveComp = above.HasSecondRef();
                    bool leftComp = left.HasSecondRef();
                    sbyte aboveRef0 = above.RefFrame[0];
                    sbyte aboveRef1 = above.RefFrame[1];
                    sbyte leftRef0 = left.RefFrame[0];
                    sbyte leftRef1 = left.RefFrame[1];

                    if (aboveComp && leftComp)
                    {
                        predContext = 1 + ToInt(aboveRef0 == Constants.LastFrame || aboveRef1 == Constants.LastFrame ||
                                                leftRef0 == Constants.LastFrame || leftRef1 == Constants.LastFrame);
                    }
                    else if (aboveComp || leftComp)
                    {
                        // Exactly one neighbour is compound; the other supplies the single ref.
                        sbyte singleRef = aboveComp ? leftRef0 : aboveRef0;
                        sbyte compRef0 = aboveComp ? aboveRef0 : leftRef0;
                        sbyte compRef1 = aboveComp ? aboveRef1 : leftRef1;
                        int compUsesLast = ToInt(compRef0 == Constants.LastFrame || compRef1 == Constants.LastFrame);

                        predContext = (singleRef == Constants.LastFrame ? 3 : 0) + compUsesLast;
                    }
                    else
                    {
                        predContext = 2 * (ToInt(aboveRef0 == Constants.LastFrame) + ToInt(leftRef0 == Constants.LastFrame));
                    }
                }
            }
            else if (hasAbove || hasLeft)
            {
                // One edge available: intra gives 2, inter uses the shared helper
                ref ModeInfo edge = ref hasAbove ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;

                if (edge.IsInterBlock())
                {
                    predContext = SingleRefP1FromInterEdge(ref edge);
                }
                else
                {
                    predContext = 2;
                }
            }
            else
            {
                // No edges available
                predContext = 2;
            }

            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
            return predContext;
        }

        // Returns the second single-reference context (0..4), based on how the neighbours
        // use Constants.GoldenFrame (and Constants.LastFrame / Constants.AltRefFrame).
        public static int GetPredContextSingleRefP2(ref MacroBlockD xd)
        {
            bool hasAbove = !xd.AboveMi.IsNull;
            bool hasLeft = !xd.LeftMi.IsNull;
            int predContext;

            if (hasAbove && hasLeft)
            {
                // Both edges available
                ref ModeInfo above = ref xd.AboveMi.Value;
                ref ModeInfo left = ref xd.LeftMi.Value;
                bool aboveIntra = !above.IsInterBlock();
                bool leftIntra = !left.IsInterBlock();

                if (aboveIntra && leftIntra)
                {
                    // Intra/Intra
                    predContext = 2;
                }
                else if (aboveIntra || leftIntra)
                {
                    // Intra/Inter or Inter/Intra
                    ref ModeInfo interEdge = ref aboveIntra ? ref left : ref above;

                    if (interEdge.HasSecondRef())
                    {
                        predContext = 1 + 2 * ToInt(interEdge.RefFrame[0] == Constants.GoldenFrame ||
                                                    interEdge.RefFrame[1] == Constants.GoldenFrame);
                    }
                    else if (interEdge.RefFrame[0] == Constants.LastFrame)
                    {
                        predContext = 3;
                    }
                    else
                    {
                        predContext = 4 * ToInt(interEdge.RefFrame[0] == Constants.GoldenFrame);
                    }
                }
                else
                {
                    // Inter/Inter
                    bool aboveComp = above.HasSecondRef();
                    bool leftComp = left.HasSecondRef();
                    sbyte aboveRef0 = above.RefFrame[0];
                    sbyte aboveRef1 = above.RefFrame[1];
                    sbyte leftRef0 = left.RefFrame[0];
                    sbyte leftRef1 = left.RefFrame[1];

                    if (aboveComp && leftComp)
                    {
                        if (aboveRef0 == leftRef0 && aboveRef1 == leftRef1)
                        {
                            predContext = 3 * ToInt(aboveRef0 == Constants.GoldenFrame || aboveRef1 == Constants.GoldenFrame ||
                                                    leftRef0 == Constants.GoldenFrame || leftRef1 == Constants.GoldenFrame);
                        }
                        else
                        {
                            predContext = 2;
                        }
                    }
                    else if (aboveComp || leftComp)
                    {
                        // Exactly one neighbour is compound; the other supplies the single ref.
                        sbyte singleRef = aboveComp ? leftRef0 : aboveRef0;
                        sbyte compRef0 = aboveComp ? aboveRef0 : leftRef0;
                        sbyte compRef1 = aboveComp ? aboveRef1 : leftRef1;
                        int compUsesGolden = ToInt(compRef0 == Constants.GoldenFrame || compRef1 == Constants.GoldenFrame);

                        if (singleRef == Constants.GoldenFrame)
                        {
                            predContext = 3 + compUsesGolden;
                        }
                        else if (singleRef == Constants.AltRefFrame)
                        {
                            predContext = compUsesGolden;
                        }
                        else
                        {
                            predContext = 1 + 2 * compUsesGolden;
                        }
                    }
                    else
                    {
                        bool aboveIsLast = aboveRef0 == Constants.LastFrame;
                        bool leftIsLast = leftRef0 == Constants.LastFrame;

                        if (aboveIsLast && leftIsLast)
                        {
                            predContext = 3;
                        }
                        else if (aboveIsLast || leftIsLast)
                        {
                            // Look at the neighbour that is not LastFrame.
                            sbyte otherRef = aboveIsLast ? leftRef0 : aboveRef0;
                            predContext = 4 * ToInt(otherRef == Constants.GoldenFrame);
                        }
                        else
                        {
                            predContext = 2 * (ToInt(aboveRef0 == Constants.GoldenFrame) + ToInt(leftRef0 == Constants.GoldenFrame));
                        }
                    }
                }
            }
            else if (hasAbove || hasLeft)
            {
                // One edge available
                ref ModeInfo edge = ref hasAbove ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;

                if (!edge.IsInterBlock())
                {
                    predContext = 2;
                }
                else if (edge.HasSecondRef())
                {
                    predContext = 3 * ToInt(edge.RefFrame[0] == Constants.GoldenFrame ||
                                            edge.RefFrame[1] == Constants.GoldenFrame);
                }
                else if (edge.RefFrame[0] == Constants.LastFrame)
                {
                    predContext = 2;
                }
                else
                {
                    predContext = 4 * ToInt(edge.RefFrame[0] == Constants.GoldenFrame);
                }
            }
            else
            {
                // No edges available (2)
                predContext = 2;
            }

            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
            return predContext;
        }
    }
}
diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs new file mode 100644 index 00000000..5c52c32f --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs @@ -0,0 +1,203 @@ +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using System; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class QuantCommon + { + public const int MinQ = 0; + public const int MaxQ = 255; + + private static readonly short[] DcQlookup = new short[] + { + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, + 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, + 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, + 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, + 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, + 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, + 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, + 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, + 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, + 304, 309, 313, 317, 322, 326, 330, 335, 
340, 344, 349, 354, 359, 364, + 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, + 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, + 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, + 1184, 1232, 1282, 1336, + }; + + private static readonly short[] DcQlookup10 = new short[] + { + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, + 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, + 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, + 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, + 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, + 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, + 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, + 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, + 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, + 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, + 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, + 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, + 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, + 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, + 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, + 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, + 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, + 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, + }; + + private static readonly short[] DcQlookup12 = new short[] + { + 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, + 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, + 251, 266, 281, 296, 312, 327, 
343, 358, 374, 390, 405, + 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, + 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, + 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, + 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080, + 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, + 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, + 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, + 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, + 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, + 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, + 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, + 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, + 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, + 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, + 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, + 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, + 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, + 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, + 19718, 20521, 21387, + }; + + private static readonly short[] AcQlookup = new short[] + { + 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, + 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, + 146, 148, 150, 152, 155, 
158, 161, 164, 167, 170, 173, 176, 179, + 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, + 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, + 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, + 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, + 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, + 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, + 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, + 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, + 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, + }; + + private static readonly short[] AcQlookup10 = new short[] + { + 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, + 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, + 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, + 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, + 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, + 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, + 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, + 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, + 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, + 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, + 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, + 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, + 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, + 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, + 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, + 3803, 3876, 3952, 
4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, + 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, + 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, + };

        // AC quantizer lookup selected by AcQuant for BitDepth.Bits12.
        private static readonly short[] AcQlookup12 = new short[]
        {
            4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99,
            112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263,
            280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456,
            475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660,
            679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865,
            884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067,
            1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264,
            1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
            1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693,
            1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052,
            2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411,
            2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943,
            2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555,
            3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310,
            4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256,
            5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
            6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867,
            8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660,
            9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885,
            12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637,
            14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062,
            18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334,
            22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599,
            28143, 28687, 29247,
        };

        // Returns the DC quantizer for the given bit depth, looked up at qindex + delta
        // clamped to [MinQ, MaxQ]. Any other bit depth asserts and yields -1.
        public static short DcQuant(int qindex, int delta, BitDepth bitDepth)
        {
            short[] table;

            if (bitDepth == BitDepth.Bits8)
            {
                table = DcQlookup;
            }
            else if (bitDepth == BitDepth.Bits10)
            {
                table = DcQlookup10;
            }
            else if (bitDepth == BitDepth.Bits12)
            {
                table = DcQlookup12;
            }
            else
            {
                Debug.Assert(false, "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
                return -1;
            }

            return table[Math.Clamp(qindex + delta, MinQ, MaxQ)];
        }

        // Returns the AC quantizer for the given bit depth, looked up at qindex + delta
        // clamped to [MinQ, MaxQ]. Any other bit depth asserts and yields -1.
        public static short AcQuant(int qindex, int delta, BitDepth bitDepth)
        {
            short[] table;

            if (bitDepth == BitDepth.Bits8)
            {
                table = AcQlookup;
            }
            else if (bitDepth == BitDepth.Bits10)
            {
                table = AcQlookup10;
            }
            else if (bitDepth == BitDepth.Bits12)
            {
                table = AcQlookup12;
            }
            else
            {
                Debug.Assert(false, "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
                return -1;
            }

            return table[Math.Clamp(qindex + delta, MinQ, MaxQ)];
        }

        // Returns the quantizer index for a segment: baseQIndex when the SegLvlAltQ feature
        // is inactive, otherwise the segment data (absolute, or added to baseQIndex)
        // clamped to [MinQ, MaxQ].
        public static int GetQIndex(ref Segmentation seg, int segmentId, int baseQIndex)
        {
            if (seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlAltQ) == 0)
            {
                return baseQIndex;
            }

            int segQIndex = seg.GetSegData(segmentId, SegLvlFeatures.SegLvlAltQ);

            if (seg.AbsDelta != Constants.SegmentAbsData)
            {
                // The segment data is a delta relative to the base index.
                segQIndex += baseQIndex;
            }

            return Math.Clamp(segQIndex, MinQ, MaxQ);
        }
    }
}
diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs new file mode 100644 index 00000000..a4c295e5 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs @@ -0,0 +1,234 @@
using Ryujinx.Common.Memory;
using Ryujinx.Graphics.Nvdec.Vp9.Types;
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;

namespace Ryujinx.Graphics.Nvdec.Vp9
{
    internal static class ReconInter
    {
        // Forwards to sf.InterPredict, passing 0/1 flags that say whether subpelX and
        // subpelY are non-zero ahead of the remaining arguments.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static unsafe void InterPredictor(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            int subpelX,
            int subpelY,
            ref ScaleFactors sf,
            int w,
            int h,
            int refr,
            Array8<short>[] kernel,
            int xs,
            int ys)
        {
            int hasSubpelX = subpelX != 0 ? 1 : 0;
            int hasSubpelY = subpelY != 0 ? 1 : 0;

            sf.InterPredict(hasSubpelX, hasSubpelY, refr, src, srcStride, dst, dstStride, subpelX, subpelY, w, h, kernel, xs, ys);
        }

        // Forwards to sf.HighbdInterPredict with the same 0/1 sub-pixel flags, plus the
        // trailing bd argument.
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static unsafe void HighbdInterPredictor(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            int subpelX,
            int subpelY,
            ref ScaleFactors sf,
            int w,
            int h,
            int refr,
            Array8<short>[] kernel,
            int xs,
            int ys,
            int bd)
        {
            int hasSubpelX = subpelX != 0 ? 1 : 0;
            int hasSubpelY = subpelY != 0 ? 1 : 0;

            sf.HighbdInterPredict(hasSubpelX, hasSubpelY, refr, src, srcStride, dst, dstStride, subpelX, subpelY, w, h, kernel, xs, ys, bd);
        }

        private static int RoundMvCompQ4(int value) + { + return (value < 0 ? 
value - 2 : value + 2) / 4; + } + + private static Mv MiMvPredQ4(ref ModeInfo mi, int idx) + { + Mv res = new Mv() + { + Row = (short)RoundMvCompQ4( + mi.Bmi[0].Mv[idx].Row + mi.Bmi[1].Mv[idx].Row + + mi.Bmi[2].Mv[idx].Row + mi.Bmi[3].Mv[idx].Row), + Col = (short)RoundMvCompQ4( + mi.Bmi[0].Mv[idx].Col + mi.Bmi[1].Mv[idx].Col + + mi.Bmi[2].Mv[idx].Col + mi.Bmi[3].Mv[idx].Col) + }; + return res; + } + + private static int RoundMvCompQ2(int value) + { + return (value < 0 ? value - 1 : value + 1) / 2; + } + + private static Mv MiMvPredQ2(ref ModeInfo mi, int idx, int block0, int block1) + { + Mv res = new Mv() + { + Row = (short)RoundMvCompQ2( + mi.Bmi[block0].Mv[idx].Row + + mi.Bmi[block1].Mv[idx].Row), + Col = (short)RoundMvCompQ2( + mi.Bmi[block0].Mv[idx].Col + + mi.Bmi[block1].Mv[idx].Col) + }; + return res; + } + + public static Mv ClampMvToUmvBorderSb(ref MacroBlockD xd, ref Mv srcMv, int bw, int bh, int ssX, int ssY) + { + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the MV can be + // discarded and the MV limited to 16 pixels with equivalent results. + int spelLeft = (Constants.Vp9InterpExtend + bw) << SubpelBits; + int spelRight = spelLeft - SubpelShifts; + int spelTop = (Constants.Vp9InterpExtend + bh) << SubpelBits; + int spelBottom = spelTop - SubpelShifts; + Mv clampedMv = new Mv() + { + Row = (short)(srcMv.Row * (1 << (1 - ssY))), + Col = (short)(srcMv.Col * (1 << (1 - ssX))) + }; + + Debug.Assert(ssX <= 1); + Debug.Assert(ssY <= 1); + + clampedMv.ClampMv( + xd.MbToLeftEdge * (1 << (1 - ssX)) - spelLeft, + xd.MbToRightEdge * (1 << (1 - ssX)) + spelRight, + xd.MbToTopEdge * (1 << (1 - ssY)) - spelTop, + xd.MbToBottomEdge * (1 << (1 - ssY)) + spelBottom); + + return clampedMv; + } + + public static Mv AverageSplitMvs(ref MacroBlockDPlane pd, ref ModeInfo mi, int refr, int block) + { + int ssIdx = ((pd.SubsamplingX > 0 ? 1 : 0) << 1) | (pd.SubsamplingY > 0 ? 
1 : 0); + Mv res = new Mv(); + switch (ssIdx) + { + case 0: res = mi.Bmi[block].Mv[refr]; break; + case 1: res = MiMvPredQ2(ref mi, refr, block, block + 2); break; + case 2: res = MiMvPredQ2(ref mi, refr, block, block + 1); break; + case 3: res = MiMvPredQ4(ref mi, refr); break; + default: Debug.Assert(ssIdx <= 3 && ssIdx >= 0); break; + } + return res; + } + + private static int ScaledBufferOffset(int xOffset, int yOffset, int stride, Ptr<ScaleFactors> sf) + { + int x = !sf.IsNull ? sf.Value.ScaleValueX(xOffset) : xOffset; + int y = !sf.IsNull ? sf.Value.ScaleValueY(yOffset) : yOffset; + return y * stride + x; + } + + private static void SetupPredPlanes( + ref Buf2D dst, + ArrayPtr<byte> src, + int stride, + int miRow, + int miCol, + Ptr<ScaleFactors> scale, + int subsamplingX, + int subsamplingY) + { + int x = (Constants.MiSize * miCol) >> subsamplingX; + int y = (Constants.MiSize * miRow) >> subsamplingY; + dst.Buf = src.Slice(ScaledBufferOffset(x, y, stride, scale)); + dst.Stride = stride; + } + + public static void SetupDstPlanes( + ref Array3<MacroBlockDPlane> planes, + ref Surface src, + int miRow, + int miCol) + { + Span<ArrayPtr<byte>> buffers = stackalloc ArrayPtr<byte>[Constants.MaxMbPlane]; + buffers[0] = src.YBuffer; + buffers[1] = src.UBuffer; + buffers[2] = src.VBuffer; + Span<int> strides = stackalloc int[Constants.MaxMbPlane]; + strides[0] = src.Stride; + strides[1] = src.UvStride; + strides[2] = src.UvStride; + int i; + + for (i = 0; i < Constants.MaxMbPlane; ++i) + { + ref MacroBlockDPlane pd = ref planes[i]; + SetupPredPlanes(ref pd.Dst, buffers[i], strides[i], miRow, miCol, Ptr<ScaleFactors>.Null, pd.SubsamplingX, pd.SubsamplingY); + } + } + + public static void SetupPrePlanes( + ref MacroBlockD xd, + int idx, + ref Surface src, + int miRow, + int miCol, + Ptr<ScaleFactors> sf) + { + if (!src.YBuffer.IsNull && !src.UBuffer.IsNull && !src.VBuffer.IsNull) + { + Span<ArrayPtr<byte>> buffers = stackalloc ArrayPtr<byte>[Constants.MaxMbPlane]; + 
buffers[0] = src.YBuffer; + buffers[1] = src.UBuffer; + buffers[2] = src.VBuffer; + Span<int> strides = stackalloc int[Constants.MaxMbPlane]; + strides[0] = src.Stride; + strides[1] = src.UvStride; + strides[2] = src.UvStride; + int i; + + for (i = 0; i < Constants.MaxMbPlane; ++i) + { + ref MacroBlockDPlane pd = ref xd.Plane[i]; + SetupPredPlanes(ref pd.Pre[idx], buffers[i], strides[i], miRow, miCol, sf, pd.SubsamplingX, pd.SubsamplingY); + } + } + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs new file mode 100644 index 00000000..e346c01d --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs @@ -0,0 +1,762 @@ +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using System; +using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.IntraPred; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class ReconIntra + { + public static readonly TxType[] IntraModeToTxTypeLookup = new TxType[] + { + TxType.DctDct, // DC + TxType.AdstDct, // V + TxType.DctAdst, // H + TxType.DctDct, // D45 + TxType.AdstAdst, // D135 + TxType.AdstDct, // D117 + TxType.DctAdst, // D153 + TxType.DctAdst, // D207 + TxType.AdstDct, // D63 + TxType.AdstAdst // TM + }; + + private const int NeedLeft = 1 << 1; + private const int NeedAbove = 1 << 2; + private const int NeedAboveRight = 1 << 3; + + private static ReadOnlySpan<byte> ExtendModes => new byte[] + { + NeedAbove | NeedLeft, // DC + NeedAbove, // V + NeedLeft, // H + NeedAboveRight, // D45 + NeedLeft | NeedAbove, // D135 + NeedLeft | NeedAbove, // D117 + NeedLeft | NeedAbove, // D153 + NeedLeft, // D207 + NeedAboveRight, // D63 + NeedLeft | NeedAbove, // TM + }; + + private unsafe delegate void IntraPredFn(byte* dst, int stride, byte* above, byte* left); + + private static unsafe IntraPredFn[][] _pred = new IntraPredFn[][] + { + new IntraPredFn[] + { + null, + null, + null, + null + }, + new IntraPredFn[] + { + VPredictor4x4, + 
VPredictor8x8, + VPredictor16x16, + VPredictor32x32 + }, + new IntraPredFn[] + { + HPredictor4x4, + HPredictor8x8, + HPredictor16x16, + HPredictor32x32 + }, + new IntraPredFn[] + { + D45Predictor4x4, + D45Predictor8x8, + D45Predictor16x16, + D45Predictor32x32 + }, + new IntraPredFn[] + { + D135Predictor4x4, + D135Predictor8x8, + D135Predictor16x16, + D135Predictor32x32 + }, + new IntraPredFn[] + { + D117Predictor4x4, + D117Predictor8x8, + D117Predictor16x16, + D117Predictor32x32 + }, + new IntraPredFn[] + { + D153Predictor4x4, + D153Predictor8x8, + D153Predictor16x16, + D153Predictor32x32 + }, + new IntraPredFn[] + { + D207Predictor4x4, + D207Predictor8x8, + D207Predictor16x16, + D207Predictor32x32 + }, + new IntraPredFn[] + { + D63Predictor4x4, + D63Predictor8x8, + D63Predictor16x16, + D63Predictor32x32 + }, + new IntraPredFn[] + { + TMPredictor4x4, + TMPredictor8x8, + TMPredictor16x16, + TMPredictor32x32 + } + }; + + private static unsafe IntraPredFn[][][] _dcPred = new IntraPredFn[][][] + { + new IntraPredFn[][] + { + new IntraPredFn[] + { + Dc128Predictor4x4, + Dc128Predictor8x8, + Dc128Predictor16x16, + Dc128Predictor32x32 + }, + new IntraPredFn[] + { + DcTopPredictor4x4, + DcTopPredictor8x8, + DcTopPredictor16x16, + DcTopPredictor32x32 + } + }, + new IntraPredFn[][] + { + new IntraPredFn[] + { + DcLeftPredictor4x4, + DcLeftPredictor8x8, + DcLeftPredictor16x16, + DcLeftPredictor32x32 + }, + new IntraPredFn[] + { + DcPredictor4x4, + DcPredictor8x8, + DcPredictor16x16, + DcPredictor32x32 + } + } + }; + + private unsafe delegate void IntraHighPredFn(ushort* dst, int stride, ushort* above, ushort* left, int bd); + + private static unsafe IntraHighPredFn[][] _predHigh = new IntraHighPredFn[][] + { + new IntraHighPredFn[] + { + null, + null, + null, + null + }, + new IntraHighPredFn[] + { + HighbdVPredictor4x4, + HighbdVPredictor8x8, + HighbdVPredictor16x16, + HighbdVPredictor32x32 + }, + new IntraHighPredFn[] + { + HighbdHPredictor4x4, + HighbdHPredictor8x8, + 
HighbdHPredictor16x16, + HighbdHPredictor32x32 + }, + new IntraHighPredFn[] + { + HighbdD45Predictor4x4, + HighbdD45Predictor8x8, + HighbdD45Predictor16x16, + HighbdD45Predictor32x32 + }, + new IntraHighPredFn[] + { + HighbdD135Predictor4x4, + HighbdD135Predictor8x8, + HighbdD135Predictor16x16, + HighbdD135Predictor32x32 + }, + new IntraHighPredFn[] + { + HighbdD117Predictor4x4, + HighbdD117Predictor8x8, + HighbdD117Predictor16x16, + HighbdD117Predictor32x32 + }, + new IntraHighPredFn[] + { + HighbdD153Predictor4x4, + HighbdD153Predictor8x8, + HighbdD153Predictor16x16, + HighbdD153Predictor32x32 + }, + new IntraHighPredFn[] + { + HighbdD207Predictor4x4, + HighbdD207Predictor8x8, + HighbdD207Predictor16x16, + HighbdD207Predictor32x32 + }, + new IntraHighPredFn[] + { + HighbdD63Predictor4x4, + HighbdD63Predictor8x8, + HighbdD63Predictor16x16, + HighbdD63Predictor32x32 + }, + new IntraHighPredFn[] + { + HighbdTMPredictor4x4, + HighbdTMPredictor8x8, + HighbdTMPredictor16x16, + HighbdTMPredictor32x32 + } + }; + + private static unsafe IntraHighPredFn[][][] _dcPredHigh = new IntraHighPredFn[][][] + { + new IntraHighPredFn[][] + { + new IntraHighPredFn[] + { + HighbdDc128Predictor4x4, + HighbdDc128Predictor8x8, + HighbdDc128Predictor16x16, + HighbdDc128Predictor32x32 + }, + new IntraHighPredFn[] + { + HighbdDcTopPredictor4x4, + HighbdDcTopPredictor8x8, + HighbdDcTopPredictor16x16, + HighbdDcTopPredictor32x32 + } + }, + new IntraHighPredFn[][] + { + new IntraHighPredFn[] + { + HighbdDcLeftPredictor4x4, + HighbdDcLeftPredictor8x8, + HighbdDcLeftPredictor16x16, + HighbdDcLeftPredictor32x32 + }, + new IntraHighPredFn[] + { + HighbdDcPredictor4x4, + HighbdDcPredictor8x8, + HighbdDcPredictor16x16, + HighbdDcPredictor32x32 + } + } + }; + + private static unsafe void BuildIntraPredictorsHigh( + ref MacroBlockD xd, + byte* ref8, + int refStride, + byte* dst8, + int dstStride, + PredictionMode mode, + TxSize txSize, + int upAvailable, + int leftAvailable, + int rightAvailable, + 
int x, + int y, + int plane) + { + int i; + ushort* dst = (ushort*)dst8; + ushort* refr = (ushort*)ref8; + ushort* leftCol = stackalloc ushort[32]; + ushort* aboveData = stackalloc ushort[64 + 16]; + ushort* aboveRow = aboveData + 16; + ushort* constAboveRow = aboveRow; + int bs = 4 << (int)txSize; + int frameWidth, frameHeight; + int x0, y0; + ref MacroBlockDPlane pd = ref xd.Plane[plane]; + int needLeft = ExtendModes[(int)mode] & NeedLeft; + int needAbove = ExtendModes[(int)mode] & NeedAbove; + int needAboveRight = ExtendModes[(int)mode] & NeedAboveRight; + int baseVal = 128 << (xd.Bd - 8); + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1. + + // Get current frame pointer, width and height. + if (plane == 0) + { + frameWidth = xd.CurBuf.Width; + frameHeight = xd.CurBuf.Height; + } + else + { + frameWidth = xd.CurBuf.UvWidth; + frameHeight = xd.CurBuf.UvHeight; + } + + // Get block position in current frame. 
+ x0 = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX)) + x; + y0 = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)) + y; + + // NEED_LEFT + if (needLeft != 0) + { + if (leftAvailable != 0) + { + if (xd.MbToBottomEdge < 0) + { + /* slower path if the block needs border extension */ + if (y0 + bs <= frameHeight) + { + for (i = 0; i < bs; ++i) + { + leftCol[i] = refr[i * refStride - 1]; + } + } + else + { + int extendBottom = frameHeight - y0; + for (i = 0; i < extendBottom; ++i) + { + leftCol[i] = refr[i * refStride - 1]; + } + + for (; i < bs; ++i) + { + leftCol[i] = refr[(extendBottom - 1) * refStride - 1]; + } + } + } + else + { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) + { + leftCol[i] = refr[i * refStride - 1]; + } + } + } + else + { + MemoryUtil.Fill(leftCol, (ushort)(baseVal + 1), bs); + } + } + + // NEED_ABOVE + if (needAbove != 0) + { + if (upAvailable != 0) + { + ushort* aboveRef = refr - refStride; + if (xd.MbToRightEdge < 0) + { + /* slower path if the block needs border extension */ + if (x0 + bs <= frameWidth) + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + } + else if (x0 <= frameWidth) + { + int r = frameWidth - x0; + MemoryUtil.Copy(aboveRow, aboveRef, r); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + bs - frameWidth); + } + } + else + { + /* faster path if the block does not need extension */ + if (bs == 4 && rightAvailable != 0 && leftAvailable != 0) + { + constAboveRow = aboveRef; + } + else + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + } + } + aboveRow[-1] = leftAvailable != 0 ? 
aboveRef[-1] : (ushort)(baseVal + 1); + } + else + { + MemoryUtil.Fill(aboveRow, (ushort)(baseVal - 1), bs); + aboveRow[-1] = (ushort)(baseVal - 1); + } + } + + // NEED_ABOVERIGHT + if (needAboveRight != 0) + { + if (upAvailable != 0) + { + ushort* aboveRef = refr - refStride; + if (xd.MbToRightEdge < 0) + { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frameWidth) + { + if (rightAvailable != 0 && bs == 4) + { + MemoryUtil.Copy(aboveRow, aboveRef, 2 * bs); + } + else + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs); + } + } + else if (x0 + bs <= frameWidth) + { + int r = frameWidth - x0; + if (rightAvailable != 0 && bs == 4) + { + MemoryUtil.Copy(aboveRow, aboveRef, r); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth); + } + else + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs); + } + } + else if (x0 <= frameWidth) + { + int r = frameWidth - x0; + MemoryUtil.Copy(aboveRow, aboveRef, r); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth); + } + aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (ushort)(baseVal + 1); + } + else + { + /* faster path if the block does not need extension */ + if (bs == 4 && rightAvailable != 0 && leftAvailable != 0) + { + constAboveRow = aboveRef; + } + else + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + if (bs == 4 && rightAvailable != 0) + { + MemoryUtil.Copy(aboveRow + bs, aboveRef + bs, bs); + } + else + { + MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs); + } + + aboveRow[-1] = leftAvailable != 0 ? 
aboveRef[-1] : (ushort)(baseVal + 1); + } + } + } + else + { + MemoryUtil.Fill(aboveRow, (ushort)(baseVal - 1), bs * 2); + aboveRow[-1] = (ushort)(baseVal - 1); + } + } + + // Predict + if (mode == PredictionMode.DcPred) + { + _dcPredHigh[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd); + } + else + { + _predHigh[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd); + } + } + + public static unsafe void BuildIntraPredictors( + ref MacroBlockD xd, + byte* refr, + int refStride, + byte* dst, + int dstStride, + PredictionMode mode, + TxSize txSize, + int upAvailable, + int leftAvailable, + int rightAvailable, + int x, + int y, + int plane) + { + int i; + byte* leftCol = stackalloc byte[32]; + byte* aboveData = stackalloc byte[64 + 16]; + byte* aboveRow = aboveData + 16; + byte* constAboveRow = aboveRow; + int bs = 4 << (int)txSize; + int frameWidth, frameHeight; + int x0, y0; + ref MacroBlockDPlane pd = ref xd.Plane[plane]; + + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // .. + + // Get current frame pointer, width and height. + if (plane == 0) + { + frameWidth = xd.CurBuf.Width; + frameHeight = xd.CurBuf.Height; + } + else + { + frameWidth = xd.CurBuf.UvWidth; + frameHeight = xd.CurBuf.UvHeight; + } + + // Get block position in current frame. 
+ x0 = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX)) + x; + y0 = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)) + y; + + // NEED_LEFT + if ((ExtendModes[(int)mode] & NeedLeft) != 0) + { + if (leftAvailable != 0) + { + if (xd.MbToBottomEdge < 0) + { + /* Slower path if the block needs border extension */ + if (y0 + bs <= frameHeight) + { + for (i = 0; i < bs; ++i) + { + leftCol[i] = refr[i * refStride - 1]; + } + } + else + { + int extendBottom = frameHeight - y0; + for (i = 0; i < extendBottom; ++i) + { + leftCol[i] = refr[i * refStride - 1]; + } + + for (; i < bs; ++i) + { + leftCol[i] = refr[(extendBottom - 1) * refStride - 1]; + } + } + } + else + { + /* Faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) + { + leftCol[i] = refr[i * refStride - 1]; + } + } + } + else + { + MemoryUtil.Fill(leftCol, (byte)129, bs); + } + } + + // NEED_ABOVE + if ((ExtendModes[(int)mode] & NeedAbove) != 0) + { + if (upAvailable != 0) + { + byte* aboveRef = refr - refStride; + if (xd.MbToRightEdge < 0) + { + /* Slower path if the block needs border extension */ + if (x0 + bs <= frameWidth) + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + } + else if (x0 <= frameWidth) + { + int r = frameWidth - x0; + MemoryUtil.Copy(aboveRow, aboveRef, r); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + bs - frameWidth); + } + } + else + { + /* Faster path if the block does not need extension */ + if (bs == 4 && rightAvailable != 0 && leftAvailable != 0) + { + constAboveRow = aboveRef; + } + else + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + } + } + aboveRow[-1] = leftAvailable != 0 ? 
aboveRef[-1] : (byte)129; + } + else + { + MemoryUtil.Fill(aboveRow, (byte)127, bs); + aboveRow[-1] = 127; + } + } + + // NEED_ABOVERIGHT + if ((ExtendModes[(int)mode] & NeedAboveRight) != 0) + { + if (upAvailable != 0) + { + byte* aboveRef = refr - refStride; + if (xd.MbToRightEdge < 0) + { + /* Slower path if the block needs border extension */ + if (x0 + 2 * bs <= frameWidth) + { + if (rightAvailable != 0 && bs == 4) + { + MemoryUtil.Copy(aboveRow, aboveRef, 2 * bs); + } + else + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs); + } + } + else if (x0 + bs <= frameWidth) + { + int r = frameWidth - x0; + if (rightAvailable != 0 && bs == 4) + { + MemoryUtil.Copy(aboveRow, aboveRef, r); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth); + } + else + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs); + } + } + else if (x0 <= frameWidth) + { + int r = frameWidth - x0; + MemoryUtil.Copy(aboveRow, aboveRef, r); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth); + } + } + else + { + /* Faster path if the block does not need extension */ + if (bs == 4 && rightAvailable != 0 && leftAvailable != 0) + { + constAboveRow = aboveRef; + } + else + { + MemoryUtil.Copy(aboveRow, aboveRef, bs); + if (bs == 4 && rightAvailable != 0) + { + MemoryUtil.Copy(aboveRow + bs, aboveRef + bs, bs); + } + else + { + MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs); + } + } + } + aboveRow[-1] = leftAvailable != 0 ? 
aboveRef[-1] : (byte)129; + } + else + { + MemoryUtil.Fill(aboveRow, (byte)127, bs * 2); + aboveRow[-1] = 127; + } + } + + // Predict + if (mode == PredictionMode.DcPred) + { + _dcPred[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol); + } + else + { + _pred[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol); + } + } + + public static unsafe void PredictIntraBlock( + ref MacroBlockD xd, + int bwlIn, + TxSize txSize, + PredictionMode mode, + byte* refr, + int refStride, + byte* dst, + int dstStride, + int aoff, + int loff, + int plane) + { + int bw = 1 << bwlIn; + int txw = 1 << (int)txSize; + int haveTop = loff != 0 || !xd.AboveMi.IsNull ? 1 : 0; + int haveLeft = aoff != 0 || !xd.LeftMi.IsNull ? 1 : 0; + int haveRight = (aoff + txw) < bw ? 1 : 0; + int x = aoff * 4; + int y = loff * 4; + + if (xd.CurBuf.HighBd) + { + BuildIntraPredictorsHigh( + ref xd, + refr, + refStride, + dst, + dstStride, + mode, + txSize, + haveTop, + haveLeft, + haveRight, + x, + y, + plane); + return; + } + BuildIntraPredictors( + ref xd, + refr, + refStride, + dst, + dstStride, + mode, + txSize, + haveTop, + haveLeft, + haveRight, + x, + y, + plane); + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Ryujinx.Graphics.Nvdec.Vp9.csproj b/src/Ryujinx.Graphics.Nvdec.Vp9/Ryujinx.Graphics.Nvdec.Vp9.csproj new file mode 100644 index 00000000..bff1e803 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Ryujinx.Graphics.Nvdec.Vp9.csproj @@ -0,0 +1,13 @@ +<Project Sdk="Microsoft.NET.Sdk"> + + <PropertyGroup> + <TargetFramework>net7.0</TargetFramework> + <AllowUnsafeBlocks>true</AllowUnsafeBlocks> + </PropertyGroup> + + <ItemGroup> + <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" /> + <ProjectReference Include="..\Ryujinx.Graphics.Video\Ryujinx.Graphics.Video.csproj" /> + </ItemGroup> + +</Project> diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs new file mode 100644 index 
00000000..c5a25e6b --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs @@ -0,0 +1,11 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal struct TileBuffer + { + public int Col; + public ArrayPtr<byte> Data; + public int Size; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs new file mode 100644 index 00000000..333a077a --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs @@ -0,0 +1,20 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Dsp; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using Ryujinx.Graphics.Video; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal struct TileWorkerData + { + public ArrayPtr<byte> DataEnd; + public int BufStart; + public int BufEnd; + public Reader BitReader; + public Vp9BackwardUpdates Counts; + public MacroBlockD Xd; + /* dqcoeff are shared by all the planes. So planes must be decoded serially */ + public Array32<Array32<int>> Dqcoeff; + public InternalErrorInfo ErrorInfo; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs new file mode 100644 index 00000000..9e1cd8b4 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs @@ -0,0 +1,10 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct BModeInfo + { + public PredictionMode Mode; + public Array2<Mv> Mv; // First, second inter predictor motion vectors + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs new file mode 100644 index 00000000..22a48e20 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs @@ -0,0 +1,21 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum BlockSize + { + Block4x4 = 0, + Block4x8 = 1, + Block8x4 = 2, + Block8x8 = 3, + Block8x16 = 4, + Block16x8 = 5, + Block16x16 = 6, + Block16x32 
= 7, + Block32x16 = 8, + Block32x32 = 9, + Block32x64 = 10, + Block64x32 = 11, + Block64x64 = 12, + BlockSizes = 13, + BlockInvalid = BlockSizes + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs new file mode 100644 index 00000000..180d5e34 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs @@ -0,0 +1,10 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct Buf2D + { + public ArrayPtr<byte> Buf; + public int Stride; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs new file mode 100644 index 00000000..a783999e --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs @@ -0,0 +1,8 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum FrameType + { + KeyFrame = 0, + InterFrame = 1 + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs new file mode 100644 index 00000000..8dc33bda --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs @@ -0,0 +1,27 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct LoopFilter + { + public int FilterLevel; + public int LastFiltLevel; + + public int SharpnessLevel; + public int LastSharpnessLevel; + + public bool ModeRefDeltaEnabled; + public bool ModeRefDeltaUpdate; + + // 0 = Intra, Last, GF, ARF + public Array4<sbyte> RefDeltas; + public Array4<sbyte> LastRefDeltas; + + // 0 = ZERO_MV, MV + public Array2<sbyte> ModeDeltas; + public Array2<sbyte> LastModeDeltas; + + public ArrayPtr<LoopFilterMask> Lfm; + public int LfmStride; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs new file mode 100644 index 00000000..0ac38a7b --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs @@ -0,0 
+1,10 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct LoopFilterInfoN + { + public Array64<LoopFilterThresh> Lfthr; + public Array8<Array4<Array2<byte>>> Lvl; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs new file mode 100644 index 00000000..4aff843a --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs @@ -0,0 +1,24 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + // This structure holds bit masks for all 8x8 blocks in a 64x64 region. + // Each 1 bit represents a position in which we want to apply the loop filter. + // Left_ entries refer to whether we apply a filter on the border to the + // left of the block. Above_ entries refer to whether or not to apply a + // filter on the above border. Int_ entries refer to whether or not to + // apply borders on the 4x4 edges within the 8x8 block that each bit + // represents. + // Since each transform is accompanied by a potentially different type of + // loop filter there is a different entry in the array for each transform size. + internal struct LoopFilterMask + { + public Array4<ulong> LeftY; + public Array4<ulong> AboveY; + public ulong Int4x4Y; + public Array4<ushort> LeftUv; + public Array4<ushort> AboveUv; + public ushort Int4x4Uv; + public Array64<byte> LflY; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs new file mode 100644 index 00000000..edd79af4 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs @@ -0,0 +1,15 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + // Need to align this structure so when it is declared and + // passed it can be loaded into vector registers. 
+ internal struct LoopFilterThresh + { +#pragma warning disable CS0649 + public Array16<byte> Mblim; + public Array16<byte> Lim; + public Array16<byte> HevThr; +#pragma warning restore CS0649 + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs new file mode 100644 index 00000000..f1111528 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs @@ -0,0 +1,179 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Video; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct MacroBlockD + { + public Array3<MacroBlockDPlane> Plane; + public byte BmodeBlocksWl; + public byte BmodeBlocksHl; + + public Ptr<Vp9BackwardUpdates> Counts; + public TileInfo Tile; + + public int MiStride; + + // Grid of 8x8 cells is placed over the block. + // If some of them belong to the same mbtree-block + // they will just have same mi[i][j] value + public ArrayPtr<Ptr<ModeInfo>> Mi; + public Ptr<ModeInfo> LeftMi; + public Ptr<ModeInfo> AboveMi; + + public uint MaxBlocksWide; + public uint MaxBlocksHigh; + + public ArrayPtr<Array3<byte>> PartitionProbs; + + /* Distance of MB away from frame edges */ + public int MbToLeftEdge; + public int MbToRightEdge; + public int MbToTopEdge; + public int MbToBottomEdge; + + public Ptr<Vp9EntropyProbs> Fc; + + /* pointers to reference frames */ + public Array2<Ptr<RefBuffer>> BlockRefs; + + /* pointer to current frame */ + public Surface CurBuf; + + public Array3<ArrayPtr<sbyte>> AboveContext; + public Array3<Array16<sbyte>> LeftContext; + + public ArrayPtr<sbyte> AboveSegContext; + public Array8<sbyte> LeftSegContext; + + /* Bit depth: 8, 10, 12 */ + public int Bd; + + public bool Lossless; + public bool Corrupted; + + public Ptr<InternalErrorInfo> ErrorInfo; + + public int GetPredContextSegId() + { + sbyte aboveSip = !AboveMi.IsNull ? AboveMi.Value.SegIdPredicted : (sbyte)0; + sbyte leftSip = !LeftMi.IsNull ? 
LeftMi.Value.SegIdPredicted : (sbyte)0; + + return aboveSip + leftSip; + } + + public int GetSkipContext() + { + int aboveSkip = !AboveMi.IsNull ? AboveMi.Value.Skip : 0; + int leftSkip = !LeftMi.IsNull ? LeftMi.Value.Skip : 0; + return aboveSkip + leftSkip; + } + + public int GetPredContextSwitchableInterp() + { + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + int leftType = !LeftMi.IsNull ? LeftMi.Value.InterpFilter : Constants.SwitchableFilters; + int aboveType = !AboveMi.IsNull ? AboveMi.Value.InterpFilter : Constants.SwitchableFilters; + + if (leftType == aboveType) + { + return leftType; + } + else if (leftType == Constants.SwitchableFilters) + { + return aboveType; + } + else if (aboveType == Constants.SwitchableFilters) + { + return leftType; + } + else + { + return Constants.SwitchableFilters; + } + } + + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + // 0 - inter/inter, inter/--, --/inter, --/-- + // 1 - intra/inter, inter/intra + // 2 - intra/--, --/intra + // 3 - intra/intra + public int GetIntraInterContext() + { + if (!AboveMi.IsNull && !LeftMi.IsNull) + { // Both edges available + bool aboveIntra = !AboveMi.Value.IsInterBlock(); + bool leftIntra = !LeftMi.Value.IsInterBlock(); + return leftIntra && aboveIntra ? 3 : (leftIntra || aboveIntra ? 1 : 0); + } + else if (!AboveMi.IsNull || !LeftMi.IsNull) + { // One edge available + return 2 * (!(!AboveMi.IsNull ? AboveMi.Value : LeftMi.Value).IsInterBlock() ? 1 : 0); + } + return 0; + } + + // Returns a context number for the given MB prediction signal + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real blocks. 
+ // The prediction flags in these dummy entries are initialized to 0. + public int GetTxSizeContext() + { + int maxTxSize = (int)Luts.MaxTxSizeLookup[(int)Mi[0].Value.SbType]; + int aboveCtx = (!AboveMi.IsNull && AboveMi.Value.Skip == 0) ? (int)AboveMi.Value.TxSize : maxTxSize; + int leftCtx = (!LeftMi.IsNull && LeftMi.Value.Skip == 0) ? (int)LeftMi.Value.TxSize : maxTxSize; + if (LeftMi.IsNull) + { + leftCtx = aboveCtx; + } + + if (AboveMi.IsNull) + { + aboveCtx = leftCtx; + } + + return (aboveCtx + leftCtx) > maxTxSize ? 1 : 0; + } + + public void SetupBlockPlanes(int ssX, int ssY) + { + int i; + + for (i = 0; i < Constants.MaxMbPlane; i++) + { + Plane[i].SubsamplingX = i != 0 ? ssX : 0; + Plane[i].SubsamplingY = i != 0 ? ssY : 0; + } + } + + public void SetSkipContext(int miRow, int miCol) + { + int aboveIdx = miCol * 2; + int leftIdx = (miRow * 2) & 15; + int i; + for (i = 0; i < Constants.MaxMbPlane; ++i) + { + ref MacroBlockDPlane pd = ref Plane[i]; + pd.AboveContext = AboveContext[i].Slice(aboveIdx >> pd.SubsamplingX); + pd.LeftContext = new ArrayPtr<sbyte>(ref LeftContext[i][leftIdx >> pd.SubsamplingY], 16 - (leftIdx >> pd.SubsamplingY)); + } + } + + internal void SetMiRowCol(ref TileInfo tile, int miRow, int bh, int miCol, int bw, int miRows, int miCols) + { + MbToTopEdge = -((miRow * Constants.MiSize) * 8); + MbToBottomEdge = ((miRows - bh - miRow) * Constants.MiSize) * 8; + MbToLeftEdge = -((miCol * Constants.MiSize) * 8); + MbToRightEdge = ((miCols - bw - miCol) * Constants.MiSize) * 8; + + // Are edges available for intra prediction? + AboveMi = (miRow != 0) ? Mi[-MiStride] : Ptr<ModeInfo>.Null; + LeftMi = (miCol > tile.MiColStart) ? 
Mi[-1] : Ptr<ModeInfo>.Null; + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs new file mode 100644 index 00000000..ae4ec6f4 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs @@ -0,0 +1,21 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct MacroBlockDPlane + { + public ArrayPtr<int> DqCoeff; + public int SubsamplingX; + public int SubsamplingY; + public Buf2D Dst; + public Array2<Buf2D> Pre; + public ArrayPtr<sbyte> AboveContext; + public ArrayPtr<sbyte> LeftContext; + public Array8<Array2<short>> SegDequant; + + // Number of 4x4s in current block + public ushort N4W, N4H; + // Log2 of N4W, N4H + public byte N4Wl, N4Hl; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs new file mode 100644 index 00000000..8ef281d8 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs @@ -0,0 +1,66 @@ +using Ryujinx.Common.Memory; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct ModeInfo + { + // Common for both Inter and Intra blocks + public BlockSize SbType; + public PredictionMode Mode; + public TxSize TxSize; + public sbyte Skip; + public sbyte SegmentId; + public sbyte SegIdPredicted; // Valid only when TemporalUpdate is enabled + + // Only for Intra blocks + public PredictionMode UvMode; + + // Only for Inter blocks + public byte InterpFilter; + + // if ref_frame[idx] is equal to AltRefFrame then + // MacroBlockD.BlockRef[idx] is an altref + public Array2<sbyte> RefFrame; + + public Array2<Mv> Mv; + + public Array4<BModeInfo> Bmi; + + public PredictionMode GetYMode(int block) + { + return SbType < BlockSize.Block8x8 ? 
Bmi[block].Mode : Mode; + } + + public TxSize GetUvTxSize(ref MacroBlockDPlane pd) + { + Debug.Assert(SbType < BlockSize.Block8x8 || + Luts.SsSizeLookup[(int)SbType][pd.SubsamplingX][pd.SubsamplingY] != BlockSize.BlockInvalid); + return Luts.UvTxsizeLookup[(int)SbType][(int)TxSize][pd.SubsamplingX][pd.SubsamplingY]; + } + + public bool IsInterBlock() + { + return RefFrame[0] > Constants.IntraFrame; + } + + public bool HasSecondRef() + { + return RefFrame[1] > Constants.IntraFrame; + } + + private static readonly int[][] IdxNColumnToSubblock = new int[][] + { + new int[] { 1, 2 }, new int[] { 1, 3 }, new int[] { 3, 2 }, new int[] { 3, 3 } + }; + + // This function returns either the appropriate sub block or block's mv + // on whether the block_size < 8x8 and we have check_sub_blocks set. + public Mv GetSubBlockMv(int whichMv, int searchCol, int blockIdx) + { + return blockIdx >= 0 && SbType < BlockSize.Block8x8 + ? Bmi[IdxNColumnToSubblock[blockIdx][searchCol == 0 ? 1 : 0]].Mv[whichMv] + : Mv[whichMv]; + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs new file mode 100644 index 00000000..319c8dba --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs @@ -0,0 +1,14 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum MotionVectorContext + { + BothZero = 0, + ZeroPlusPredicted = 1, + BothPredicted = 2, + NewPlusNonIntra = 3, + BothNew = 4, + IntraPlusNonIntra = 5, + BothIntra = 6, + InvalidCase = 9 + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs new file mode 100644 index 00000000..815bbb32 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs @@ -0,0 +1,189 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Video; +using System; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct Mv + { + public short Row; + public short 
Col; + + private static ReadOnlySpan<byte> LogInBase2 => new byte[] + { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 + }; + + public bool UseMvHp() + { + const int kMvRefThresh = 64; // Threshold for use of high-precision 1/8 mv + return Math.Abs(Row) < kMvRefThresh && Math.Abs(Col) < kMvRefThresh; + } + + public static bool MvJointVertical(MvJointType type) + { + return type == MvJointType.MvJointHzvnz || type == MvJointType.MvJointHnzvnz; + } + + public static bool MvJointHorizontal(MvJointType type) + { + return type == MvJointType.MvJointHnzvz || type == MvJointType.MvJointHnzvnz; + } + + private static int MvClassBase(MvClassType c) + { + return c != 0 ? Constants.Class0Size << ((int)c + 2) : 0; + } + + private static MvClassType GetMvClass(int z, Ptr<int> offset) + { + MvClassType c = (z >= Constants.Class0Size * 4096) ? 
MvClassType.MvClass10 : (MvClassType)LogInBase2[z >> 3]; + if (!offset.IsNull) + { + offset.Value = z - MvClassBase(c); + } + + return c; + } + + private static void IncMvComponent(int v, ref Vp9BackwardUpdates counts, int comp, int incr, int usehp) + { + int s, z, c, o = 0, d, e, f; + Debug.Assert(v != 0); /* Should not be zero */ + s = v < 0 ? 1 : 0; + counts.Sign[comp][s] += (uint)incr; + z = (s != 0 ? -v : v) - 1; /* Magnitude - 1 */ + + c = (int)GetMvClass(z, new Ptr<int>(ref o)); + counts.Classes[comp][c] += (uint)incr; + + d = (o >> 3); /* Int mv data */ + f = (o >> 1) & 3; /* Fractional pel mv data */ + e = (o & 1); /* High precision mv data */ + + if (c == (int)MvClassType.MvClass0) + { + counts.Class0[comp][d] += (uint)incr; + counts.Class0Fp[comp][d][f] += (uint)incr; + counts.Class0Hp[comp][e] += (uint)(usehp * incr); + } + else + { + int i; + int b = c + Constants.Class0Bits - 1; // Number of bits + for (i = 0; i < b; ++i) + { + counts.Bits[comp][i][((d >> i) & 1)] += (uint)incr; + } + + counts.Fp[comp][f] += (uint)incr; + counts.Hp[comp][e] += (uint)(usehp * incr); + } + } + + private MvJointType GetMvJoint() + { + if (Row == 0) + { + return Col == 0 ? MvJointType.MvJointZero : MvJointType.MvJointHnzvz; + } + else + { + return Col == 0 ? 
MvJointType.MvJointHzvnz : MvJointType.MvJointHnzvnz; + } + } + + internal void IncMv(Ptr<Vp9BackwardUpdates> counts) + { + if (!counts.IsNull) + { + MvJointType j = GetMvJoint(); + ++counts.Value.Joints[(int)j]; + + if (MvJointVertical(j)) + { + IncMvComponent(Row, ref counts.Value, 0, 1, 1); + } + + if (MvJointHorizontal(j)) + { + IncMvComponent(Col, ref counts.Value, 1, 1, 1); + } + } + } + + public void ClampMv(int minCol, int maxCol, int minRow, int maxRow) + { + Col = (short)Math.Clamp(Col, minCol, maxCol); + Row = (short)Math.Clamp(Row, minRow, maxRow); + } + + private const int MvBorder = (16 << 3); // Allow 16 pels in 1/8th pel units + + public void ClampMvRef(ref MacroBlockD xd) + { + ClampMv( + xd.MbToLeftEdge - MvBorder, + xd.MbToRightEdge + MvBorder, + xd.MbToTopEdge - MvBorder, + xd.MbToBottomEdge + MvBorder); + } + + public void LowerMvPrecision(bool allowHP) + { + bool useHP = allowHP && UseMvHp(); + if (!useHP) + { + if ((Row & 1) != 0) + { + Row += (short)(Row > 0 ? -1 : 1); + } + + if ((Col & 1) != 0) + { + Col += (short)(Col > 0 ? 
-1 : 1); + } + } + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs new file mode 100644 index 00000000..fb25d18e --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs @@ -0,0 +1,8 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct Mv32 + { + public int Row; + public int Col; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs new file mode 100644 index 00000000..68a0b59a --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs @@ -0,0 +1,17 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum MvClassType + { + MvClass0 = 0, /* (0, 2] integer pel */ + MvClass1 = 1, /* (2, 4] integer pel */ + MvClass2 = 2, /* (4, 8] integer pel */ + MvClass3 = 3, /* (8, 16] integer pel */ + MvClass4 = 4, /* (16, 32] integer pel */ + MvClass5 = 5, /* (32, 64] integer pel */ + MvClass6 = 6, /* (64, 128] integer pel */ + MvClass7 = 7, /* (128, 256] integer pel */ + MvClass8 = 8, /* (256, 512] integer pel */ + MvClass9 = 9, /* (512, 1024] integer pel */ + MvClass10 = 10, /* (1024,2048] integer pel */ + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs new file mode 100644 index 00000000..a20cb6d0 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs @@ -0,0 +1,10 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum MvJointType + { + MvJointZero = 0, /* Zero vector */ + MvJointHnzvz = 1, /* Vert zero, hor nonzero */ + MvJointHzvnz = 2, /* Hor zero, vert nonzero */ + MvJointHnzvnz = 3, /* Both components nonzero */ + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs new file mode 100644 index 00000000..71949a09 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs @@ -0,0 +1,10 @@ +using Ryujinx.Common.Memory; + 
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct MvRef + { + public Array2<Mv> Mv; + public Array2<sbyte> RefFrame; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs new file mode 100644 index 00000000..096f9818 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs @@ -0,0 +1,12 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum PartitionType + { + PartitionNone, + PartitionHorz, + PartitionVert, + PartitionSplit, + PartitionTypes, + PartitionInvalid = PartitionTypes + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs new file mode 100644 index 00000000..790aa2a0 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs @@ -0,0 +1,9 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum PlaneType + { + Y = 0, + Uv = 1, + PlaneTypes + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs new file mode 100644 index 00000000..0d3b56f6 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs @@ -0,0 +1,14 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct Position + { + public int Row; + public int Col; + + public Position(int row, int col) + { + Row = row; + Col = col; + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs new file mode 100644 index 00000000..bbb9be9a --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs @@ -0,0 +1,21 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum PredictionMode + { + DcPred = 0, // Average of above and left pixels + VPred = 1, // Vertical + HPred = 2, // Horizontal + D45Pred = 3, // Directional 45 deg = round(arctan(1 / 1) * 180 / pi) + D135Pred = 4, // Directional 135 deg = 180 - 45 + D117Pred = 5, // 
Directional 117 deg = 180 - 63 + D153Pred = 6, // Directional 153 deg = 180 - 27 + D207Pred = 7, // Directional 207 deg = 180 + 27 + D63Pred = 8, // Directional 63 deg = round(arctan(2 / 1) * 180 / pi) + TmPred = 9, // True-motion + NearestMv = 10, + NearMv = 11, + ZeroMv = 12, + NewMv = 13, + MbModeCount = 14 + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs new file mode 100644 index 00000000..9942dd05 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs @@ -0,0 +1,8 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct RefBuffer + { + public Surface Buf; + public ScaleFactors Sf; + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs new file mode 100644 index 00000000..7cbf9f4e --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs @@ -0,0 +1,10 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum ReferenceMode + { + SingleReference = 0, + CompoundReference = 1, + ReferenceModeSelect = 2, + ReferenceModes = 3 + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs new file mode 100644 index 00000000..970f9680 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs @@ -0,0 +1,451 @@ +using Ryujinx.Common.Memory; +using System.Runtime.CompilerServices; +using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Convolve; +using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct ScaleFactors + { + private const int RefScaleShift = 14; + private const int RefNoScale = (1 << RefScaleShift); + private const int RefInvalidScale = -1; + + private unsafe delegate void ConvolveFn( + byte* src, + int srcStride, + byte* dst, + int dstStride, + Array8<short>[] filter, + int x0Q4, + int xStepQ4, + int y0Q4, + int yStepQ4, + int w, + 
int h); + + private unsafe delegate void HighbdConvolveFn( + ushort* src, + int srcStride, + ushort* dst, + int dstStride, + Array8<short>[] filter, + int x0Q4, + int xStepQ4, + int y0Q4, + int yStepQ4, + int w, + int h, + int bd); + + private static readonly unsafe ConvolveFn[][][] PredictX16Y16 = new ConvolveFn[][][] + { + new ConvolveFn[][] + { + new ConvolveFn[] + { + ConvolveCopy, + ConvolveAvg + }, + new ConvolveFn[] + { + Convolve8Vert, + Convolve8AvgVert + } + }, + new ConvolveFn[][] + { + new ConvolveFn[] + { + Convolve8Horiz, + Convolve8AvgHoriz + }, + new ConvolveFn[] + { + Convolve8, + Convolve8Avg + } + } + }; + + private static readonly unsafe ConvolveFn[][][] PredictX16 = new ConvolveFn[][][] + { + new ConvolveFn[][] + { + new ConvolveFn[] + { + ScaledVert, + ScaledAvgVert + }, + new ConvolveFn[] + { + ScaledVert, + ScaledAvgVert + } + }, + new ConvolveFn[][] + { + new ConvolveFn[] + { + Scaled2D, + ScaledAvg2D + }, + new ConvolveFn[] + { + Scaled2D, + ScaledAvg2D + } + } + }; + + private static readonly unsafe ConvolveFn[][][] PredictY16 = new ConvolveFn[][][] + { + new ConvolveFn[][] + { + new ConvolveFn[] + { + ScaledHoriz, + ScaledAvgHoriz + }, + new ConvolveFn[] + { + Scaled2D, + ScaledAvg2D + } + }, + new ConvolveFn[][] + { + new ConvolveFn[] + { + ScaledHoriz, + ScaledAvgHoriz + }, + new ConvolveFn[] + { + Scaled2D, + ScaledAvg2D + } + } + }; + + private static readonly unsafe ConvolveFn[][][] Predict = new ConvolveFn[][][] + { + new ConvolveFn[][] + { + new ConvolveFn[] + { + Scaled2D, + ScaledAvg2D + }, + new ConvolveFn[] + { + Scaled2D, + ScaledAvg2D + } + }, + new ConvolveFn[][] + { + new ConvolveFn[] + { + Scaled2D, + ScaledAvg2D + }, + new ConvolveFn[] + { + Scaled2D, + ScaledAvg2D + } + } + }; + + private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16Y16 = new HighbdConvolveFn[][][] + { + new HighbdConvolveFn[][] + { + new HighbdConvolveFn[] + { + HighbdConvolveCopy, + HighbdConvolveAvg + }, + new HighbdConvolveFn[] + { 
+ HighbdConvolve8Vert, + HighbdConvolve8AvgVert + } + }, + new HighbdConvolveFn[][] + { + new HighbdConvolveFn[] + { + HighbdConvolve8Horiz, + HighbdConvolve8AvgHoriz + }, + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + } + } + }; + + private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16 = new HighbdConvolveFn[][][] + { + new HighbdConvolveFn[][] + { + new HighbdConvolveFn[] + { + HighbdConvolve8Vert, + HighbdConvolve8AvgVert + }, + new HighbdConvolveFn[] + { + HighbdConvolve8Vert, + HighbdConvolve8AvgVert + } + }, + new HighbdConvolveFn[][] + { + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + }, + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + } + } + }; + + private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictY16 = new HighbdConvolveFn[][][] + { + new HighbdConvolveFn[][] + { + new HighbdConvolveFn[] + { + HighbdConvolve8Horiz, + HighbdConvolve8AvgHoriz + }, + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + } + }, + new HighbdConvolveFn[][] + { + new HighbdConvolveFn[] + { + HighbdConvolve8Horiz, + HighbdConvolve8AvgHoriz + }, + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + } + } + }; + + private static readonly unsafe HighbdConvolveFn[][][] HighbdPredict = new HighbdConvolveFn[][][] + { + new HighbdConvolveFn[][] + { + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + }, + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + } + }, + new HighbdConvolveFn[][] + { + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + }, + new HighbdConvolveFn[] + { + HighbdConvolve8, + HighbdConvolve8Avg + } + } + }; + + public int XScaleFP; // Horizontal fixed point scale factor + public int YScaleFP; // Vertical fixed point scale factor + public int XStepQ4; + public int YStepQ4; + + public int ScaleValueX(int val) + { + return IsScaled() ? 
ScaledX(val) : val; + } + + public int ScaleValueY(int val) + { + return IsScaled() ? ScaledY(val) : val; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe void InterPredict( + int horiz, + int vert, + int avg, + byte* src, + int srcStride, + byte* dst, + int dstStride, + int subpelX, + int subpelY, + int w, + int h, + Array8<short>[] kernel, + int xs, + int ys) + { + if (XStepQ4 == 16) + { + if (YStepQ4 == 16) + { + // No scaling in either direction. + PredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h); + } + else + { + // No scaling in x direction. Must always scale in the y direction. + PredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h); + } + } + else + { + if (YStepQ4 == 16) + { + // No scaling in the y direction. Must always scale in the x direction. + PredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h); + } + else + { + // Must always scale in both directions. + Predict[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h); + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe void HighbdInterPredict( + int horiz, + int vert, + int avg, + ushort* src, + int srcStride, + ushort* dst, + int dstStride, + int subpelX, + int subpelY, + int w, + int h, + Array8<short>[] kernel, + int xs, + int ys, + int bd) + { + if (XStepQ4 == 16) + { + if (YStepQ4 == 16) + { + // No scaling in either direction. + HighbdPredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd); + } + else + { + // No scaling in x direction. Must always scale in the y direction. + HighbdPredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd); + } + } + else + { + if (YStepQ4 == 16) + { + // No scaling in the y direction. Must always scale in the x direction. 
+ HighbdPredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd); + } + else + { + // Must always scale in both directions. + HighbdPredict[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd); + } + } + } + + private int ScaledX(int val) + { + return (int)((long)val * XScaleFP >> RefScaleShift); + } + + private int ScaledY(int val) + { + return (int)((long)val * YScaleFP >> RefScaleShift); + } + + private static int GetFixedPointScaleFactor(int otherSize, int thisSize) + { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. + return (otherSize << RefScaleShift) / thisSize; + } + + public Mv32 ScaleMv(ref Mv mv, int x, int y) + { + int xOffQ4 = ScaledX(x << SubpelBits) & SubpelMask; + int yOffQ4 = ScaledY(y << SubpelBits) & SubpelMask; + Mv32 res = new Mv32() + { + Row = ScaledY(mv.Row) + yOffQ4, + Col = ScaledX(mv.Col) + xOffQ4 + }; + return res; + } + + public bool IsValidScale() + { + return XScaleFP != RefInvalidScale && YScaleFP != RefInvalidScale; + } + + public bool IsScaled() + { + return IsValidScale() && (XScaleFP != RefNoScale || YScaleFP != RefNoScale); + } + + public static bool ValidRefFrameSize(int refWidth, int refHeight, int thisWidth, int thisHeight) + { + return 2 * thisWidth >= refWidth && + 2 * thisHeight >= refHeight && + thisWidth <= 16 * refWidth && + thisHeight <= 16 * refHeight; + } + + public void SetupScaleFactorsForFrame(int otherW, int otherH, int thisW, int thisH) + { + if (!ValidRefFrameSize(otherW, otherH, thisW, thisH)) + { + XScaleFP = RefInvalidScale; + YScaleFP = RefInvalidScale; + return; + } + + XScaleFP = GetFixedPointScaleFactor(otherW, thisW); + YScaleFP = GetFixedPointScaleFactor(otherH, thisH); + XStepQ4 = 
ScaledX(16); + YStepQ4 = ScaledY(16); + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs new file mode 100644 index 00000000..c3ea3fd8 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs @@ -0,0 +1,11 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum SegLvlFeatures + { + SegLvlAltQ = 0, // Use alternate Quantizer .... + SegLvlAltLf = 1, // Use alternate loop filter value... + SegLvlRefFrame = 2, // Optional Segment reference frame + SegLvlSkip = 3, // Optional Segment (0,0) + skip mode + SegLvlMax = 4 // Number of features supported + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs new file mode 100644 index 00000000..53d1f2cc --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs @@ -0,0 +1,71 @@ +using Ryujinx.Common.Memory; +using System.Diagnostics; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct Segmentation + { + private static readonly int[] SegFeatureDataSigned = new int[] { 1, 1, 0, 0 }; + private static readonly int[] SegFeatureDataMax = new int[] { QuantCommon.MaxQ, Vp9.LoopFilter.MaxLoopFilter, 3, 0 }; + + public bool Enabled; + public bool UpdateMap; + public byte UpdateData; + public byte AbsDelta; + public bool TemporalUpdate; + + public Array8<Array4<short>> FeatureData; + public Array8<uint> FeatureMask; + public int AqAvOffset; + + public static byte GetPredProbSegId(ref Array3<byte> segPredProbs, ref MacroBlockD xd) + { + return segPredProbs[xd.GetPredContextSegId()]; + } + + public void ClearAllSegFeatures() + { + MemoryMarshal.CreateSpan(ref FeatureData[0][0], 8 * 4).Fill(0); + MemoryMarshal.CreateSpan(ref FeatureMask[0], 8).Fill(0); + AqAvOffset = 0; + } + + internal void EnableSegFeature(int segmentId, SegLvlFeatures featureId) + { + FeatureMask[segmentId] |= 1u << 
(int)featureId; + } + + internal static int FeatureDataMax(SegLvlFeatures featureId) + { + return SegFeatureDataMax[(int)featureId]; + } + + internal static int IsSegFeatureSigned(SegLvlFeatures featureId) + { + return SegFeatureDataSigned[(int)featureId]; + } + + internal void SetSegData(int segmentId, SegLvlFeatures featureId, int segData) + { + Debug.Assert(segData <= SegFeatureDataMax[(int)featureId]); + if (segData < 0) + { + Debug.Assert(SegFeatureDataSigned[(int)featureId] != 0); + Debug.Assert(-segData <= SegFeatureDataMax[(int)featureId]); + } + + FeatureData[segmentId][(int)featureId] = (short)segData; + } + + internal int IsSegFeatureActive(int segmentId, SegLvlFeatures featureId) + { + return Enabled && (FeatureMask[segmentId] & (1 << (int)featureId)) != 0 ? 1 : 0; + } + + internal short GetSegData(int segmentId, SegLvlFeatures featureId) + { + return FeatureData[segmentId][(int)featureId]; + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs new file mode 100644 index 00000000..d5b51bc2 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs @@ -0,0 +1,82 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Video; +using System; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct Surface : ISurface + { + public ArrayPtr<byte> YBuffer; + public ArrayPtr<byte> UBuffer; + public ArrayPtr<byte> VBuffer; + + public unsafe Plane YPlane => new Plane((IntPtr)YBuffer.ToPointer(), YBuffer.Length); + public unsafe Plane UPlane => new Plane((IntPtr)UBuffer.ToPointer(), UBuffer.Length); + public unsafe Plane VPlane => new Plane((IntPtr)VBuffer.ToPointer(), VBuffer.Length); + + public FrameField Field => FrameField.Progressive; + + public int Width { get; } + public int Height { get; } + public int AlignedWidth { get; } + public int AlignedHeight { get; } + public int Stride { get; } + public int UvWidth { get; } + public int 
UvHeight { get; } + public int UvAlignedWidth { get; } + public int UvAlignedHeight { get; } + public int UvStride { get; } + public bool HighBd => false; + + private readonly IntPtr _pointer; + + public Surface(int width, int height) + { + const int border = 32; + const int ssX = 1; + const int ssY = 1; + const bool highbd = false; + + int alignedWidth = (width + 7) & ~7; + int alignedHeight = (height + 7) & ~7; + int yStride = ((alignedWidth + 2 * border) + 31) & ~31; + int yplaneSize = (alignedHeight + 2 * border) * yStride; + int uvWidth = alignedWidth >> ssX; + int uvHeight = alignedHeight >> ssY; + int uvStride = yStride >> ssX; + int uvBorderW = border >> ssX; + int uvBorderH = border >> ssY; + int uvplaneSize = (uvHeight + 2 * uvBorderH) * uvStride; + + int frameSize = (highbd ? 2 : 1) * (yplaneSize + 2 * uvplaneSize); + + IntPtr pointer = Marshal.AllocHGlobal(frameSize); + _pointer = pointer; + Width = width; + Height = height; + AlignedWidth = alignedWidth; + AlignedHeight = alignedHeight; + Stride = yStride; + UvWidth = (width + ssX) >> ssX; + UvHeight = (height + ssY) >> ssY; + UvAlignedWidth = uvWidth; + UvAlignedHeight = uvHeight; + UvStride = uvStride; + + ArrayPtr<byte> NewPlane(int start, int size, int border) + { + return new ArrayPtr<byte>(pointer + start + border, size - border); + } + + YBuffer = NewPlane(0, yplaneSize, (border * yStride) + border); + UBuffer = NewPlane(yplaneSize, uvplaneSize, (uvBorderH * uvStride) + uvBorderW); + VBuffer = NewPlane(yplaneSize + uvplaneSize, uvplaneSize, (uvBorderH * uvStride) + uvBorderW); + } + + public void Dispose() + { + Marshal.FreeHGlobal(_pointer); + } + } +} diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs new file mode 100644 index 00000000..67289c47 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs @@ -0,0 +1,85 @@ +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using System; +using System.Diagnostics; + +namespace 
Ryujinx.Graphics.Nvdec.Vp9.Types
{
    /// <summary>
    /// Row and column bounds of a single tile, plus static helpers that derive
    /// the allowed range of log2 tile-column counts for a frame.
    /// </summary>
    internal struct TileInfo
    {
        private const int MinTileWidthB64 = 4;
        private const int MaxTileWidthB64 = 64;

        public int MiRowStart;
        public int MiRowEnd;
        public int MiColStart;
        public int MiColEnd;

        /// <summary>
        /// Aligns <paramref name="nMis"/> with <c>BitUtils.AlignPowerOfTwo</c>
        /// using <c>Constants.MiBlockSizeLog2</c> as the alignment exponent.
        /// </summary>
        public static int MiColsAlignedToSb(int nMis) => BitUtils.AlignPowerOfTwo(nMis, Constants.MiBlockSizeLog2);

        /// <summary>
        /// Computes the start offset of tile <paramref name="idx"/> along one axis.
        /// The result never exceeds <paramref name="mis"/>.
        /// </summary>
        private static int GetTileOffset(int idx, int mis, int log2)
        {
            int blockCount = MiColsAlignedToSb(mis) >> Constants.MiBlockSizeLog2;
            int start = ((idx * blockCount) >> log2) << Constants.MiBlockSizeLog2;

            return Math.Min(start, mis);
        }

        /// <summary>Sets the row bounds for tile row <paramref name="row"/>.</summary>
        public void SetRow(ref Vp9Common cm, int row)
        {
            int log2Rows = cm.Log2TileRows;
            int totalRows = cm.MiRows;

            MiRowStart = GetTileOffset(row, totalRows, log2Rows);
            MiRowEnd = GetTileOffset(row + 1, totalRows, log2Rows);
        }

        /// <summary>Sets the column bounds for tile column <paramref name="col"/>.</summary>
        public void SetCol(ref Vp9Common cm, int col)
        {
            int log2Cols = cm.Log2TileCols;
            int totalCols = cm.MiCols;

            MiColStart = GetTileOffset(col, totalCols, log2Cols);
            MiColEnd = GetTileOffset(col + 1, totalCols, log2Cols);
        }

        /// <summary>Sets both the row and the column bounds of this tile.</summary>
        public void Init(ref Vp9Common cm, int row, int col)
        {
            SetRow(ref cm, row);
            SetCol(ref cm, col);
        }

        /// <summary>
        /// Tests whether the position reached by adding <paramref name="miPos"/> to
        /// (<paramref name="miRow"/>, <paramref name="miCol"/>) is inside the tile borders.
        /// Rows are checked against [0, <paramref name="miRows"/>), columns against
        /// [<see cref="MiColStart"/>, <see cref="MiColEnd"/>).
        /// </summary>
        public bool IsInside(int miCol, int miRow, int miRows, ref Position miPos)
        {
            int targetRow = miRow + miPos.Row;
            int targetCol = miCol + miPos.Col;

            return targetRow >= 0 &&
                   targetCol >= MiColStart &&
                   targetRow < miRows &&
                   targetCol < MiColEnd;
        }

        /// <summary>
        /// Smallest log2 value for which <see cref="MaxTileWidthB64"/> shifted left by it
        /// is no longer below <paramref name="sb64Cols"/>.
        /// </summary>
        private static int GetMinLog2TileCols(int sb64Cols)
        {
            int log2 = 0;

            for (; (MaxTileWidthB64 << log2) < sb64Cols; log2++)
            {
            }

            return log2;
        }

        /// <summary>
        /// Largest log2 value reached while <paramref name="sb64Cols"/> shifted right by the
        /// next value is still at least <see cref="MinTileWidthB64"/>.
        /// </summary>
        private static int GetMaxLog2TileCols(int sb64Cols)
        {
            int log2 = 0;

            // Probe the next exponent; stop once the tiles would become too narrow.
            while ((sb64Cols >> (log2 + 1)) >= MinTileWidthB64)
            {
                log2++;
            }

            return log2;
        }

        /// <summary>
        /// Writes the minimum and maximum log2 tile-column counts for a frame that is
        /// <paramref name="miCols"/> wide into the two by-reference arguments.
        /// </summary>
        public static void GetTileNBits(int miCols, ref int minLog2TileCols, ref int maxLog2TileCols)
        {
            int blockCols = MiColsAlignedToSb(miCols) >> Constants.MiBlockSizeLog2;

            minLog2TileCols = GetMinLog2TileCols(blockCols);
            maxLog2TileCols = GetMaxLog2TileCols(blockCols);

            Debug.Assert(minLog2TileCols <= maxLog2TileCols);
        }
    }
}
// diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs new file mode 100644 index 00000000..db914525 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs @@ -0,0 +1,12 @@
namespace Ryujinx.Graphics.Nvdec.Vp9.Types
{
    /// <summary>Transform-size modes.</summary>
    public enum TxMode
    {
        /// <summary>Only 4x4 transform used.</summary>
        Only4X4 = 0,

        /// <summary>Allow block transform size up to 8x8.</summary>
        Allow8X8 = 1,

        /// <summary>Allow block transform size up to 16x16.</summary>
        Allow16X16 = 2,

        /// <summary>Allow block transform size up to 32x32.</summary>
        Allow32X32 = 3,

        /// <summary>Transform specified for each block.</summary>
        TxModeSelect = 4,

        // Value equals the number of entries declared above.
        TxModes = 5,
    }
}
// diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs new file mode 100644 index 00000000..994deb2c --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs @@ -0,0 +1,11 @@
namespace Ryujinx.Graphics.Nvdec.Vp9.Types
{
    /// <summary>Transform block sizes.</summary>
    public enum TxSize
    {
        /// <summary>4x4 transform.</summary>
        Tx4x4 = 0,

        /// <summary>8x8 transform.</summary>
        Tx8x8 = 1,

        /// <summary>16x16 transform.</summary>
        Tx16x16 = 2,

        /// <summary>32x32 transform.</summary>
        Tx32x32 = 3,

        // Value equals the number of entries declared above.
        TxSizes = 4,
    }
}
// diff --git
// a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs new file mode 100644 index 00000000..dbf7251c --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs @@ -0,0 +1,11 @@
namespace Ryujinx.Graphics.Nvdec.Vp9.Types
{
    /// <summary>Combinations of vertical and horizontal transform kinds.</summary>
    internal enum TxType
    {
        /// <summary>DCT in both horizontal and vertical.</summary>
        DctDct = 0,

        /// <summary>ADST in vertical, DCT in horizontal.</summary>
        AdstDct = 1,

        /// <summary>DCT in vertical, ADST in horizontal.</summary>
        DctAdst = 2,

        /// <summary>ADST in both directions.</summary>
        AdstAdst = 3,

        // Value equals the number of entries declared above.
        TxTypes = 4,
    }
}
// diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs new file mode 100644 index 00000000..faadd349 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs @@ -0,0 +1,331 @@
using Ryujinx.Common.Memory;
using Ryujinx.Graphics.Nvdec.Vp9.Common;
using Ryujinx.Graphics.Video;

namespace Ryujinx.Graphics.Nvdec.Vp9.Types
{
    /// <summary>
    /// Frame-level decoder state: dimensions, quantizer settings, per-frame buffers
    /// and the helpers that allocate, release and initialise those buffers.
    /// </summary>
    internal struct Vp9Common
    {
        public MacroBlockD Mb;

        public ArrayPtr<TileWorkerData> TileWorkerData;

        public InternalErrorInfo Error;

        public int Width;
        public int Height;

        public int SubsamplingX;
        public int SubsamplingY;

        public ArrayPtr<MvRef> PrevFrameMvs;
        public ArrayPtr<MvRef> CurFrameMvs;

        public Array3<RefBuffer> FrameRefs;

        public FrameType FrameType;

        // Set when the frame is coded with intra modes only.
        public bool IntraOnly;

        public bool AllowHighPrecisionMv;

        // MBs and MbRows/MbCols are in 16-pixel units;
        // MiRows/MiCols are in ModeInfo (8-pixel) units.
        public int MBs;
        public int MbRows;
        public int MiRows;
        public int MbCols;
        public int MiCols;
        public int MiStride;

        /* Profile settings */
        public TxMode TxMode;

        public int BaseQindex;
        public int YDcDeltaQ;
        public int UvDcDeltaQ;
        public int UvAcDeltaQ;
        public Array8<Array2<short>> YDequant;
        public Array8<Array2<short>> UvDequant;

        /* One ModeInfo is allocated per macroblock, with an extra row on top and an
           extra column on the left to simplify prediction. */
        public ArrayPtr<ModeInfo> Mip; /* Base of allocated array */
        public ArrayPtr<ModeInfo> Mi;  /* Corresponds to upper left visible macroblock */

        public ArrayPtr<Ptr<ModeInfo>> MiGridBase;
        public ArrayPtr<Ptr<ModeInfo>> MiGridVisible;

        // Whether the previous frame's motion vectors are used for prediction.
        public bool UsePrevFrameMvs;

        // Persistent mb segment id map used in prediction.
        public int SegMapIdx;
        public int PrevSegMapIdx;

        public Array2<ArrayPtr<byte>> SegMapArray;
        public ArrayPtr<byte> LastFrameSegMap;
        public ArrayPtr<byte> CurrentFrameSegMap;

        public byte InterpFilter;

        public LoopFilterInfoN LfInfo;

        public Array4<sbyte> RefFrameSignBias; /* Two state 0, 1 */

        public LoopFilter Lf;
        public Segmentation Seg;

        // Context probabilities for reference frame prediction.
        public sbyte CompFixedRef;
        public Array2<sbyte> CompVarRef;
        public ReferenceMode ReferenceMode;

        public Ptr<Vp9EntropyProbs> Fc;
        public Ptr<Vp9BackwardUpdates> Counts;

        public int Log2TileCols;
        public int Log2TileRows;

        public ArrayPtr<sbyte> AboveSegContext;
        public ArrayPtr<sbyte> AboveContext;

        /// <summary>True for key frames and for frames flagged <see cref="IntraOnly"/>.</summary>
        public bool FrameIsIntraOnly() => FrameType == FrameType.KeyFrame || IntraOnly;

        /// <summary>
        /// True when any sign-bias entry at index 2..<c>Constants.RefsPerFrame</c>
        /// differs from the entry at index 1.
        /// </summary>
        public bool CompoundReferenceAllowed()
        {
            sbyte firstBias = RefFrameSignBias[1];

            for (int refIdx = 2; refIdx <= Constants.RefsPerFrame; refIdx++)
            {
                if (RefFrameSignBias[refIdx] != firstBias)
                {
                    return true;
                }
            }

            return false;
        }

        // `len` is in mi units; the result adds Constants.MiBlockSize to it.
        private static int CalcMiSize(int len) => len + Constants.MiBlockSize;

        /// <summary>
        /// Derives the mode-info and macroblock dimensions (and the mode-info stride)
        /// from the given <paramref name="width"/> and <paramref name="height"/>.
        /// </summary>
        public void SetMbMi(int width, int height)
        {
            int cols = BitUtils.AlignPowerOfTwo(width, Constants.MiSizeLog2) >> Constants.MiSizeLog2;
            int rows = BitUtils.AlignPowerOfTwo(height, Constants.MiSizeLog2) >> Constants.MiSizeLog2;

            MiCols = cols;
            MiRows = rows;
            MiStride = CalcMiSize(MiCols);

            // A macroblock covers two mode-info units per axis, rounded up.
            MbCols = (MiCols + 1) >> 1;
            MbRows = (MiRows + 1) >> 1;
            MBs = MbRows * MbCols;
        }

        /// <summary>
        /// Allocates one worker entry per tile, plus <paramref name="maxThreads"/> more
        /// entries when more than one thread is requested.
        /// </summary>
        public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows, int maxThreads)
        {
            int threadEntries = maxThreads > 1 ? maxThreads : 0;

            TileWorkerData = allocator.Allocate<TileWorkerData>(tileCols * tileRows + threadEntries);
        }

        /// <summary>Releases the buffer held in <see cref="TileWorkerData"/>.</summary>
        public void FreeTileWorkerData(MemoryAllocator allocator) => allocator.Free(TileWorkerData);

        // Allocates every ping-pong segmentation map and points the current/last
        // frame maps at entries 0 and 1 respectively.
        private void AllocSegMap(MemoryAllocator allocator, int segMapSize)
        {
            for (int mapIdx = 0; mapIdx < Constants.NumPingPongBuffers; mapIdx++)
            {
                SegMapArray[mapIdx] = allocator.Allocate<byte>(segMapSize);
            }

            // Init the index.
            SegMapIdx = 0;
            PrevSegMapIdx = 1;

            CurrentFrameSegMap = SegMapArray[SegMapIdx];
            LastFrameSegMap = SegMapArray[PrevSegMapIdx];
        }

        // Frees every ping-pong segmentation map and nulls all references to them.
        private void FreeSegMap(MemoryAllocator allocator)
        {
            for (int mapIdx = 0; mapIdx < Constants.NumPingPongBuffers; mapIdx++)
            {
                allocator.Free(SegMapArray[mapIdx]);
                SegMapArray[mapIdx] = ArrayPtr<byte>.Null;
            }

            CurrentFrameSegMap = ArrayPtr<byte>.Null;
            LastFrameSegMap = ArrayPtr<byte>.Null;
        }

        // Allocates the ModeInfo array and the matching grid of ModeInfo pointers.
        private void DecAllocMi(MemoryAllocator allocator, int miSize)
        {
            Mip = allocator.Allocate<ModeInfo>(miSize);
            MiGridBase = allocator.Allocate<Ptr<ModeInfo>>(miSize);
        }

        // Frees the ModeInfo array and its pointer grid, nulling both fields.
        private void DecFreeMi(MemoryAllocator allocator)
        {
            allocator.Free(Mip);
            Mip = ArrayPtr<ModeInfo>.Null;

            allocator.Free(MiGridBase);
            MiGridBase = ArrayPtr<Ptr<ModeInfo>>.Null;
        }

        /// <summary>
        /// Frees the buffers set up by <see cref="AllocContextBuffers"/> and nulls the fields.
        /// <see cref="PrevFrameMvs"/> is only released while <see cref="UsePrevFrameMvs"/> is set.
        /// </summary>
        public void FreeContextBuffers(MemoryAllocator allocator)
        {
            DecFreeMi(allocator);
            FreeSegMap(allocator);

            allocator.Free(AboveContext);
            AboveContext = ArrayPtr<sbyte>.Null;

            allocator.Free(AboveSegContext);
            AboveSegContext = ArrayPtr<sbyte>.Null;

            allocator.Free(Lf.Lfm);
            Lf.Lfm = ArrayPtr<LoopFilterMask>.Null;

            allocator.Free(CurFrameMvs);
            CurFrameMvs = ArrayPtr<MvRef>.Null;

            if (UsePrevFrameMvs)
            {
                allocator.Free(PrevFrameMvs);
                PrevFrameMvs = ArrayPtr<MvRef>.Null;
            }
        }

        private void AllocLoopFilter(MemoryAllocator allocator)
        {
            // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region. The
            // stride and rows are rounded up / truncated to a multiple of 8.
            int roundUp = Constants.MiBlockSize - 1;
            int lfmRows = (MiRows + roundUp) >> 3;

            Lf.LfmStride = (MiCols + roundUp) >> 3;
            Lf.Lfm = allocator.Allocate<LoopFilterMask>(lfmRows * Lf.LfmStride);
        }

        /// <summary>
        /// Recomputes the frame dimensions for <paramref name="width"/> x <paramref name="height"/>
        /// and allocates the mode-info, segmentation, above-context, loop-filter and
        /// motion-vector buffers sized from them.
        /// </summary>
        public void AllocContextBuffers(MemoryAllocator allocator, int width, int height)
        {
            SetMbMi(width, height);

            int miSize = MiStride * CalcMiSize(MiRows);
            if (miSize != 0)
            {
                DecAllocMi(allocator, miSize);
            }

            int miCount = MiRows * MiCols;
            if (miCount != 0)
            {
                // Create the segmentation map structure and set to 0.
                AllocSegMap(allocator, miCount);
            }

            if (MiCols != 0)
            {
                int alignedCols = TileInfo.MiColsAlignedToSb(MiCols);

                AboveContext = allocator.Allocate<sbyte>(2 * alignedCols * Constants.MaxMbPlane);
                AboveSegContext = allocator.Allocate<sbyte>(alignedCols);
            }

            AllocLoopFilter(allocator);

            CurFrameMvs = allocator.Allocate<MvRef>(miCount);

            // Using the same size as the current frame is fine here,
            // as this is never true when we have a resolution change.
            if (UsePrevFrameMvs)
            {
                PrevFrameMvs = allocator.Allocate<MvRef>(miCount);
            }
        }

        // Points Mi / MiGridVisible one row and one column into their base arrays and
        // fills the first MiStride * (MiRows + 1) grid entries with null pointers.
        private unsafe void DecSetupMi()
        {
            int visibleOffset = MiStride + 1;

            Mi = Mip.Slice(visibleOffset);
            MiGridVisible = MiGridBase.Slice(visibleOffset);

            MemoryUtil.Fill(MiGridBase.ToPointer(), Ptr<ModeInfo>.Null, MiStride * (MiRows + 1));
        }

        /// <summary>
        /// Sets up the mode-info views and, when a last-frame segmentation map exists,
        /// fills its first <c>MiRows * MiCols</c> bytes with zero.
        /// </summary>
        public unsafe void InitContextBuffers()
        {
            DecSetupMi();

            if (LastFrameSegMap.IsNull)
            {
                return;
            }

            MemoryUtil.Fill(LastFrameSegMap.ToPointer(), (byte)0, MiRows * MiCols);
        }

        // Selects the key-frame partition probabilities for intra-only frames and the
        // regular ones otherwise.
        private void SetPartitionProbs(ref MacroBlockD xd)
        {
            if (FrameIsIntraOnly())
            {
                xd.PartitionProbs = new ArrayPtr<Array3<byte>>(ref Fc.Value.KfPartitionProb[0], 16);
            }
            else
            {
                xd.PartitionProbs = new ArrayPtr<Array3<byte>>(ref Fc.Value.PartitionProb[0], 16);
            }
        }

        /// <summary>
        /// Wires <paramref name="xd"/> to this frame's state: per-plane coefficient buffer,
        /// above-context slices, dequantizer tables, entropy context, stride, error info
        /// and partition probabilities.
        /// </summary>
        internal void InitMacroBlockD(ref MacroBlockD xd, ArrayPtr<int> dqcoeff)
        {
            for (int plane = 0; plane < Constants.MaxMbPlane; plane++)
            {
                xd.Plane[plane].DqCoeff = dqcoeff;
                xd.AboveContext[plane] = AboveContext.Slice(plane * 2 * TileInfo.MiColsAlignedToSb(MiCols));

                // Plane 0 takes the Y table, every other plane the UV table.
                if (plane != 0)
                {
                    MemoryUtil.Copy(ref xd.Plane[plane].SegDequant, ref UvDequant);
                }
                else
                {
                    MemoryUtil.Copy(ref xd.Plane[plane].SegDequant, ref YDequant);
                }

                xd.Fc = new Ptr<Vp9EntropyProbs>(ref Fc.Value);
            }

            xd.AboveSegContext = AboveSegContext;
            xd.MiStride = MiStride;
            xd.ErrorInfo = new Ptr<InternalErrorInfo>(ref Error);

            SetPartitionProbs(ref xd);
        }

        /// <summary>
        /// Builds the Y and UV dequantizer values. With segmentation enabled every segment
        /// gets its own entry; otherwise only entry 0 is written, from <see cref="BaseQindex"/>.
        /// </summary>
        public void SetupSegmentationDequant()
        {
            const BitDepth bitDepth = BitDepth.Bits8; // TODO: Configurable

            bool perSegment = Seg.Enabled;

            // When segmentation is disabled, only the first value is used. The
            // remaining are don't cares.
            int entryCount = perSegment ? Constants.MaxSegments : 1;

            for (int segment = 0; segment < entryCount; segment++)
            {
                int qIndex = perSegment
                    ? QuantCommon.GetQIndex(ref Seg, segment, BaseQindex)
                    : BaseQindex;

                YDequant[segment][0] = QuantCommon.DcQuant(qIndex, YDcDeltaQ, bitDepth);
                YDequant[segment][1] = QuantCommon.AcQuant(qIndex, 0, bitDepth);
                UvDequant[segment][0] = QuantCommon.DcQuant(qIndex, UvDcDeltaQ, bitDepth);
                UvDequant[segment][1] = QuantCommon.AcQuant(qIndex, UvAcDeltaQ, bitDepth);
            }
        }

        /// <summary>
        /// Sets up the scale factors of every reference buffer from that buffer's size
        /// and this frame's <see cref="Width"/> and <see cref="Height"/>.
        /// </summary>
        public void SetupScaleFactors()
        {
            for (int refIdx = 0; refIdx < Constants.RefsPerFrame; refIdx++)
            {
                ref RefBuffer reference = ref FrameRefs[refIdx];

                reference.Sf.SetupScaleFactorsForFrame(
                    reference.Buf.Width,
                    reference.Buf.Height,
                    Width,
                    Height);
            }
        }
    }
}
