aboutsummaryrefslogtreecommitdiff
path: root/ChocolArm64/Instruction/AInstEmitSimdMove.cs
diff options
context:
space:
mode:
authorgdkchan <gab.dark.100@gmail.com>2018-09-26 23:30:21 -0300
committerGitHub <noreply@github.com>2018-09-26 23:30:21 -0300
commit0b52ee66272b673cecebcf9ae9baaf03899e0ee3 (patch)
treea004a0f7215e4c371ee99c187c291a0e11a0365e /ChocolArm64/Instruction/AInstEmitSimdMove.cs
parent40282da93a45c90b3d5a696199ee353a1ae8c730 (diff)
Optimize BIC, BSL, BIT, BIF, XTN, ZIP, DUP (Gp), FMADD (Scalar) and FCVT (Scalar) using SSE intrinsics (#405)
* Optimize BIC, BSL, BIT, BIF, XTN, ZIP, DUP (Gp), FMADD (Scalar) and FCVT (Scalar) using SSE intrinsics, some CQ improvements * Remove useless space * Address PR feedback * Revert EmitVectorZero32_128 changes
Diffstat (limited to 'ChocolArm64/Instruction/AInstEmitSimdMove.cs')
-rw-r--r--ChocolArm64/Instruction/AInstEmitSimdMove.cs200
1 files changed, 164 insertions, 36 deletions
diff --git a/ChocolArm64/Instruction/AInstEmitSimdMove.cs b/ChocolArm64/Instruction/AInstEmitSimdMove.cs
index 3bf1e463..94097f48 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdMove.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdMove.cs
@@ -3,6 +3,7 @@ using ChocolArm64.State;
using ChocolArm64.Translation;
using System;
using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
using static ChocolArm64.Instruction.AInstEmitSimdHelper;
@@ -14,19 +15,44 @@ namespace ChocolArm64.Instruction
{
AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;
- int Bytes = Op.GetBitsCount() >> 3;
- int Elems = Bytes >> Op.Size;
-
- for (int Index = 0; Index < Elems; Index++)
+ if (AOptimizations.UseSse2)
{
Context.EmitLdintzr(Op.Rn);
- EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
- }
+ switch (Op.Size)
+ {
+ case 0: Context.Emit(OpCodes.Conv_U1); break;
+ case 1: Context.Emit(OpCodes.Conv_U2); break;
+ case 2: Context.Emit(OpCodes.Conv_U4); break;
+ }
- if (Op.RegisterSize == ARegisterSize.SIMD64)
+ Type[] Types = new Type[] { UIntTypesPerSizeLog2[Op.Size] };
+
+ Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), Types));
+
+ EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+ if (Op.RegisterSize == ARegisterSize.SIMD64)
+ {
+ EmitVectorZeroUpper(Context, Op.Rd);
+ }
+ }
+ else
{
- EmitVectorZeroUpper(Context, Op.Rd);
+ int Bytes = Op.GetBitsCount() >> 3;
+ int Elems = Bytes >> Op.Size;
+
+ for (int Index = 0; Index < Elems; Index++)
+ {
+ Context.EmitLdintzr(Op.Rn);
+
+ EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
+ }
+
+ if (Op.RegisterSize == ARegisterSize.SIMD64)
+ {
+ EmitVectorZeroUpper(Context, Op.Rd);
+ }
}
}
@@ -295,25 +321,91 @@ namespace ChocolArm64.Instruction
int Part = Op.RegisterSize == ARegisterSize.SIMD128 ? Elems : 0;
- if (Part != 0)
+ if (AOptimizations.UseSse41 && Op.Size < 2)
{
- Context.EmitLdvec(Op.Rd);
- Context.EmitStvectmp();
- }
+ void EmitZeroVector()
+ {
+ switch (Op.Size)
+ {
+ case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16Zero)); break;
+ case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32Zero)); break;
+ }
+ }
- for (int Index = 0; Index < Elems; Index++)
- {
- EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1);
+ //For XTN, first operand is source, second operand is 0.
+ //For XTN2, first operand is 0, second operand is source.
+ if (Part != 0)
+ {
+ EmitZeroVector();
+ }
- EmitVectorInsertTmp(Context, Part + Index, Op.Size);
- }
+ EmitLdvecWithSignedCast(Context, Op.Rn, Op.Size + 1);
- Context.EmitLdvectmp();
- Context.EmitStvec(Op.Rd);
+ //Set mask to discard the upper half of the wide elements.
+ switch (Op.Size)
+ {
+ case 0: Context.EmitLdc_I4(0x00ff); break;
+ case 1: Context.EmitLdc_I4(0x0000ffff); break;
+ }
+
+ Type WideType = IntTypesPerSizeLog2[Op.Size + 1];
+
+ Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), new Type[] { WideType }));
+
+ WideType = VectorIntTypesPerSizeLog2[Op.Size + 1];
+
+ Type[] WideTypes = new Type[] { WideType, WideType };
+
+ Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), WideTypes));
+
+ if (Part == 0)
+ {
+ EmitZeroVector();
+ }
+
+ //Pack values with unsigned saturation, the unsigned saturation shouldn't
+ //saturate anything since the upper bits were masked off.
+ Type SseType = Op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+ Context.EmitCall(SseType.GetMethod(nameof(Sse2.PackUnsignedSaturate), WideTypes));
+
+ if (Part != 0)
+ {
+ //For XTN2, we additionally need to discard the upper bits
+ //of the target register and OR the result with it.
+ EmitVectorZeroUpper(Context, Op.Rd);
- if (Part == 0)
+ EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+ Type NarrowType = VectorUIntTypesPerSizeLog2[Op.Size];
+
+ Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), new Type[] { NarrowType, NarrowType }));
+ }
+
+ EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+ }
+ else
{
- EmitVectorZeroUpper(Context, Op.Rd);
+ if (Part != 0)
+ {
+ Context.EmitLdvec(Op.Rd);
+ Context.EmitStvectmp();
+ }
+
+ for (int Index = 0; Index < Elems; Index++)
+ {
+ EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1);
+
+ EmitVectorInsertTmp(Context, Part + Index, Op.Size);
+ }
+
+ Context.EmitLdvectmp();
+ Context.EmitStvec(Op.Rd);
+
+ if (Part == 0)
+ {
+ EmitVectorZeroUpper(Context, Op.Rd);
+ }
}
}
@@ -394,28 +486,64 @@ namespace ChocolArm64.Instruction
{
AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
- int Words = Op.GetBitsCount() >> 4;
- int Pairs = Words >> Op.Size;
+ if (AOptimizations.UseSse2)
+ {
+ EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size);
+ EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size);
- int Base = Part != 0 ? Pairs : 0;
+ Type[] Types = new Type[]
+ {
+ VectorUIntTypesPerSizeLog2[Op.Size],
+ VectorUIntTypesPerSizeLog2[Op.Size]
+ };
- for (int Index = 0; Index < Pairs; Index++)
- {
- int Idx = Index << 1;
+ string Name = Part == 0 || (Part != 0 && Op.RegisterSize == ARegisterSize.SIMD64)
+ ? nameof(Sse2.UnpackLow)
+ : nameof(Sse2.UnpackHigh);
- EmitVectorExtractZx(Context, Op.Rn, Base + Index, Op.Size);
- EmitVectorExtractZx(Context, Op.Rm, Base + Index, Op.Size);
+ Context.EmitCall(typeof(Sse2).GetMethod(Name, Types));
- EmitVectorInsertTmp(Context, Idx + 1, Op.Size);
- EmitVectorInsertTmp(Context, Idx, Op.Size);
- }
+ if (Op.RegisterSize == ARegisterSize.SIMD64 && Part != 0)
+ {
+ Context.EmitLdc_I4(8);
- Context.EmitLdvectmp();
- Context.EmitStvec(Op.Rd);
+ Type[] ShTypes = new Type[] { VectorUIntTypesPerSizeLog2[Op.Size], typeof(byte) };
- if (Op.RegisterSize == ARegisterSize.SIMD64)
+ Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), ShTypes));
+ }
+
+ EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
+
+ if (Op.RegisterSize == ARegisterSize.SIMD64 && Part == 0)
+ {
+ EmitVectorZeroUpper(Context, Op.Rd);
+ }
+ }
+ else
{
- EmitVectorZeroUpper(Context, Op.Rd);
+ int Words = Op.GetBitsCount() >> 4;
+ int Pairs = Words >> Op.Size;
+
+ int Base = Part != 0 ? Pairs : 0;
+
+ for (int Index = 0; Index < Pairs; Index++)
+ {
+ int Idx = Index << 1;
+
+ EmitVectorExtractZx(Context, Op.Rn, Base + Index, Op.Size);
+ EmitVectorExtractZx(Context, Op.Rm, Base + Index, Op.Size);
+
+ EmitVectorInsertTmp(Context, Idx + 1, Op.Size);
+ EmitVectorInsertTmp(Context, Idx, Op.Size);
+ }
+
+ Context.EmitLdvectmp();
+ Context.EmitStvec(Op.Rd);
+
+ if (Op.RegisterSize == ARegisterSize.SIMD64)
+ {
+ EmitVectorZeroUpper(Context, Op.Rd);
+ }
}
}
}