aboutsummaryrefslogtreecommitdiff
path: root/ARMeilleure/Instructions
diff options
context:
space:
mode:
authorLDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>2021-01-04 23:45:54 +0100
committerGitHub <noreply@github.com>2021-01-04 23:45:54 +0100
commit430ba6da65a781196db7d723cc88710bb7f5caf8 (patch)
treea7ed55f638dde795f4270a324fa5338ffb80ee12 /ARMeilleure/Instructions
parenta03ab0c4a0bef3c168874dc2105c43c9051e0807 (diff)
CPU (A64): Add Pmull_V Inst. with Clmul fast path for the "1/2D -> 1Q" variant & Sse fast path and slow path for both the "8/16B -> 8H" and "1/2D -> 1Q" variants; with Test. (#1817)
* Add Pmull_V Sse fast path only, both "8/16B -> 8H" and "1/2D -> 1Q" variants; with Test. * Add Clmul fast path for the 128 bits variant. * Small optimisation (save 60 instructions) for the Sse fast path about the 128 bits variant. * Add slow path, both variants. Fix V128 Shl/Shr when shift = 0. * A32: Add Vmull_I P64 variant (slow path); not tested. * A32: Add Vmull_I_P8_P64 Test and fix P64 variant.
Diffstat (limited to 'ARMeilleure/Instructions')
-rw-r--r--ARMeilleure/Instructions/InstEmitSimdArithmetic.cs125
-rw-r--r--ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs36
-rw-r--r--ARMeilleure/Instructions/InstEmitSimdHelper32.cs22
-rw-r--r--ARMeilleure/Instructions/InstName.cs1
-rw-r--r--ARMeilleure/Instructions/SoftFallback.cs17
5 files changed, 178 insertions, 23 deletions
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index 3a97bc52..88be07bd 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -10,6 +10,7 @@ using System.Diagnostics;
using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper32;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.Instructions
@@ -1928,6 +1929,112 @@ namespace ARMeilleure.Instructions
}
}
        // PMULL / PMULL2 (vector): polynomial (carry-less) multiply long.
        // Handles both the "8/16B -> 8H" (op.Size == 0) and the "1/2D -> 1Q"
        // (op.Size == 3) variants. RegisterSize.Simd64 selects the lower input
        // halves (PMULL); Simd128 selects the upper halves (PMULL2).
        public static void Pmull_V(ArmEmitterContext context)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            if (Optimizations.UsePclmulqdq && op.Size == 3)
            {
                // Fast path: the 64x64 -> 128 variant maps directly onto x86 PCLMULQDQ.
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // imm8 selects which qword of each source is multiplied:
                // 0b0000_0000 = low qword of both (PMULL), 0b0001_0001 = high qword of both (PMULL2).
                int imm8 = op.RegisterSize == RegisterSize.Simd64 ? 0b0000_0000 : 0b0001_0001;

                Operand res = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, n, m, Const(imm8));

                context.Copy(GetVec(op.Rd), res);
            }
            else if (Optimizations.UseSse41)
            {
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // Move the selected source half into the low 64 bits and zero the rest.
                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    n = context.VectorZeroUpper64(n);
                    m = context.VectorZeroUpper64(m);
                }
                else /* if (op.RegisterSize == RegisterSize.Simd128) */
                {
                    n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
                    m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
                }

                Operand res = context.VectorZero();

                if (op.Size == 0)
                {
                    // 8x8 -> 16 per element: widen the bytes to 16-bit lanes, then
                    // XOR-accumulate one shifted partial product per bit of n.
                    n = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, n);
                    m = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, m);

                    for (int i = 0; i < 8; i++)
                    {
                        // Broadcast bit i of each n element across its 16-bit lane:
                        // shift the bit into the sign position, then arithmetic-shift right.
                        Operand mask = context.AddIntrinsic(Intrinsic.X86Psllw, n, Const(15 - i));
                        mask = context.AddIntrinsic(Intrinsic.X86Psraw, mask, Const(15));

                        // Partial product: (m << i) kept only where bit i of n is set.
                        Operand tmp = context.AddIntrinsic(Intrinsic.X86Psllw, m, Const(i));
                        tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);

                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
                    }
                }
                else /* if (op.Size == 3) */
                {
                    // 64x64 -> 128 without PCLMULQDQ: one 128-bit partial product per bit of n.
                    Operand zero = context.VectorZero();

                    for (int i = 0; i < 64; i++)
                    {
                        // Build a 128-bit mask that is all-ones iff bit i of n is set:
                        // duplicate the low qword of n into both lanes, isolate bit i
                        // in each lane, then negate (0 - bit) to smear it across the lane.
                        Operand mask = context.AddIntrinsic(Intrinsic.X86Movlhps, n, n);
                        mask = context.AddIntrinsic(Intrinsic.X86Psllq, mask, Const(63 - i));
                        mask = context.AddIntrinsic(Intrinsic.X86Psrlq, mask, Const(63));
                        mask = context.AddIntrinsic(Intrinsic.X86Psubq, zero, mask);

                        // Partial product: the full 128-bit value of m << i.
                        Operand tmp = EmitSse2Sll_128(context, m, i);
                        tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);

                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
                    }
                }

                context.Copy(GetVec(op.Rd), res);
            }
            else
            {
                // Slow path: element-wise polynomial multiply in the emitted IR /
                // a software-fallback call for the 64-bit variant.
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                Operand res;

                if (op.Size == 0)
                {
                    res = context.VectorZero();

                    // PMULL2 reads the upper 8 bytes of each source register.
                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 8;

                    for (int index = 0; index < 8; index++)
                    {
                        Operand ne = context.VectorExtract8(n, part + index);
                        Operand me = context.VectorExtract8(m, part + index);

                        Operand de = EmitPolynomialMultiply(context, ne, me, 8);

                        res = EmitVectorInsert(context, res, de, index, 1);
                    }
                }
                else /* if (op.Size == 3) */
                {
                    // PMULL2 reads the upper doubleword of each source register.
                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 1;

                    Operand ne = context.VectorExtract(OperandType.I64, n, part);
                    Operand me = context.VectorExtract(OperandType.I64, m, part);

                    res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
                }

                context.Copy(GetVec(op.Rd), res);
            }
        }
+
public static void Raddhn_V(ArmEmitterContext context)
{
EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true);
@@ -3690,5 +3797,23 @@ namespace ARMeilleure.Instructions
context.Copy(GetVec(op.Rd), res);
}
+
+ private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift)
+ {
+ // The upper part of op is assumed to be zero.
+ Debug.Assert(shift >= 0 && shift < 64);
+
+ if (shift == 0)
+ {
+ return op;
+ }
+
+ Operand high = context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8));
+ high = context.AddIntrinsic(Intrinsic.X86Psrlq, high, Const(64 - shift));
+
+ Operand low = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift));
+
+ return context.AddIntrinsic(Intrinsic.X86Por, high, low);
+ }
}
}
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index d35af209..0fc8c391 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -920,7 +920,19 @@ namespace ARMeilleure.Instructions
if (op.Polynomial)
{
- EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
+ if (op.Size == 0) // P8
+ {
+ EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
+ }
+ else /* if (op.Size == 2) // P64 */
+ {
+ Operand ne = context.VectorExtract(OperandType.I64, GetVec(op.Qn), op.Vn & 1);
+ Operand me = context.VectorExtract(OperandType.I64, GetVec(op.Qm), op.Vm & 1);
+
+ Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
+
+ context.Copy(GetVecA32(op.Qd), res);
+ }
}
else
{
@@ -1366,27 +1378,5 @@ namespace ARMeilleure.Instructions
EmitVectorBinaryOpSimd32(context, genericEmit);
}
}
-
- private static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
- {
- Debug.Assert(eSize <= 32);
-
- Operand result = eSize == 32 ? Const(0L) : Const(0);
-
- if (eSize == 32)
- {
- op1 = context.ZeroExtend32(OperandType.I64, op1);
- op2 = context.ZeroExtend32(OperandType.I64, op2);
- }
-
- for (int i = 0; i < eSize; i++)
- {
- Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
-
- result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
- }
-
- return result;
- }
}
}
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 39195057..59d3dc29 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -1167,5 +1167,27 @@ namespace ARMeilleure.Instructions
return res;
}
+
+ public static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
+ {
+ Debug.Assert(eSize <= 32);
+
+ Operand result = eSize == 32 ? Const(0L) : Const(0);
+
+ if (eSize == 32)
+ {
+ op1 = context.ZeroExtend32(OperandType.I64, op1);
+ op2 = context.ZeroExtend32(OperandType.I64, op2);
+ }
+
+ for (int i = 0; i < eSize; i++)
+ {
+ Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
+
+ result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
+ }
+
+ return result;
+ }
}
}
diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs
index 41bb51f3..a0ec9dc3 100644
--- a/ARMeilleure/Instructions/InstName.cs
+++ b/ARMeilleure/Instructions/InstName.cs
@@ -296,6 +296,7 @@ namespace ARMeilleure.Instructions
Orn_V,
Orr_V,
Orr_Vi,
+ Pmull_V,
Raddhn_V,
Rbit_V,
Rev16_V,
diff --git a/ARMeilleure/Instructions/SoftFallback.cs b/ARMeilleure/Instructions/SoftFallback.cs
index ef00fd9d..1d8fa2e2 100644
--- a/ARMeilleure/Instructions/SoftFallback.cs
+++ b/ARMeilleure/Instructions/SoftFallback.cs
@@ -1260,5 +1260,22 @@ namespace ARMeilleure.Instructions
: (uint)(value >> 32);
}
#endregion
+
+ public static V128 PolynomialMult64_128(ulong op1, ulong op2)
+ {
+ V128 result = V128.Zero;
+
+ V128 op2_128 = new V128(op2, 0);
+
+ for (int i = 0; i < 64; i++)
+ {
+ if (((op1 >> i) & 1) == 1)
+ {
+ result ^= op2_128 << i;
+ }
+ }
+
+ return result;
+ }
}
}