about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: merry <MerryMage@users.noreply.github.com> 2020-06-05 11:58:27 +0100
committer: GitHub <noreply@github.com> 2020-06-05 20:58:27 +1000
commit: f8cd072b62808c8da06549807cc263003f0049b7 (patch)
tree: 630749551fb53adc3687edd63945361292f93927
parent: bcb7761eacaf9e40cc506648fec1eed58c23eff0 (diff)
Faster crc32 implementation (#1294)
* Add Pclmulqdq intrinsic
* Implement crc32 in terms of pclmulqdq
* Address PR comments
-rw-r--r-- ARMeilleure/CodeGen/X86/Assembler.cs | 8
-rw-r--r-- ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 1
-rw-r--r-- ARMeilleure/CodeGen/X86/X86Instruction.cs | 1
-rw-r--r-- ARMeilleure/Instructions/InstEmitHash.cs | 137
-rw-r--r-- ARMeilleure/IntermediateRepresentation/Intrinsic.cs | 1
-rw-r--r-- ARMeilleure/Optimizations.cs | 38
6 files changed, 160 insertions, 26 deletions
diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index de361677..5ad54289 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -165,6 +165,7 @@ namespace ARMeilleure.CodeGen.X86
Add(X86Instruction.Pavgb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe0, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Pavgw, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe3, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Pblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3810, InstructionFlags.Prefix66));
+ Add(X86Instruction.Pclmulqdq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a44, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Pcmpeqb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f74, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Pcmpeqd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f76, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Pcmpeqq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3829, InstructionFlags.Vex | InstructionFlags.Prefix66));
@@ -633,6 +634,13 @@ namespace ARMeilleure.CodeGen.X86
WriteInstruction(dest, source, type, X86Instruction.Or);
}
+ public void Pclmulqdq(Operand dest, Operand source, byte imm)
+ {
+ WriteInstruction(dest, null, source, X86Instruction.Pclmulqdq);
+
+ WriteByte(imm);
+ }
+
public void Pcmpeqw(Operand dest, Operand src1, Operand src2)
{
WriteInstruction(dest, src1, src2, X86Instruction.Pcmpeqw);
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index 5382e3ea..bc07c6b0 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -82,6 +82,7 @@ namespace ARMeilleure.CodeGen.X86
Add(Intrinsic.X86Pavgb, new IntrinsicInfo(X86Instruction.Pavgb, IntrinsicType.Binary));
Add(Intrinsic.X86Pavgw, new IntrinsicInfo(X86Instruction.Pavgw, IntrinsicType.Binary));
Add(Intrinsic.X86Pblendvb, new IntrinsicInfo(X86Instruction.Pblendvb, IntrinsicType.Ternary));
+ Add(Intrinsic.X86Pclmulqdq, new IntrinsicInfo(X86Instruction.Pclmulqdq, IntrinsicType.TernaryImm));
Add(Intrinsic.X86Pcmpeqb, new IntrinsicInfo(X86Instruction.Pcmpeqb, IntrinsicType.Binary));
Add(Intrinsic.X86Pcmpeqd, new IntrinsicInfo(X86Instruction.Pcmpeqd, IntrinsicType.Binary));
Add(Intrinsic.X86Pcmpeqq, new IntrinsicInfo(X86Instruction.Pcmpeqq, IntrinsicType.Binary));
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index e4682595..c3dffc62 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -98,6 +98,7 @@ namespace ARMeilleure.CodeGen.X86
Pavgb,
Pavgw,
Pblendvb,
+ Pclmulqdq,
Pcmpeqb,
Pcmpeqd,
Pcmpeqq,
diff --git a/ARMeilleure/Instructions/InstEmitHash.cs b/ARMeilleure/Instructions/InstEmitHash.cs
index 0be8458e..8a539666 100644
--- a/ARMeilleure/Instructions/InstEmitHash.cs
+++ b/ARMeilleure/Instructions/InstEmitHash.cs
@@ -1,9 +1,13 @@
+// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+
using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using static ARMeilleure.Instructions.InstEmitHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper;
+using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.Instructions
{
@@ -11,42 +15,159 @@ namespace ARMeilleure.Instructions
{
public static void Crc32b(ArmEmitterContext context)
{
- EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b));
+ if (Optimizations.UsePclmulqdq)
+ {
+ EmitCrc32Optimized(context, false, 8);
+ }
+ else
+ {
+ EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b));
+ }
}
public static void Crc32h(ArmEmitterContext context)
{
- EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h));
+ if (Optimizations.UsePclmulqdq)
+ {
+ EmitCrc32Optimized(context, false, 16);
+ }
+ else
+ {
+ EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h));
+ }
}
public static void Crc32w(ArmEmitterContext context)
{
- EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w));
+ if (Optimizations.UsePclmulqdq)
+ {
+ EmitCrc32Optimized(context, false, 32);
+ }
+ else
+ {
+ EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w));
+ }
}
public static void Crc32x(ArmEmitterContext context)
{
- EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x));
+ if (Optimizations.UsePclmulqdq)
+ {
+ EmitCrc32Optimized64(context, false);
+ }
+ else
+ {
+ EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x));
+ }
}
public static void Crc32cb(ArmEmitterContext context)
{
- EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb));
+ if (Optimizations.UsePclmulqdq)
+ {
+ EmitCrc32Optimized(context, true, 8);
+ }
+ else
+ {
+ EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb));
+ }
}
public static void Crc32ch(ArmEmitterContext context)
{
- EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch));
+ if (Optimizations.UsePclmulqdq)
+ {
+ EmitCrc32Optimized(context, true, 16);
+ }
+ else
+ {
+ EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch));
+ }
}
public static void Crc32cw(ArmEmitterContext context)
{
- EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw));
+ if (Optimizations.UsePclmulqdq)
+ {
+ EmitCrc32Optimized(context, true, 32);
+ }
+ else
+ {
+ EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw));
+ }
}
public static void Crc32cx(ArmEmitterContext context)
{
- EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx));
+ if (Optimizations.UsePclmulqdq)
+ {
+ EmitCrc32Optimized64(context, true);
+ }
+ else
+ {
+ EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx));
+ }
+ }
+
+ private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize)
+ {
+ OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
+
+ long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
+ long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
+
+ Operand crc = GetIntOrZR(context, op.Rn);
+ Operand data = GetIntOrZR(context, op.Rm);
+
+ crc = context.VectorInsert(context.VectorZero(), crc, 0);
+
+ switch (bitsize)
+ {
+ case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break;
+ case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break;
+ case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break;
+ }
+
+ Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
+ tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
+ tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
+ tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
+
+ if (bitsize < 32)
+ {
+ crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
+ tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
+ }
+
+ SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
+ }
+
+ private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli)
+ {
+ OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
+
+ long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
+ long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
+
+ Operand crc = GetIntOrZR(context, op.Rn);
+ Operand data = GetIntOrZR(context, op.Rm);
+
+ crc = context.VectorInsert(context.VectorZero(), crc, 0);
+ data = context.VectorInsert(context.VectorZero(), data, 0);
+
+ Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
+ Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));
+
+ tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
+ tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
+
+ tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
+ tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));
+
+ tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
+ tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
+
+ SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
}
private static void EmitCrc32Call(ArmEmitterContext context, Delegate dlg)
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index 28ec9f32..639ba7f9 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -71,6 +71,7 @@ namespace ARMeilleure.IntermediateRepresentation
X86Pavgb,
X86Pavgw,
X86Pblendvb,
+ X86Pclmulqdq,
X86Pcmpeqb,
X86Pcmpeqd,
X86Pcmpeqq,
diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs
index b486c5d2..fa06a410 100644
--- a/ARMeilleure/Optimizations.cs
+++ b/ARMeilleure/Optimizations.cs
@@ -8,15 +8,16 @@ namespace ARMeilleure
public static bool FastFP { get; set; } = true;
- public static bool UseSseIfAvailable { get; set; } = true;
- public static bool UseSse2IfAvailable { get; set; } = true;
- public static bool UseSse3IfAvailable { get; set; } = true;
- public static bool UseSsse3IfAvailable { get; set; } = true;
- public static bool UseSse41IfAvailable { get; set; } = true;
- public static bool UseSse42IfAvailable { get; set; } = true;
- public static bool UsePopCntIfAvailable { get; set; } = true;
- public static bool UseAvxIfAvailable { get; set; } = true;
- public static bool UseAesniIfAvailable { get; set; } = true;
+ public static bool UseSseIfAvailable { get; set; } = true;
+ public static bool UseSse2IfAvailable { get; set; } = true;
+ public static bool UseSse3IfAvailable { get; set; } = true;
+ public static bool UseSsse3IfAvailable { get; set; } = true;
+ public static bool UseSse41IfAvailable { get; set; } = true;
+ public static bool UseSse42IfAvailable { get; set; } = true;
+ public static bool UsePopCntIfAvailable { get; set; } = true;
+ public static bool UseAvxIfAvailable { get; set; } = true;
+ public static bool UseAesniIfAvailable { get; set; } = true;
+ public static bool UsePclmulqdqIfAvailable { get; set; } = true;
public static bool ForceLegacySse
{
@@ -24,14 +25,15 @@ namespace ARMeilleure
set => HardwareCapabilities.ForceLegacySse = value;
}
- internal static bool UseSse => UseSseIfAvailable && HardwareCapabilities.SupportsSse;
- internal static bool UseSse2 => UseSse2IfAvailable && HardwareCapabilities.SupportsSse2;
- internal static bool UseSse3 => UseSse3IfAvailable && HardwareCapabilities.SupportsSse3;
- internal static bool UseSsse3 => UseSsse3IfAvailable && HardwareCapabilities.SupportsSsse3;
- internal static bool UseSse41 => UseSse41IfAvailable && HardwareCapabilities.SupportsSse41;
- internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42;
- internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
- internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
- internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni;
+ internal static bool UseSse => UseSseIfAvailable && HardwareCapabilities.SupportsSse;
+ internal static bool UseSse2 => UseSse2IfAvailable && HardwareCapabilities.SupportsSse2;
+ internal static bool UseSse3 => UseSse3IfAvailable && HardwareCapabilities.SupportsSse3;
+ internal static bool UseSsse3 => UseSsse3IfAvailable && HardwareCapabilities.SupportsSsse3;
+ internal static bool UseSse41 => UseSse41IfAvailable && HardwareCapabilities.SupportsSse41;
+ internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42;
+ internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
+ internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
+ internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni;
+ internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
}
}
\ No newline at end of file