| author | LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> | 2020-12-17 20:43:41 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-12-17 20:43:41 +0100 |
| commit | 8a33e884f8f482e93e2b90380b158c1417cc50f8 (patch) | |
| tree | 65eabad1c3a78d2a3bd7bf7992413fa78056178f /ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs | |
| parent | b5c215111de665ef8d18b38405ac55e17996e30e (diff) | |
Fix Vnmls_S fast path (F64: losing input d value). Fix Vnmla_S & Vnmls_S slow paths (they were using fused instructions). Fix Vfma_V slow path not using StandardFPSCRValue(). (#1775)
* Fix Vnmls_S fast path (F64: losing input d value). Fix Vnmla_S & Vnmls_S slow paths (they were using the fused helpers); see the semantics sketch after this list.
Add Vfma_S & Vfms_S Fma fast paths.
Add Vfnma_S instruction with Fma/Sse fast paths and slow path.
Add Vfnms_S Sse fast path.
Add tests for the affected instructions.
Nits.
* InternalVersion = 1775
* Nits.
* Fix Vfma_V slow path not using StandardFPSCRValue().
* Nit: Fix Vfma_V order.
* Add Vfms_V Sse fast path and slow path.
* Add Vfma_V and Vfms_V tests.
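
As a quick reference for the fixes listed above, the sketch below restates the semantics involved. It is illustrative C# only, not code from the repository; the class and method names (`VnmReference`, `BrokenVnmlsF64`) are made up for the example, and `d`, `n`, `m` stand for the destination and source register values.

```csharp
static class VnmReference
{
    // VNMLA/VNMLS are non-fused: the product n * m is rounded to the destination
    // precision before the final add/subtract, which is what the corrected slow
    // paths (FPMul followed by FPSub/FPAdd on the negated d) compute.
    public static double Vnmla(double d, double n, double m) => -d - n * m; // FPSub(-d, FPMul(n, m))
    public static double Vnmls(double d, double n, double m) => -d + n * m; // FPAdd(-d, FPMul(n, m))

    // What the broken Vnmls_S F64 fast path effectively computed: the -0.0 sign
    // mask was XORed into the freshly computed product (res) instead of into d,
    // so d was overwritten and never reached the final addition.
    public static double BrokenVnmlsF64(double d, double n, double m)
    {
        double res = n * m;   // Mulsd
        d = -res;             // Xorpd applied to res (the bug), stored in d
        return d + res;       // Addsd -> 0 for any finite product; d is ignored
    }
}
```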
Diffstat (limited to 'ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs')
| -rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs | 102 |
1 file changed, 49 insertions, 53 deletions
```diff
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index 40289520..d35af209 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -252,44 +252,48 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static void Vfma_V(ArmEmitterContext context) // Fused.
+        public static void Vfma_S(ArmEmitterContext context) // Fused.
         {
             if (Optimizations.FastFP && Optimizations.UseFma)
             {
-                // Vectors contain elements that are 32-bits in length always. The only thing that will change is the number of elements in a vector.
-                // The 64-bit variant will never be used.
-                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps, Intrinsic.X86Vfmadd231pd);
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmadd231ss, Intrinsic.X86Vfmadd231sd);
+            }
+            else if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
             }
             else
             {
-                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
                     return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
                 });
             }
         }
 
-        public static void Vfma_S(ArmEmitterContext context) // Fused.
+        public static void Vfma_V(ArmEmitterContext context) // Fused.
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseFma)
             {
-                // TODO: Use FMA instruction set.
-                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps);
             }
             else
             {
-                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
+                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3);
                 });
             }
         }
 
         public static void Vfms_S(ArmEmitterContext context) // Fused.
         {
-            if (Optimizations.FastFP && Optimizations.UseSse2)
+            if (Optimizations.FastFP && Optimizations.UseFma)
+            {
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmadd231ss, Intrinsic.X86Vfnmadd231sd);
+            }
+            else if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                // TODO: Use FMA instruction set.
                 EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
             }
             else
@@ -301,17 +305,36 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Vfms_V(ArmEmitterContext context) // Fused.
+        {
+            if (Optimizations.FastFP && Optimizations.UseFma)
+            {
+                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfnmadd231ps);
+            }
+            else
+            {
+                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
+                {
+                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3);
+                });
+            }
+        }
+
         public static void Vfnma_S(ArmEmitterContext context) // Fused.
         {
             if (Optimizations.FastFP && Optimizations.UseFma)
             {
                 EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmsub231ss, Intrinsic.X86Vfnmsub231sd);
             }
+            else if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true);
+            }
             else
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), context.Negate(op2), op3);
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
                 });
             }
         }
@@ -322,11 +345,15 @@ namespace ARMeilleure.Instructions
             {
                 EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmsub231ss, Intrinsic.X86Vfmsub231sd);
             }
+            else if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true);
+            }
             else
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), op2, op3);
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
                 });
             }
         }
@@ -422,36 +449,21 @@ namespace ARMeilleure.Instructions
 
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarTernaryOpSimd32(context, (d, n, m) =>
-                {
-                    if ((op.Size & 1) == 0)
-                    {
-                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                        res = context.AddIntrinsic(Intrinsic.X86Addss, d, res);
-                        Operand mask = X86GetScalar(context, -0f);
-                        return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res);
-                    }
-                    else
-                    {
-                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                        res = context.AddIntrinsic(Intrinsic.X86Addsd, d, res);
-                        Operand mask = X86GetScalar(context, -0d);
-                        return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
-                    }
-                });
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true);
             }
             else if (Optimizations.FastFP)
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return context.Negate(context.Add(op1, context.Multiply(op2, op3)));
+                    return context.Subtract(context.Negate(op1), context.Multiply(op2, op3));
                 });
             }
             else
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
+                    Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), context.Negate(op1), res);
                 });
             }
         }
@@ -462,24 +474,7 @@ namespace ARMeilleure.Instructions
 
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitScalarTernaryOpSimd32(context, (d, n, m) =>
-                {
-                    if ((op.Size & 1) == 0)
-                    {
-                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                        Operand mask = X86GetScalar(context, -0f);
-                        d = context.AddIntrinsic(Intrinsic.X86Xorps, mask, d);
-                        return context.AddIntrinsic(Intrinsic.X86Addss, d, res);
-
-                    }
-                    else
-                    {
-                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                        Operand mask = X86GetScalar(context, -0d);
-                        d = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
-                        return context.AddIntrinsic(Intrinsic.X86Addsd, d, res);
-                    }
-                });
+                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true);
             }
             else if (Optimizations.FastFP)
             {
@@ -492,7 +487,8 @@ namespace ARMeilleure.Instructions
             {
                 EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                 {
-                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
+                    Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
+                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), context.Negate(op1), res);
                 });
            }
         }
```
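
A note on the slow-path changes above: because VNMLA/VNMLS round the multiply separately, routing them through the fused FPMulAdd helper with negated inputs (as the old code did) can differ from the architectural result by a rounding step. The snippet below is only a minimal illustration of that difference using standard .NET APIs, not ARMeilleure code; the class name and chosen constants are made up for the example.

```csharp
using System;

class FusedVsNonFused
{
    static void Main()
    {
        double n = 1.0 + 1.0 / (1 << 27);    // 1 + 2^-27 (exact in double)
        double m = n;                        // exact product: 1 + 2^-26 + 2^-54
        double d = -(1.0 + 1.0 / (1 << 26)); // -(1 + 2^-26) (exact in double)

        // Non-fused, the VNMLA definition: n * m is rounded first (the 2^-54
        // term is dropped), so the sum cancels to zero before negation.
        double nonFused = -(d + n * m);

        // Fused, roughly what the old slow path computed via FPMulAdd with
        // negated inputs: a single rounding at the end keeps the 2^-54 term.
        double fused = -Math.FusedMultiplyAdd(n, m, d);

        Console.WriteLine($"non-fused: {nonFused}, fused: {fused}");
        // non-fused: -0, fused: -5.551115123125783E-17
    }
}
```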
