Diffstat (limited to 'ARMeilleure/Instructions')
-rw-r--r--  ARMeilleure/Instructions/InstEmitSimdArithmetic.cs | 512
-rw-r--r--  ARMeilleure/Instructions/InstEmitSimdCvt32.cs      |  17
-rw-r--r--  ARMeilleure/Instructions/InstEmitSimdHelper.cs     | 105
-rw-r--r--  ARMeilleure/Instructions/InstEmitSimdHelper32.cs   |   2
-rw-r--r--  ARMeilleure/Instructions/InstEmitSystem.cs         |   4
-rw-r--r--  ARMeilleure/Instructions/InstEmitSystem32.cs       |   2
6 files changed, 437 insertions, 205 deletions
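The diff below does two related things: it adds fused multiply-add fast paths, guarded by Optimizations.UseFma, to the scalar and vector FMADD/FMLA/FMLS/FMSUB/FNMADD/FNMSUB and FRECPS/FRSQRTS emitters, and it removes the per-instruction flush-to-zero toggling (EmitSseOrAvxHandleFzModeOpF) in favor of keeping the guest's FP mode loaded into the host MXCSR/FPCR (EnterArmFpMode/ExitArmFpMode). The sketch that follows is illustration only, not part of the patch: it shows why the Vfmadd231* forms are preferred when available. The 231 form accumulates into its first operand (a = n * m + a) with a single rounding, which matches ARM FMLA/FMADD semantics exactly, while the Mul+Add fallback rounds twice.

    using System;

    static class FmaSketch
    {
        // Single rounding, like context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, a, n, m).
        public static double Fused(double a, double n, double m) => Math.FusedMultiplyAdd(n, m, a);

        // Two roundings, like the X86Mulsd + X86Addsd fallback path.
        public static double Unfused(double a, double n, double m) => a + n * m;
    }

For most inputs the two agree; they differ when the intermediate n * m is not exactly representable, which is why the fallback can be off by one ULP relative to hardware FMLA.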
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index d0bb68e4..7e7f26b1 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -615,14 +615,11 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
+                        IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
 
-                            Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd;
+                        Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd;
 
-                            return context.AddIntrinsic(addInst, op1, op2);
-                        }, scalar: false, op1, op2);
+                        return context.AddIntrinsic(addInst, op1, op2);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -696,17 +693,33 @@ namespace ARMeilleure.Instructions
                 Operand n = GetVec(op.Rn);
                 Operand m = GetVec(op.Rm);
 
+                Operand res;
+
                 if (op.Size == 0)
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                    res = context.AddIntrinsic(Intrinsic.X86Addss, a, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, a, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addss, a, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper96(res));
                 }
                 else /* if (op.Size == 1) */
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                    res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, a, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper64(res));
                 }
@@ -730,10 +743,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                    }, scalar: true, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                 }, scalar: true);
             }
             else
@@ -755,10 +765,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                 }, scalar: false);
             }
             else
@@ -886,10 +893,7 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                        }, scalar: false, op1, op2);
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -914,10 +918,7 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                        }, scalar: false, op1, op2);
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -940,10 +941,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                    }, scalar: true, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                 }, scalar: true);
             }
             else
@@ -965,10 +963,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                 }, scalar: false);
             }
             else
@@ -1096,10 +1091,7 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                        }, scalar: false, op1, op2);
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -1124,10 +1116,7 @@ namespace ARMeilleure.Instructions
                 {
                     return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                     {
-                        return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                        {
-                            return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                        }, scalar: false, op1, op2);
+                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                     }, scalar: false, op1, op2);
                 });
             }
@@ -1146,6 +1135,37 @@ namespace ARMeilleure.Instructions
             {
                 InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlaSe);
             }
+            else if (Optimizations.UseFma)
+            {
+                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;
+
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper96(res));
+                }
+                else /* if (sizeF == 1) */
+                {
+                    int shuffleMask = op.Index | op.Index << 1;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper64(res));
+                }
+            }
             else
             {
                 EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
@@ -1171,11 +1191,19 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-
-                res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1186,9 +1214,15 @@ namespace ARMeilleure.Instructions
             }
             else /* if (sizeF == 1) */
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-
-                res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                }
 
                 context.Copy(d, res);
             }
@@ -1224,8 +1258,15 @@ namespace ARMeilleure.Instructions
 
                 Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
 
-                res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
-                res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, res);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1240,8 +1281,15 @@ namespace ARMeilleure.Instructions
 
                 Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
 
-                res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
-                res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, res);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                }
 
                 context.Copy(d, res);
             }
@@ -1261,6 +1309,37 @@ namespace ARMeilleure.Instructions
             {
                 InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlsSe);
             }
+            else if (Optimizations.UseFma)
+            {
+                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;
+
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper96(res));
+                }
+                else /* if (sizeF == 1) */
+                {
+                    int shuffleMask = op.Index | op.Index << 1;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper64(res));
+                }
+            }
             else
             {
                 EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
@@ -1286,11 +1365,19 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-
-                res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1301,9 +1388,15 @@ namespace ARMeilleure.Instructions
             }
             else /* if (sizeF == 1) */
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-
-                res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                }
 
                 context.Copy(d, res);
             }
@@ -1339,8 +1432,15 @@ namespace ARMeilleure.Instructions
 
                 Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
 
-                res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
-                res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, res);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1355,8 +1455,15 @@ namespace ARMeilleure.Instructions
 
                 Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
 
-                res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
-                res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, res);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                }
 
                 context.Copy(d, res);
             }
@@ -1385,17 +1492,33 @@ namespace ARMeilleure.Instructions
                 Operand n = GetVec(op.Rn);
                 Operand m = GetVec(op.Rm);
 
+                Operand res;
+
                 if (op.Size == 0)
                 {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                    res = context.AddIntrinsic(Intrinsic.X86Subss, a, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, a, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subss, a, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper96(res));
                 }
                 else /* if (op.Size == 1) */
                {
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                    res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, a, n, m);
+                    }
+                    else
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper64(res));
                 }
@@ -1669,25 +1792,39 @@ namespace ARMeilleure.Instructions
                 Operand n = GetVec(op.Rn);
                 Operand m = GetVec(op.Rm);
 
+                Operand res;
+
                 if (op.Size == 0)
                 {
-                    Operand mask = X86GetScalar(context, -0f);
-
-                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231ss, a, n, m);
+                    }
+                    else
+                    {
+                        Operand mask = X86GetScalar(context, -0f);
+                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                    res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper96(res));
                 }
                 else /* if (op.Size == 1) */
                 {
-                    Operand mask = X86GetScalar(context, -0d);
-
-                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231sd, a, n, m);
+                    }
+                    else
+                    {
+                        Operand mask = X86GetScalar(context, -0d);
+                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                    res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper64(res));
                 }
@@ -1716,25 +1853,39 @@ namespace ARMeilleure.Instructions
                 Operand n = GetVec(op.Rn);
                 Operand m = GetVec(op.Rm);
 
+                Operand res;
+
                 if (op.Size == 0)
                 {
-                    Operand mask = X86GetScalar(context, -0f);
-
-                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmsub231ss, a, n, m);
+                    }
+                    else
+                    {
+                        Operand mask = X86GetScalar(context, -0f);
+                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                    res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper96(res));
                 }
                 else /* if (op.Size == 1) */
                 {
-                    Operand mask = X86GetScalar(context, -0d);
-
-                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
+                    if (Optimizations.UseFma)
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Vfmsub231sd, a, n, m);
+                    }
+                    else
+                    {
+                        Operand mask = X86GetScalar(context, -0d);
+                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
 
-                    Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                    res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res);
+                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                        res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res);
+                    }
 
                     context.Copy(d, context.VectorZeroUpper64(res));
                 }
@@ -1830,13 +1981,22 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
                 Operand mask = X86GetScalar(context, 2f);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, mask, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF);
 
                 context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
@@ -1845,9 +2005,16 @@ namespace ARMeilleure.Instructions
             {
                 Operand mask = X86GetScalar(context, 2d);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, mask, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF);
 
                 context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
@@ -1877,14 +2044,23 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
                 Operand mask = X86GetAllElements(context, 2f);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-                res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, mask, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res);
+                res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1897,10 +2073,17 @@ namespace ARMeilleure.Instructions
             {
                 Operand mask = X86GetAllElements(context, 2d);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-                res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, mask, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res);
+                res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
 
                 context.Copy(GetVec(op.Rd), res);
             }
@@ -2113,20 +2296,32 @@ namespace ARMeilleure.Instructions
 
         public static void Frintx_S(ArmEmitterContext context)
        {
-            // TODO Arm64: Fast path. Should we set host FPCR?
-            EmitScalarUnaryOpF(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
+            {
+                InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintxS);
+            }
+            else
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                EmitScalarUnaryOpF(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         public static void Frintx_V(ArmEmitterContext context)
         {
-            // TODO Arm64: Fast path. Should we set host FPCR?
-            EmitVectorUnaryOpF(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
+            {
+                InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintxV);
+            }
+            else
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                EmitVectorUnaryOpF(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         public static void Frintz_S(ArmEmitterContext context)
@@ -2237,16 +2432,25 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
                 Operand maskHalf = X86GetScalar(context, 0.5f);
                 Operand maskThree = X86GetScalar(context, 3f);
                 Operand maskOneHalf = X86GetScalar(context, 1.5f);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, maskThree, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res);
-                res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res);
+                res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF);
 
                 context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
@@ -2257,10 +2461,17 @@ namespace ARMeilleure.Instructions
                 Operand maskHalf = X86GetScalar(context, 0.5d);
                 Operand maskThree = X86GetScalar(context, 3d);
                 Operand maskOneHalf = X86GetScalar(context, 1.5d);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, maskThree, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res);
-                res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res);
+                res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF);
 
                 context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
@@ -2290,15 +2501,24 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
                 Operand maskHalf = X86GetAllElements(context, 0.5f);
                 Operand maskThree = X86GetAllElements(context, 3f);
                 Operand maskOneHalf = X86GetAllElements(context, 1.5f);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, maskThree, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
                 res = context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF);
 
                 if (op.RegisterSize == RegisterSize.Simd64)
@@ -2315,9 +2535,16 @@ namespace ARMeilleure.Instructions
                 Operand maskHalf = X86GetAllElements(context, 0.5d);
                 Operand maskThree = X86GetAllElements(context, 3d);
                 Operand maskOneHalf = X86GetAllElements(context, 1.5d);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, maskThree, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
                 res = context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF);
 
                 context.Copy(GetVec(op.Rd), res);
             }
@@ -4728,53 +4955,6 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static Operand EmitSseOrAvxHandleFzModeOpF(
-            ArmEmitterContext context,
-            Func2I emit,
-            bool scalar,
-            Operand n = default,
-            Operand m = default)
-        {
-            Operand nCopy = n == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)) : n;
-            Operand mCopy = m == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)) : m;
-
-            EmitSseOrAvxEnterFtzAndDazModesOpF(context, out Operand isTrue);
-
-            Operand res = emit(nCopy, mCopy);
-
-            EmitSseOrAvxExitFtzAndDazModesOpF(context, isTrue);
-
-            if (n != default || m != default)
-            {
-                return res;
-            }
-
-            int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1;
-
-            if (sizeF == 0)
-            {
-                if (scalar)
-                {
-                    res = context.VectorZeroUpper96(res);
-                }
-                else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64)
-                {
-                    res = context.VectorZeroUpper64(res);
-                }
-            }
-            else /* if (sizeF == 1) */
-            {
-                if (scalar)
-                {
-                    res = context.VectorZeroUpper64(res);
-                }
-            }
-
-            context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);
-
-            return default;
-        }
-
         private static Operand EmitSse2VectorMaxMinOpF(ArmEmitterContext context, Operand n, Operand m, bool isMax)
         {
             IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
@@ -4834,10 +5014,7 @@ namespace ARMeilleure.Instructions
 
             Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
             {
-                return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                {
-                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
-                }, scalar: scalar, op1, op2);
+                return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
             }, scalar: scalar, nCopy, mCopy);
 
             if (n != default || m != default)
@@ -4872,10 +5049,7 @@ namespace ARMeilleure.Instructions
 
             Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
             {
-                return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                {
-                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
-                }, scalar: scalar, op1, op2);
+                return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
             }, scalar: scalar, nCopy, mCopy);
 
             if (n != default || m != default)
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
index 5fdc3b5a..33ae83df 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
@@ -356,9 +356,11 @@ namespace ARMeilleure.Instructions
                 ? typeof(SoftFloat64_16).GetMethod(nameof(SoftFloat64_16.FPConvert))
                 : typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert));
 
+            context.ExitArmFpMode();
             context.StoreToContext();
             Operand res = context.Call(method, src);
             context.LoadFromContext();
+            context.EnterArmFpMode();
 
             InsertScalar16(context, op.Vd, op.T, res);
         }
@@ -372,9 +374,11 @@ namespace ARMeilleure.Instructions
                 ? typeof(SoftFloat16_64).GetMethod(nameof(SoftFloat16_64.FPConvert))
                 : typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert));
 
+            context.ExitArmFpMode();
             context.StoreToContext();
             Operand res = context.Call(method, src);
             context.LoadFromContext();
+            context.EnterArmFpMode();
 
             InsertScalar(context, op.Vd, res);
         }
@@ -542,10 +546,17 @@ namespace ARMeilleure.Instructions
         // VRINTX (floating-point).
         public static void Vrintx_S(ArmEmitterContext context)
         {
-            EmitScalarUnaryOpF32(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FrintxS);
+            }
+            else
+            {
+                EmitScalarUnaryOpF32(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, OperandType type, bool signed)
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
index 0e7af794..c44c9b4d 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
@@ -1,3 +1,4 @@
+using ARMeilleure.CodeGen.X86;
 using ARMeilleure.Decoders;
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.State;
@@ -158,6 +159,75 @@ namespace ARMeilleure.Instructions
         };
         #endregion
 
+        public static void EnterArmFpMode(EmitterContext context, Func<FPState, Operand> getFpFlag)
+        {
+            if (Optimizations.UseSse2)
+            {
+                Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr);
+
+                Operand fzTrue = getFpFlag(FPState.FzFlag);
+                Operand r0True = getFpFlag(FPState.RMode0Flag);
+                Operand r1True = getFpFlag(FPState.RMode1Flag);
+
+                mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo)));
+
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(fzTrue, Const((int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Um | Mxcsr.Dm)), Const(0)));
+
+                // X86 round modes in order: nearest, negative, positive, zero
+                // ARM round modes in order: nearest, positive, negative, zero
+                // Read the bits backwards to correct this.
+
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r0True, Const((int)Mxcsr.Rhi), Const(0)));
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r1True, Const((int)Mxcsr.Rlo), Const(0)));
+
+                context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr);
+            }
+            else if (Optimizations.UseAdvSimd)
+            {
+                Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr);
+
+                Operand fzTrue = getFpFlag(FPState.FzFlag);
+                Operand r0True = getFpFlag(FPState.RMode0Flag);
+                Operand r1True = getFpFlag(FPState.RMode1Flag);
+
+                fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1)));
+
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(fzTrue, Const((int)FPCR.Fz), Const(0)));
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r0True, Const((int)FPCR.RMode0), Const(0)));
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r1True, Const((int)FPCR.RMode1), Const(0)));
+
+                context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr);
+
+                // TODO: Restore FPSR
+            }
+        }
+
+        public static void ExitArmFpMode(EmitterContext context, Action<FPState, Operand> setFpFlag)
+        {
+            if (Optimizations.UseSse2)
+            {
+                Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr);
+
+                // Unset round mode (to nearest) and ftz.
+                mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo)));
+
+                context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr);
+
+                // Status flags would be stored here if they were used.
+            }
+            else if (Optimizations.UseAdvSimd)
+            {
+                Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr);
+
+                // Unset round mode (to nearest) and fz.
+                fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1)));
+
+                context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr);
+
+                // TODO: Store FPSR
+            }
+        }
+
         public static int GetImmShl(OpCodeSimdShImm op)
         {
             return op.Imm - (8 << op.Size);
         }
@@ -465,9 +535,11 @@ namespace ARMeilleure.Instructions
                 ? typeof(SoftFloat32).GetMethod(name)
                 : typeof(SoftFloat64).GetMethod(name);
 
+            context.ExitArmFpMode();
             context.StoreToContext();
             Operand res = context.Call(info, callArgs);
             context.LoadFromContext();
+            context.EnterArmFpMode();
 
             return res;
         }
@@ -1358,39 +1430,6 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        [Flags]
-        public enum Mxcsr
-        {
-            Ftz = 1 << 15, // Flush To Zero.
-            Um = 1 << 11, // Underflow Mask.
-            Dm = 1 << 8, // Denormal Mask.
-            Daz = 1 << 6 // Denormals Are Zero.
-        }
-
-        public static void EmitSseOrAvxEnterFtzAndDazModesOpF(ArmEmitterContext context, out Operand isTrue)
-        {
-            isTrue = GetFpFlag(FPState.FzFlag);
-
-            Operand lblTrue = Label();
-            context.BranchIfFalse(lblTrue, isTrue);
-
-            context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrmb, Const((int)(Mxcsr.Ftz | Mxcsr.Um | Mxcsr.Dm | Mxcsr.Daz)));
-
-            context.MarkLabel(lblTrue);
-        }
-
-        public static void EmitSseOrAvxExitFtzAndDazModesOpF(ArmEmitterContext context, Operand isTrue = default)
-        {
-            isTrue = isTrue == default ? GetFpFlag(FPState.FzFlag) : isTrue;
-
-            Operand lblTrue = Label();
-            context.BranchIfFalse(lblTrue, isTrue);
-
-            context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrub, Const((int)(Mxcsr.Ftz | Mxcsr.Daz)));
-
-            context.MarkLabel(lblTrue);
-        }
-
         public enum CmpCondition
         {
             // Legacy Sse.
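A note on the rounding-mode translation in EnterArmFpMode above: ARM FPCR.RMode encodes 0 = nearest, 1 = +inf, 2 = -inf, 3 = zero, while x86 MXCSR.RC encodes 0 = nearest, 1 = -inf, 2 = +inf, 3 = zero, so the two bits trade places; that is why RMode0 maps onto Mxcsr.Rhi and RMode1 onto Mxcsr.Rlo. A minimal standalone illustration (this helper is hypothetical and not part of the patch):

    // ARM FPCR.RMode -> x86 MXCSR.RC: reverse the two bits.
    static int ArmRModeToX86Rc(int armRMode)
    {
        int bit0 = armRMode & 1;        // becomes the high RC bit (Rhi)
        int bit1 = (armRMode >> 1) & 1; // becomes the low RC bit (Rlo)
        return (bit0 << 1) | bit1;
    }

Nearest (0b00) and zero (0b11) are unchanged by the reversal, so only the two directed modes actually swap.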
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 84b01d05..36d27d42 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -1197,9 +1197,11 @@ namespace ARMeilleure.Instructions
             Array.Resize(ref callArgs, callArgs.Length + 1);
             callArgs[callArgs.Length - 1] = Const(1);
 
+            context.ExitArmFpMode();
             context.StoreToContext();
             Operand res = context.Call(info, callArgs);
             context.LoadFromContext();
+            context.EnterArmFpMode();
 
             return res;
         }
diff --git a/ARMeilleure/Instructions/InstEmitSystem.cs b/ARMeilleure/Instructions/InstEmitSystem.cs
index 1345bbf1..f668b83b 100644
--- a/ARMeilleure/Instructions/InstEmitSystem.cs
+++ b/ARMeilleure/Instructions/InstEmitSystem.cs
@@ -192,6 +192,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpcr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
 
         private static void EmitSetFpsr(ArmEmitterContext context)
@@ -210,6 +212,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpsr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
     }
 }
diff --git a/ARMeilleure/Instructions/InstEmitSystem32.cs b/ARMeilleure/Instructions/InstEmitSystem32.cs
index e07db412..2f6cf19d 100644
--- a/ARMeilleure/Instructions/InstEmitSystem32.cs
+++ b/ARMeilleure/Instructions/InstEmitSystem32.cs
@@ -321,6 +321,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpscr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
     }
 }
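The recurring hunk in InstEmitSimdCvt32.cs, InstEmitSimdHelper.cs and InstEmitSimdHelper32.cs brackets every call into the managed soft-float helpers the same way: because the guest's FP mode now stays loaded in the host MXCSR/FPCR, it must be dropped before running managed code (which assumes the default environment of round-to-nearest with no FTZ/DAZ) and reapplied afterwards. A sketch of the shape, assuming the context methods added by this patch (the wrapper itself is hypothetical):

    static Operand CallSoftFloat(ArmEmitterContext context, MethodInfo info, params Operand[] callArgs)
    {
        context.ExitArmFpMode();   // default rounding, no FTZ/DAZ for managed code
        context.StoreToContext();
        Operand res = context.Call(info, callArgs);
        context.LoadFromContext();
        context.EnterArmFpMode();  // reapply the guest's FPCR-derived mode
        return res;
    }

The matching context.UpdateArmFpMode() calls added in InstEmitSystem.cs and InstEmitSystem32.cs re-synchronize the host state whenever the guest writes FPCR or FPSCR.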
