diff options
| author | LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> | 2020-07-13 13:08:47 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-07-13 21:08:47 +1000 |
| commit | a804db6eed016a8a1f152c2837fc7b65e50f02df (patch) | |
| tree | 5ac490b42ba8fc2c22038bb784de4ea3fcda352f /ARMeilleure | |
| parent | d7044b10a253dae31b9a0041a432e3a7adce59f6 (diff) | |
Add Fmax/minv_V & S/Ushl_S Inst.s with Tests. Fix Maxps/d & Minps/d d… (#1335)
* Add Fmax/minv_V & S/Ushl_S Inst.s with Tests. Fix Maxps/d & Minps/d double zero sign handling. Allows better handling of NaNs.
* Optimized EmitSse2VectorIsNaNOpF() for multiple uses per opF.
Diffstat (limited to 'ARMeilleure')
| -rw-r--r-- | ARMeilleure/Decoders/OpCodeTable.cs | 6 | ||||
| -rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdArithmetic.cs | 356 | ||||
| -rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs | 4 | ||||
| -rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdHelper.cs | 45 | ||||
| -rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdShift.cs | 98 | ||||
| -rw-r--r-- | ARMeilleure/Instructions/InstName.cs | 6 | ||||
| -rw-r--r-- | ARMeilleure/Translation/PTC/Ptc.cs | 4 |
7 files changed, 419 insertions, 100 deletions
diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs index c1632d46..b98fcab1 100644 --- a/ARMeilleure/Decoders/OpCodeTable.cs +++ b/ARMeilleure/Decoders/OpCodeTable.cs @@ -332,14 +332,18 @@ namespace ARMeilleure.Decoders SetA64("0>0011100<1xxxxx111101xxxxxxxxxx", InstName.Fmax_V, InstEmit.Fmax_V, typeof(OpCodeSimdReg)); SetA64("000111100x1xxxxx011010xxxxxxxxxx", InstName.Fmaxnm_S, InstEmit.Fmaxnm_S, typeof(OpCodeSimdReg)); SetA64("0>0011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnm_V, InstEmit.Fmaxnm_V, typeof(OpCodeSimdReg)); + SetA64("0>1011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnmp_V, InstEmit.Fmaxnmp_V, typeof(OpCodeSimdReg)); SetA64("0110111000110000110010xxxxxxxxxx", InstName.Fmaxnmv_V, InstEmit.Fmaxnmv_V, typeof(OpCodeSimd)); SetA64("0>1011100<1xxxxx111101xxxxxxxxxx", InstName.Fmaxp_V, InstEmit.Fmaxp_V, typeof(OpCodeSimdReg)); + SetA64("0110111000110000111110xxxxxxxxxx", InstName.Fmaxv_V, InstEmit.Fmaxv_V, typeof(OpCodeSimd)); SetA64("000111100x1xxxxx010110xxxxxxxxxx", InstName.Fmin_S, InstEmit.Fmin_S, typeof(OpCodeSimdReg)); SetA64("0>0011101<1xxxxx111101xxxxxxxxxx", InstName.Fmin_V, InstEmit.Fmin_V, typeof(OpCodeSimdReg)); SetA64("000111100x1xxxxx011110xxxxxxxxxx", InstName.Fminnm_S, InstEmit.Fminnm_S, typeof(OpCodeSimdReg)); SetA64("0>0011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnm_V, InstEmit.Fminnm_V, typeof(OpCodeSimdReg)); + SetA64("0>1011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnmp_V, InstEmit.Fminnmp_V, typeof(OpCodeSimdReg)); SetA64("0110111010110000110010xxxxxxxxxx", InstName.Fminnmv_V, InstEmit.Fminnmv_V, typeof(OpCodeSimd)); SetA64("0>1011101<1xxxxx111101xxxxxxxxxx", InstName.Fminp_V, InstEmit.Fminp_V, typeof(OpCodeSimdReg)); + SetA64("0110111010110000111110xxxxxxxxxx", InstName.Fminv_V, InstEmit.Fminv_V, typeof(OpCodeSimd)); SetA64("010111111xxxxxxx0001x0xxxxxxxxxx", InstName.Fmla_Se, InstEmit.Fmla_Se, typeof(OpCodeSimdRegElemF)); SetA64("0>0011100<1xxxxx110011xxxxxxxxxx", InstName.Fmla_V, InstEmit.Fmla_V, typeof(OpCodeSimdReg)); SetA64("0>0011111<xxxxxx0001x0xxxxxxxxxx", InstName.Fmla_Ve, InstEmit.Fmla_Ve, typeof(OpCodeSimdRegElemF)); @@ -529,6 +533,7 @@ namespace ARMeilleure.Decoders SetA64("0101111101xxxxxx001101xxxxxxxxxx", InstName.Srsra_S, InstEmit.Srsra_S, typeof(OpCodeSimdShImm)); SetA64("0x00111100>>>xxx001101xxxxxxxxxx", InstName.Srsra_V, InstEmit.Srsra_V, typeof(OpCodeSimdShImm)); SetA64("0100111101xxxxxx001101xxxxxxxxxx", InstName.Srsra_V, InstEmit.Srsra_V, typeof(OpCodeSimdShImm)); + SetA64("01011110111xxxxx010001xxxxxxxxxx", InstName.Sshl_S, InstEmit.Sshl_S, typeof(OpCodeSimdReg)); SetA64("0>001110<<1xxxxx010001xxxxxxxxxx", InstName.Sshl_V, InstEmit.Sshl_V, typeof(OpCodeSimdReg)); SetA64("0x00111100>>>xxx101001xxxxxxxxxx", InstName.Sshll_V, InstEmit.Sshll_V, typeof(OpCodeSimdShImm)); SetA64("0101111101xxxxxx000001xxxxxxxxxx", InstName.Sshr_S, InstEmit.Sshr_S, typeof(OpCodeSimdShImm)); @@ -611,6 +616,7 @@ namespace ARMeilleure.Decoders SetA64("0111111101xxxxxx001101xxxxxxxxxx", InstName.Ursra_S, InstEmit.Ursra_S, typeof(OpCodeSimdShImm)); SetA64("0x10111100>>>xxx001101xxxxxxxxxx", InstName.Ursra_V, InstEmit.Ursra_V, typeof(OpCodeSimdShImm)); SetA64("0110111101xxxxxx001101xxxxxxxxxx", InstName.Ursra_V, InstEmit.Ursra_V, typeof(OpCodeSimdShImm)); + SetA64("01111110111xxxxx010001xxxxxxxxxx", InstName.Ushl_S, InstEmit.Ushl_S, typeof(OpCodeSimdReg)); SetA64("0>101110<<1xxxxx010001xxxxxxxxxx", InstName.Ushl_V, InstEmit.Ushl_V, typeof(OpCodeSimdReg)); SetA64("0x10111100>>>xxx101001xxxxxxxxxx", InstName.Ushll_V, InstEmit.Ushll_V, typeof(OpCodeSimdShImm)); SetA64("0111111101xxxxxx000001xxxxxxxxxx", InstName.Ushr_S, InstEmit.Ushr_S, typeof(OpCodeSimdShImm)); diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs index b3041aac..0d417f70 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs @@ -382,7 +382,14 @@ namespace ARMeilleure.Instructions { if (Optimizations.FastFP && Optimizations.UseSse2) { - EmitSse2VectorPairwiseOpF(context, Intrinsic.X86Addps, Intrinsic.X86Addpd); + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd; + + return context.AddIntrinsic(addInst, op1, op2); + }); } else { @@ -468,9 +475,12 @@ namespace ARMeilleure.Instructions public static void Fmax_S(ArmEmitterContext context) { - if (Optimizations.FastFP && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse41) { - EmitScalarBinaryOpF(context, Intrinsic.X86Maxss, Intrinsic.X86Maxsd); + EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); + }, scalar: true); } else { @@ -483,9 +493,12 @@ namespace ARMeilleure.Instructions public static void Fmax_V(ArmEmitterContext context) { - if (Optimizations.FastFP && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse41) { - EmitVectorBinaryOpF(context, Intrinsic.X86Maxps, Intrinsic.X86Maxpd); + EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); + }, scalar: false); } else { @@ -526,19 +539,53 @@ namespace ARMeilleure.Instructions } } + public static void Fmaxnmp_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false, op1, op2); + }); + } + else + { + EmitVectorPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2); + }); + } + } + public static void Fmaxnmv_V(ArmEmitterContext context) { - EmitVectorAcrossVectorOpF(context, (op1, op2) => + if (Optimizations.FastFP && Optimizations.UseSse41) { - return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMaxNum)), op1, op2); - }); + EmitSse2VectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false, op1, op2); + }); + } + else + { + EmitVectorAcrossVectorOpF(context, (op1, op2) => + { + return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMaxNum)), op1, op2); + }); + } } public static void Fmaxp_V(ArmEmitterContext context) { - if (Optimizations.FastFP && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse41) { - EmitSse2VectorPairwiseOpF(context, Intrinsic.X86Maxps, Intrinsic.X86Maxpd); + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); + }, scalar: false, op1, op2); + }); } else { @@ -549,11 +596,35 @@ namespace ARMeilleure.Instructions } } + public static void Fmaxv_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); + }, scalar: false, op1, op2); + }); + } + else + { + EmitVectorAcrossVectorOpF(context, (op1, op2) => + { + return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMax)), op1, op2); + }); + } + } + public static void Fmin_S(ArmEmitterContext context) { - if (Optimizations.FastFP && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse41) { - EmitScalarBinaryOpF(context, Intrinsic.X86Minss, Intrinsic.X86Minsd); + EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); + }, scalar: true); } else { @@ -566,9 +637,12 @@ namespace ARMeilleure.Instructions public static void Fmin_V(ArmEmitterContext context) { - if (Optimizations.FastFP && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse41) { - EmitVectorBinaryOpF(context, Intrinsic.X86Minps, Intrinsic.X86Minpd); + EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); + }, scalar: false); } else { @@ -609,19 +683,53 @@ namespace ARMeilleure.Instructions } } + public static void Fminnmp_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false, op1, op2); + }); + } + else + { + EmitVectorPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2); + }); + } + } + public static void Fminnmv_V(ArmEmitterContext context) { - EmitVectorAcrossVectorOpF(context, (op1, op2) => + if (Optimizations.FastFP && Optimizations.UseSse41) { - return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMinNum)), op1, op2); - }); + EmitSse2VectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false, op1, op2); + }); + } + else + { + EmitVectorAcrossVectorOpF(context, (op1, op2) => + { + return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMinNum)), op1, op2); + }); + } } public static void Fminp_V(ArmEmitterContext context) { - if (Optimizations.FastFP && Optimizations.UseSse2) + if (Optimizations.FastFP && Optimizations.UseSse41) { - EmitSse2VectorPairwiseOpF(context, Intrinsic.X86Minps, Intrinsic.X86Minpd); + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); + }, scalar: false, op1, op2); + }); } else { @@ -632,6 +740,27 @@ namespace ARMeilleure.Instructions } } + public static void Fminv_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); + }, scalar: false, op1, op2); + }); + } + else + { + EmitVectorAcrossVectorOpF(context, (op1, op2) => + { + return context.Call(typeof(SoftFloat32).GetMethod(nameof(SoftFloat32.FPMin)), op1, op2); + }); + } + } + public static void Fmla_Se(ArmEmitterContext context) // Fused. { EmitScalarTernaryOpByElemF(context, (op1, op2, op3) => @@ -3111,7 +3240,12 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), res); } - public static Operand EmitSse2VectorIsQNaNOpF(ArmEmitterContext context, Operand opF) + public static void EmitSse2VectorIsNaNOpF( + ArmEmitterContext context, + Operand opF, + out Operand qNaNMask, + out Operand sNaNMask, + bool? isQNaN = null) { IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; @@ -3126,7 +3260,8 @@ namespace ARMeilleure.Instructions Operand mask2 = context.AddIntrinsic(Intrinsic.X86Pand, opF, qMask); mask2 = context.AddIntrinsic(Intrinsic.X86Cmpps, mask2, qMask, Const((int)CmpCondition.Equal)); - return context.AddIntrinsic(Intrinsic.X86Andps, mask1, mask2); + qNaNMask = isQNaN == null || (bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andps, mask2, mask1) : null; + sNaNMask = isQNaN == null || !(bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andnps, mask2, mask1) : null; } else /* if ((op.Size & 1) == 1) */ { @@ -3139,67 +3274,202 @@ namespace ARMeilleure.Instructions Operand mask2 = context.AddIntrinsic(Intrinsic.X86Pand, opF, qMask); mask2 = context.AddIntrinsic(Intrinsic.X86Cmppd, mask2, qMask, Const((int)CmpCondition.Equal)); - return context.AddIntrinsic(Intrinsic.X86Andpd, mask1, mask2); + qNaNMask = isQNaN == null || (bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andpd, mask2, mask1) : null; + sNaNMask = isQNaN == null || !(bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andnpd, mask2, mask1) : null; } } - private static void EmitSse41MaxMinNumOpF(ArmEmitterContext context, bool isMaxNum, bool scalar) + public static Operand EmitSse41ProcessNaNsOpF( + ArmEmitterContext context, + Func2I emit, + bool scalar, + Operand n = null, + Operand m = null) { - OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + Operand nCopy = n ?? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)); + Operand mCopy = m ?? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)); - Operand d = GetVec(op.Rd); - Operand n = GetVec(op.Rn); - Operand m = GetVec(op.Rm); + EmitSse2VectorIsNaNOpF(context, nCopy, out Operand nQNaNMask, out Operand nSNaNMask); + EmitSse2VectorIsNaNOpF(context, mCopy, out _, out Operand mSNaNMask, isQNaN: false); - Operand nNum = context.Copy(n); - Operand mNum = context.Copy(m); + int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1; - Operand nQNaNMask = EmitSse2VectorIsQNaNOpF(context, nNum); - Operand mQNaNMask = EmitSse2VectorIsQNaNOpF(context, mNum); + if (sizeF == 0) + { + const int QBit = 22; - int sizeF = op.Size & 1; + Operand qMask = scalar ? X86GetScalar(context, 1 << QBit) : X86GetAllElements(context, 1 << QBit); + + Operand resNaNMask = context.AddIntrinsic(Intrinsic.X86Pandn, mSNaNMask, nQNaNMask); + resNaNMask = context.AddIntrinsic(Intrinsic.X86Por, resNaNMask, nSNaNMask); + + Operand resNaN = context.AddIntrinsic(Intrinsic.X86Blendvps, mCopy, nCopy, resNaNMask); + resNaN = context.AddIntrinsic(Intrinsic.X86Por, resNaN, qMask); + + Operand resMask = context.AddIntrinsic(Intrinsic.X86Cmpps, nCopy, mCopy, Const((int)CmpCondition.OrderedQ)); + + Operand res = context.AddIntrinsic(Intrinsic.X86Blendvps, resNaN, emit(nCopy, mCopy), resMask); + + if (n != null || m != null) + { + return res; + } + + if (scalar) + { + res = context.VectorZeroUpper96(res); + } + else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); + + return null; + } + else /* if (sizeF == 1) */ + { + const int QBit = 51; + + Operand qMask = scalar ? X86GetScalar(context, 1L << QBit) : X86GetAllElements(context, 1L << QBit); + + Operand resNaNMask = context.AddIntrinsic(Intrinsic.X86Pandn, mSNaNMask, nQNaNMask); + resNaNMask = context.AddIntrinsic(Intrinsic.X86Por, resNaNMask, nSNaNMask); + + Operand resNaN = context.AddIntrinsic(Intrinsic.X86Blendvpd, mCopy, nCopy, resNaNMask); + resNaN = context.AddIntrinsic(Intrinsic.X86Por, resNaN, qMask); + + Operand resMask = context.AddIntrinsic(Intrinsic.X86Cmppd, nCopy, mCopy, Const((int)CmpCondition.OrderedQ)); + + Operand res = context.AddIntrinsic(Intrinsic.X86Blendvpd, resNaN, emit(nCopy, mCopy), resMask); + + if (n != null || m != null) + { + return res; + } + + if (scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); + + return null; + } + } + + private static Operand EmitSse2VectorMaxMinOpF(ArmEmitterContext context, Operand n, Operand m, bool isMax) + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + if ((op.Size & 1) == 0) + { + Operand mask = X86GetAllElements(context, -0f); + + Operand res = context.AddIntrinsic(isMax ? Intrinsic.X86Maxps : Intrinsic.X86Minps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Andnps, mask, res); + + Operand resSign = context.AddIntrinsic(isMax ? Intrinsic.X86Pand : Intrinsic.X86Por, n, m); + resSign = context.AddIntrinsic(Intrinsic.X86Andps, mask, resSign); + + return context.AddIntrinsic(Intrinsic.X86Por, res, resSign); + } + else /* if ((op.Size & 1) == 1) */ + { + Operand mask = X86GetAllElements(context, -0d); + + Operand res = context.AddIntrinsic(isMax ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Andnpd, mask, res); + + Operand resSign = context.AddIntrinsic(isMax ? Intrinsic.X86Pand : Intrinsic.X86Por, n, m); + resSign = context.AddIntrinsic(Intrinsic.X86Andpd, mask, resSign); + + return context.AddIntrinsic(Intrinsic.X86Por, res, resSign); + } + } + + private static Operand EmitSse41MaxMinNumOpF( + ArmEmitterContext context, + bool isMaxNum, + bool scalar, + Operand n = null, + Operand m = null) + { + Operand nCopy = n ?? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)); + Operand mCopy = m ?? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)); + + EmitSse2VectorIsNaNOpF(context, nCopy, out Operand nQNaNMask, out _, isQNaN: true); + EmitSse2VectorIsNaNOpF(context, mCopy, out Operand mQNaNMask, out _, isQNaN: true); + + int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1; if (sizeF == 0) { - Operand negInfMask = X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity); + Operand negInfMask = scalar + ? X86GetScalar (context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity) + : X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity); Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnps, mQNaNMask, nQNaNMask); Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnps, nQNaNMask, mQNaNMask); - nNum = context.AddIntrinsic(Intrinsic.X86Blendvps, nNum, negInfMask, nMask); - mNum = context.AddIntrinsic(Intrinsic.X86Blendvps, mNum, negInfMask, mMask); + nCopy = context.AddIntrinsic(Intrinsic.X86Blendvps, nCopy, negInfMask, nMask); + mCopy = context.AddIntrinsic(Intrinsic.X86Blendvps, mCopy, negInfMask, mMask); + + Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum); + }, scalar: scalar, nCopy, mCopy); - Operand res = context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxps : Intrinsic.X86Minps, nNum, mNum); + if (n != null || m != null) + { + return res; + } if (scalar) { res = context.VectorZeroUpper96(res); } - else if (op.RegisterSize == RegisterSize.Simd64) + else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64) { res = context.VectorZeroUpper64(res); } - context.Copy(d, res); + context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); + + return null; } else /* if (sizeF == 1) */ { - Operand negInfMask = X86GetAllElements(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity); + Operand negInfMask = scalar + ? X86GetScalar (context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity) + : X86GetAllElements(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity); Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnpd, mQNaNMask, nQNaNMask); Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnpd, nQNaNMask, mQNaNMask); - nNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, nNum, negInfMask, nMask); - mNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, mNum, negInfMask, mMask); + nCopy = context.AddIntrinsic(Intrinsic.X86Blendvpd, nCopy, negInfMask, nMask); + mCopy = context.AddIntrinsic(Intrinsic.X86Blendvpd, mCopy, negInfMask, mMask); - Operand res = context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, nNum, mNum); + Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum); + }, scalar: scalar, nCopy, mCopy); + + if (n != null || m != null) + { + return res; + } if (scalar) { res = context.VectorZeroUpper64(res); } - context.Copy(d, res); + context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); + + return null; } } diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs index 82f57d63..eb86ac9e 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs @@ -1200,8 +1200,8 @@ namespace ARMeilleure.Instructions Operand nNum = context.Copy(n); Operand mNum = context.Copy(m); - Operand nQNaNMask = InstEmit.EmitSse2VectorIsQNaNOpF(context, nNum); - Operand mQNaNMask = InstEmit.EmitSse2VectorIsQNaNOpF(context, mNum); + InstEmit.EmitSse2VectorIsNaNOpF(context, nNum, out Operand nQNaNMask, out _, isQNaN: true); + InstEmit.EmitSse2VectorIsNaNOpF(context, mNum, out Operand mQNaNMask, out _, isQNaN: true); int sizeF = op.Size & 1; diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs index 912c2260..69e79a6d 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs @@ -1095,6 +1095,29 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), d); } + public static void EmitSse2VectorAcrossVectorOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Debug.Assert((op.Size & 1) == 0 && op.RegisterSize == RegisterSize.Simd128); + + const int sm0 = 0 << 6 | 0 << 4 | 0 << 2 | 0 << 0; + const int sm1 = 1 << 6 | 1 << 4 | 1 << 2 | 1 << 0; + const int sm2 = 2 << 6 | 2 << 4 | 2 << 2 | 2 << 0; + const int sm3 = 3 << 6 | 3 << 4 | 3 << 2 | 3 << 0; + + Operand nCopy = context.Copy(GetVec(op.Rn)); + + Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm0)); + Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm1)); + Operand part2 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm2)); + Operand part3 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm3)); + + Operand res = emit(emit(part0, part1), emit(part2, part3)); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + public static void EmitVectorPairwiseOpF(ArmEmitterContext context, Func2I emit) { OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; @@ -1124,12 +1147,12 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), res); } - public static void EmitSse2VectorPairwiseOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + public static void EmitSse2VectorPairwiseOpF(ArmEmitterContext context, Func2I emit) { OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; - Operand n = GetVec(op.Rn); - Operand m = GetVec(op.Rm); + Operand nCopy = context.Copy(GetVec(op.Rn)); + Operand mCopy = context.Copy(GetVec(op.Rm)); int sizeF = op.Size & 1; @@ -1137,32 +1160,32 @@ namespace ARMeilleure.Instructions { if (op.RegisterSize == RegisterSize.Simd64) { - Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m); + Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, nCopy, mCopy); Operand zero = context.VectorZero(); Operand part0 = context.AddIntrinsic(Intrinsic.X86Movlhps, unpck, zero); Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, unpck); - context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst32, part0, part1)); + context.Copy(GetVec(op.Rd), emit(part0, part1)); } else /* if (op.RegisterSize == RegisterSize.Simd128) */ { const int sm0 = 2 << 6 | 0 << 4 | 2 << 2 | 0 << 0; const int sm1 = 3 << 6 | 1 << 4 | 3 << 2 | 1 << 0; - Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm0)); - Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm1)); + Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, mCopy, Const(sm0)); + Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, mCopy, Const(sm1)); - context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst32, part0, part1)); + context.Copy(GetVec(op.Rd), emit(part0, part1)); } } else /* if (sizeF == 1) */ { - Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, n, m); - Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, n, m); + Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, nCopy, mCopy); + Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, nCopy, mCopy); - context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst64, part0, part1)); + context.Copy(GetVec(op.Rd), emit(part0, part1)); } } diff --git a/ARMeilleure/Instructions/InstEmitSimdShift.cs b/ARMeilleure/Instructions/InstEmitSimdShift.cs index 0b3d85ae..62363fde 100644 --- a/ARMeilleure/Instructions/InstEmitSimdShift.cs +++ b/ARMeilleure/Instructions/InstEmitSimdShift.cs @@ -391,25 +391,14 @@ namespace ARMeilleure.Instructions } } - public static void Sshl_V(ArmEmitterContext context) + public static void Sshl_S(ArmEmitterContext context) { - OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; - - Operand res = context.VectorZero(); - - int elems = op.GetBytesCount() >> op.Size; - - for (int index = 0; index < elems; index++) - { - Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size); - Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size); - - Operand e = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShlReg)), ne, me, Const(0), Const(op.Size)); - - res = EmitVectorInsert(context, res, e, index, op.Size); - } + EmitSshlOrUshl(context, signed: true, scalar: true); + } - context.Copy(GetVec(op.Rd), res); + public static void Sshl_V(ArmEmitterContext context) + { + EmitSshlOrUshl(context, signed: true, scalar: false); } public static void Sshll_V(ArmEmitterContext context) @@ -686,25 +675,14 @@ namespace ARMeilleure.Instructions } } - public static void Ushl_V(ArmEmitterContext context) + public static void Ushl_S(ArmEmitterContext context) { - OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; - - Operand res = context.VectorZero(); - - int elems = op.GetBytesCount() >> op.Size; - - for (int index = 0; index < elems; index++) - { - Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); - Operand me = EmitVectorExtractSx(context, op.Rm, index << op.Size, 0); - - Operand e = EmitUnsignedShlRegOp(context, ne, context.ConvertI64ToI32(me), op.Size); - - res = EmitVectorInsert(context, res, e, index, op.Size); - } + EmitSshlOrUshl(context, signed: false, scalar: true); + } - context.Copy(GetVec(op.Rd), res); + public static void Ushl_V(ArmEmitterContext context) + { + EmitSshlOrUshl(context, signed: false, scalar: false); } public static void Ushll_V(ArmEmitterContext context) @@ -894,7 +872,7 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), res); } - private static Operand EmitUnsignedShlRegOp(ArmEmitterContext context, Operand op, Operand shiftLsB, int size) + private static Operand EmitShlRegOp(ArmEmitterContext context, Operand op, Operand shiftLsB, int size, bool signed) { Debug.Assert(op.Type == OperandType.I64); Debug.Assert(shiftLsB.Type == OperandType.I32); @@ -902,18 +880,33 @@ namespace ARMeilleure.Instructions Operand negShiftLsB = context.Negate(shiftLsB); + Operand isInRange = context.BitwiseAnd( + context.ICompareLess(shiftLsB, Const(8 << size)), + context.ICompareLess(negShiftLsB, Const(8 << size))); + Operand isPositive = context.ICompareGreaterOrEqual(shiftLsB, Const(0)); - Operand shl = context.ShiftLeft (op, shiftLsB); - Operand shr = context.ShiftRightUI(op, negShiftLsB); + Operand shl = context.ShiftLeft(op, shiftLsB); + + Operand sarOrShr = signed + ? context.ShiftRightSI(op, negShiftLsB) + : context.ShiftRightUI(op, negShiftLsB); - Operand res = context.ConditionalSelect(isPositive, shl, shr); + Operand res = context.ConditionalSelect(isPositive, shl, sarOrShr); - Operand isOutOfRange = context.BitwiseOr( - context.ICompareGreaterOrEqual(shiftLsB, Const(8 << size)), - context.ICompareGreaterOrEqual(negShiftLsB, Const(8 << size))); + if (signed) + { + Operand isPositive2 = context.ICompareGreaterOrEqual(op, Const(0L)); + + Operand res2 = context.ConditionalSelect(isPositive2, Const(0L), Const(-1L)); + res2 = context.ConditionalSelect(isPositive, Const(0L), res2); - return context.ConditionalSelect(isOutOfRange, Const(0UL), res); + return context.ConditionalSelect(isInRange, res, res2); + } + else + { + return context.ConditionalSelect(isInRange, res, Const(0UL)); + } } private static void EmitVectorShrImmNarrowOpZx(ArmEmitterContext context, bool round) @@ -1174,5 +1167,26 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), res); } } + + private static void EmitSshlOrUshl(ArmEmitterContext context, bool signed, bool scalar) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = !scalar ? op.GetBytesCount() >> op.Size : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract (context, op.Rn, index, op.Size, signed); + Operand me = EmitVectorExtractSx(context, op.Rm, index << op.Size, 0); + + Operand e = EmitShlRegOp(context, ne, context.ConvertI64ToI32(me), op.Size, signed); + + res = EmitVectorInsert(context, res, e, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } } } diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs index e4d08456..69b5d3fc 100644 --- a/ARMeilleure/Instructions/InstName.cs +++ b/ARMeilleure/Instructions/InstName.cs @@ -212,14 +212,18 @@ namespace ARMeilleure.Instructions Fmax_V, Fmaxnm_S, Fmaxnm_V, + Fmaxnmp_V, Fmaxnmv_V, Fmaxp_V, + Fmaxv_V, Fmin_S, Fmin_V, Fminnm_S, Fminnm_V, + Fminnmp_V, Fminnmv_V, Fminp_V, + Fminv_V, Fmla_Se, Fmla_V, Fmla_Ve, @@ -378,6 +382,7 @@ namespace ARMeilleure.Instructions Srshr_V, Srsra_S, Srsra_V, + Sshl_S, Sshl_V, Sshll_V, Sshr_S, @@ -444,6 +449,7 @@ namespace ARMeilleure.Instructions Urshr_V, Ursra_S, Ursra_V, + Ushl_S, Ushl_V, Ushll_V, Ushr_S, diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs index deffabe1..b951caf8 100644 --- a/ARMeilleure/Translation/PTC/Ptc.cs +++ b/ARMeilleure/Translation/PTC/Ptc.cs @@ -19,8 +19,8 @@ namespace ARMeilleure.Translation.PTC public static class Ptc { private const string HeaderMagic = "PTChd"; - - private const int InternalVersion = 9; //! To be incremented manually for each change to the ARMeilleure project. + + private const int InternalVersion = 10; //! To be incremented manually for each change to the ARMeilleure project. private const string BaseDir = "Ryujinx"; |
