diff options
| author | LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> | 2020-11-18 19:35:54 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-11-18 19:35:54 +0100 |
| commit | 0679084f115b6838dec4d8c5e85044c33d4122d0 (patch) | |
| tree | 0d25ace42740e37d6bb2a8cd30fa92c5313d265a /ARMeilleure/Instructions | |
| parent | eafee34feebd432151809df402f3f696e4d93d08 (diff) | |
CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Now HardwareCapabilities uses CpuId. (#1650)
* net5.0
* CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Switch to .NET 5.0.
Nits.
Tests performed successfully in both debug and release modes (for all instructions involved).
* Address comment.
* Update appveyor.yml
* Revert "Update appveyor.yml"
This reverts commit 27cdd59e8b90e227e6924d9c162af26c00a89013.
* Remove Assembler CpuId.
* Update appveyor.yml
* Address comment.
Diffstat (limited to 'ARMeilleure/Instructions')
| -rw-r--r-- | ARMeilleure/Instructions/InstEmitSimdCvt.cs | 88 |
1 file changed, 65 insertions, 23 deletions
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt.cs b/ARMeilleure/Instructions/InstEmitSimdCvt.cs index edcf35d5..0350427c 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCvt.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCvt.cs @@ -60,21 +60,48 @@ namespace ARMeilleure.Instructions } else if (op.Size == 0 && op.Opc == 3) // Single -> Half. { - Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0); + if (Optimizations.UseF16c) + { + Debug.Assert(!Optimizations.ForceLegacySse); - Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne); + Operand n = GetVec(op.Rn); - res = context.ZeroExtend16(OperandType.I64, res); + Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112() + res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14)); - context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1)); + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0); + + Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne); + + res = context.ZeroExtend16(OperandType.I64, res); + + context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1)); + } } else if (op.Size == 3 && op.Opc == 0) // Half -> Single. 
{ - Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1); + if (Optimizations.UseF16c) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn)); + res = context.VectorZeroUpper96(res); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1); - Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne); + Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne); - context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } } else if (op.Size == 1 && op.Opc == 3) // Double -> Half. { @@ -129,18 +156,20 @@ namespace ARMeilleure.Instructions if (Optimizations.UseSse2 && sizeF == 1) { Operand n = GetVec(op.Rn); - Operand res; - if (op.RegisterSize == RegisterSize.Simd128) - { - res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n); - } - else - { - res = n; - } + Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n; + res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res); - res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res); + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseF16c && sizeF == 0) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + Operand n = GetVec(op.Rn); + + Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n; + res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res); context.Copy(GetVec(op.Rd), res); } @@ -210,17 +239,30 @@ namespace ARMeilleure.Instructions { Operand d = GetVec(op.Rd); - Operand res = context.VectorZeroUpper64(d); + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? 
Intrinsic.X86Movlhps : Intrinsic.X86Movhlps; Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn)); + nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt); - nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt); + Operand res = context.VectorZeroUpper64(d); + res = context.AddIntrinsic(movInst, res, nInt); + + context.Copy(d, res); + } + else if (Optimizations.UseF16c && sizeF == 0) + { + Debug.Assert(!Optimizations.ForceLegacySse); - Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 - ? Intrinsic.X86Movlhps - : Intrinsic.X86Movhlps; + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps; - res = context.AddIntrinsic(movInst, res, nInt); + Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt); + + Operand res = context.VectorZeroUpper64(d); + res = context.AddIntrinsic(movInst, res, nInt); context.Copy(d, res); } |
