CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Now HardwareCapabilities uses CpuId. (#1650)

* net5.0 * CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Switch to .NET 5.0. Nits. Tests performed successfully in both debug and release mode (for all instructions involved). * Address comment. * Update appveyor.yml * Revert "Update appveyor.yml" This reverts commit 27cdd59e8b90e227e6924d9c162af26c00a89013. * Remove Assembler CpuId. * Update appveyor.yml * Address comment.
author: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> 2020-11-18 19:35:54 +0100
committer: GitHub <noreply@github.com> 2020-11-18 19:35:54 +0100
commit: 0679084f115b6838dec4d8c5e85044c33d4122d0 (patch)
tree: 0d25ace42740e37d6bb2a8cd30fa92c5313d265a /ARMeilleure/Instructions
parent: eafee34feebd432151809df402f3f696e4d93d08 (diff)
1 files changed, 65 insertions, 23 deletions
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt.cs b/ARMeilleure/Instructions/InstEmitSimdCvt.cs
index edcf35d5..0350427c 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt.cs
@@ -60,21 +60,48 @@ namespace ARMeilleure.Instructions
             }
             else if (op.Size == 0 && op.Opc == 3) // Single -> Half.
             {
-                Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+                if (Optimizations.UseF16c)
+                {
+                    Debug.Assert(!Optimizations.ForceLegacySse);
 
-                Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+                    Operand n = GetVec(op.Rn);
 
-                res = context.ZeroExtend16(OperandType.I64, res);
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+                            res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112()
+                            res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14));
 
-                context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+                    context.Copy(GetVec(op.Rd), res);
+                }
+                else
+                {
+                    Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
+
+                    Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
+
+                    res = context.ZeroExtend16(OperandType.I64, res);
+
+                    context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
+                }
             }
             else if (op.Size == 3 && op.Opc == 0) // Half -> Single.
             {
-                Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
+                if (Optimizations.UseF16c)
+                {
+                    Debug.Assert(!Optimizations.ForceLegacySse);
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn));
+                            res = context.VectorZeroUpper96(res);
+
+                    context.Copy(GetVec(op.Rd), res);
+                }
+                else
+                {
+                    Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
 
-                Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
+                    Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
 
-                context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+                    context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
+                }
             }
             else if (op.Size == 1 && op.Opc == 3) // Double -> Half.
             {
@@ -129,18 +156,20 @@ namespace ARMeilleure.Instructions
             if (Optimizations.UseSse2 && sizeF == 1)
             {
                 Operand n = GetVec(op.Rn);
-                Operand res;
 
-                if (op.RegisterSize == RegisterSize.Simd128)
-                {
-                    res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n);
-                }
-                else
-                {
-                    res = n;
-                }
+                Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+                        res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
 
-                res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
+                context.Copy(GetVec(op.Rd), res);
+            }
+            else if (Optimizations.UseF16c && sizeF == 0)
+            {
+                Debug.Assert(!Optimizations.ForceLegacySse);
+
+                Operand n = GetVec(op.Rn);
+
+                Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
+                        res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res);
 
                 context.Copy(GetVec(op.Rd), res);
             }
@@ -210,17 +239,30 @@ namespace ARMeilleure.Instructions
             {
                 Operand d = GetVec(op.Rd);
 
-                Operand res = context.VectorZeroUpper64(d);
+                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
 
                 Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn));
+                        nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
 
-                nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+                Operand res = context.VectorZeroUpper64(d);
+                        res = context.AddIntrinsic(movInst, res, nInt);
+
+                context.Copy(d, res);
+            }
+            else if (Optimizations.UseF16c && sizeF == 0)
+            {
+                Debug.Assert(!Optimizations.ForceLegacySse);
 
-                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
-                    ? Intrinsic.X86Movlhps
-                    : Intrinsic.X86Movhlps;
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+
+                Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
 
-                res = context.AddIntrinsic(movInst, res, nInt);
+                Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
+                        nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
+
+                Operand res = context.VectorZeroUpper64(d);
+                        res = context.AddIntrinsic(movInst, res, nInt);
 
                 context.Copy(d, res);
             }
author	LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>	2020-11-18 19:35:54 +0100
committer	GitHub <noreply@github.com>	2020-11-18 19:35:54 +0100
commit	0679084f115b6838dec4d8c5e85044c33d4122d0 (patch)
tree	0d25ace42740e37d6bb2a8cd30fa92c5313d265a /ARMeilleure/Instructions
parent	eafee34feebd432151809df402f3f696e4d93d08 (diff)