| author | TSR Berry <20988865+TSRBerry@users.noreply.github.com> | 2023-04-08 01:22:00 +0200 |
|---|---|---|
| committer | Mary <thog@protonmail.com> | 2023-04-27 23:51:14 +0200 |
| commit | cee712105850ac3385cd0091a923438167433f9f (patch) | |
| tree | 4a5274b21d8b7f938c0d0ce18736d3f2993b11b1 /src/ARMeilleure/Instructions | |
| parent | cd124bda587ef09668a971fa1cac1c3f0cfc9f21 (diff) | |
Move solution and projects to src
Diffstat (limited to 'src/ARMeilleure/Instructions')
55 files changed, 32362 insertions, 0 deletions
diff --git a/src/ARMeilleure/Instructions/CryptoHelper.cs b/src/ARMeilleure/Instructions/CryptoHelper.cs new file mode 100644 index 00000000..e517c75d --- /dev/null +++ b/src/ARMeilleure/Instructions/CryptoHelper.cs @@ -0,0 +1,280 @@ +// https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf + +using ARMeilleure.State; +using System; + +namespace ARMeilleure.Instructions +{ + static class CryptoHelper + { +#region "LookUp Tables" + private static ReadOnlySpan<byte> _sBox => new byte[] + { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + }; + + private static ReadOnlySpan<byte> _invSBox => new byte[] + { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 
0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + }; + + private static ReadOnlySpan<byte> _gfMul02 => new byte[] + { + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, + 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, + 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, + 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, + 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, + 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, + 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, + 0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, + 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, + 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, + 0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, + 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, + 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, + 0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, + 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5 + }; + + private static ReadOnlySpan<byte> _gfMul03 => new byte[] + { + 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, + 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, + 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, + 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41, + 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, + 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, + 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1, + 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, + 0x9b, 0x98, 0x9d, 0x9e, 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a, + 0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6, 0xbf, 0xbc, 0xb9, 0xba, + 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea, + 0xcb, 0xc8, 0xcd, 0xce, 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda, + 0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, 0x4f, 0x4c, 0x49, 0x4a, + 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a, + 0x3b, 0x38, 0x3d, 0x3e, 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a, + 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a + }; + + private static ReadOnlySpan<byte> _gfMul09 => new byte[] + { + 0x00, 
0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, + 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, + 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, + 0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc, + 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01, + 0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91, + 0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a, + 0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa, + 0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b, + 0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b, + 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, + 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, + 0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 0xed, + 0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d, + 0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6, + 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46 + }; + + private static ReadOnlySpan<byte> _gfMul0B => new byte[] + { + 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, + 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, + 0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12, + 0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2, + 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f, + 0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f, + 0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4, + 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54, + 0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e, + 0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e, + 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5, + 0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55, + 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, + 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, + 0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13, + 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3 + }; + + private static ReadOnlySpan<byte> _gfMul0D => new byte[] + { + 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, + 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, + 0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 
0xf0, + 0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20, + 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, + 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, + 0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d, + 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d, + 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, + 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, + 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a, + 0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa, + 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, + 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, + 0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47, + 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97 + }; + + private static ReadOnlySpan<byte> _gfMul0E => new byte[] + { + 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, + 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, + 0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81, + 0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61, + 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, + 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17, + 0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c, + 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc, + 0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b, + 0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb, + 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, + 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, + 0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6, + 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56, + 0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, + 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d + }; + + private static ReadOnlySpan<byte> _srPerm => new byte[] + { + 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 + }; + + private static ReadOnlySpan<byte> _isrPerm => new byte[] + { + 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 + }; +#endregion + + public static V128 AesInvMixColumns(V128 op) + { + byte[] inState = op.ToArray(); + byte[] outState = new byte[16]; + + for (int columns = 0; columns <= 3; columns++) + { + int idx = columns << 2; + + byte row0 = inState[idx + 0]; // A, E, I, M: [row0, col0-col3] + byte row1 = inState[idx + 1]; // B, F, J, N: [row1, col0-col3] + byte row2 = inState[idx + 2]; // C, 
G, K, O: [row2, col0-col3] + byte row3 = inState[idx + 3]; // D, H, L, P: [row3, col0-col3] + + outState[idx + 0] = (byte)((uint)_gfMul0E[row0] ^ _gfMul0B[row1] ^ _gfMul0D[row2] ^ _gfMul09[row3]); + outState[idx + 1] = (byte)((uint)_gfMul09[row0] ^ _gfMul0E[row1] ^ _gfMul0B[row2] ^ _gfMul0D[row3]); + outState[idx + 2] = (byte)((uint)_gfMul0D[row0] ^ _gfMul09[row1] ^ _gfMul0E[row2] ^ _gfMul0B[row3]); + outState[idx + 3] = (byte)((uint)_gfMul0B[row0] ^ _gfMul0D[row1] ^ _gfMul09[row2] ^ _gfMul0E[row3]); + } + + return new V128(outState); + } + + public static V128 AesInvShiftRows(V128 op) + { + byte[] inState = op.ToArray(); + byte[] outState = new byte[16]; + + for (int idx = 0; idx <= 15; idx++) + { + outState[_isrPerm[idx]] = inState[idx]; + } + + return new V128(outState); + } + + public static V128 AesInvSubBytes(V128 op) + { + byte[] inState = op.ToArray(); + byte[] outState = new byte[16]; + + for (int idx = 0; idx <= 15; idx++) + { + outState[idx] = _invSBox[inState[idx]]; + } + + return new V128(outState); + } + + public static V128 AesMixColumns(V128 op) + { + byte[] inState = op.ToArray(); + byte[] outState = new byte[16]; + + for (int columns = 0; columns <= 3; columns++) + { + int idx = columns << 2; + + byte row0 = inState[idx + 0]; // A, E, I, M: [row0, col0-col3] + byte row1 = inState[idx + 1]; // B, F, J, N: [row1, col0-col3] + byte row2 = inState[idx + 2]; // C, G, K, O: [row2, col0-col3] + byte row3 = inState[idx + 3]; // D, H, L, P: [row3, col0-col3] + + outState[idx + 0] = (byte)((uint)_gfMul02[row0] ^ _gfMul03[row1] ^ row2 ^ row3); + outState[idx + 1] = (byte)((uint)row0 ^ _gfMul02[row1] ^ _gfMul03[row2] ^ row3); + outState[idx + 2] = (byte)((uint)row0 ^ row1 ^ _gfMul02[row2] ^ _gfMul03[row3]); + outState[idx + 3] = (byte)((uint)_gfMul03[row0] ^ row1 ^ row2 ^ _gfMul02[row3]); + } + + return new V128(outState); + } + + public static V128 AesShiftRows(V128 op) + { + byte[] inState = op.ToArray(); + byte[] outState = new byte[16]; + + for (int idx = 0; idx <= 15; idx++) + { + outState[_srPerm[idx]] = inState[idx]; + } + + return new V128(outState); + } + + public static V128 AesSubBytes(V128 op) + { + byte[] inState = op.ToArray(); + byte[] outState = new byte[16]; + + for (int idx = 0; idx <= 15; idx++) + { + outState[idx] = _sBox[inState[idx]]; + } + + return new V128(outState); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitAlu.cs b/src/ARMeilleure/Instructions/InstEmitAlu.cs new file mode 100644 index 00000000..e0d10e77 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitAlu.cs @@ -0,0 +1,400 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System.Diagnostics; + +using static ARMeilleure.Instructions.InstEmitAluHelper; +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Adc(ArmEmitterContext context) => EmitAdc(context, setFlags: false); + public static void Adcs(ArmEmitterContext context) => EmitAdc(context, setFlags: true); + + private static void EmitAdc(ArmEmitterContext context, bool setFlags) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand d = context.Add(n, m); + + Operand carry = GetFlag(PState.CFlag); + + if (context.CurrOp.RegisterSize == RegisterSize.Int64) + { + carry = context.ZeroExtend32(OperandType.I64, carry); + } + + d = context.Add(d, 
carry); + + if (setFlags) + { + EmitNZFlagsCheck(context, d); + + EmitAdcsCCheck(context, n, d); + EmitAddsVCheck(context, n, m, d); + } + + SetAluDOrZR(context, d); + } + + public static void Add(ArmEmitterContext context) + { + SetAluD(context, context.Add(GetAluN(context), GetAluM(context))); + } + + public static void Adds(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + context.MarkComparison(n, m); + + Operand d = context.Add(n, m); + + EmitNZFlagsCheck(context, d); + + EmitAddsCCheck(context, n, d); + EmitAddsVCheck(context, n, m, d); + + SetAluDOrZR(context, d); + } + + public static void And(ArmEmitterContext context) + { + SetAluD(context, context.BitwiseAnd(GetAluN(context), GetAluM(context))); + } + + public static void Ands(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand d = context.BitwiseAnd(n, m); + + EmitNZFlagsCheck(context, d); + EmitCVFlagsClear(context); + + SetAluDOrZR(context, d); + } + + public static void Asrv(ArmEmitterContext context) + { + SetAluDOrZR(context, context.ShiftRightSI(GetAluN(context), GetAluMShift(context))); + } + + public static void Bic(ArmEmitterContext context) => EmitBic(context, setFlags: false); + public static void Bics(ArmEmitterContext context) => EmitBic(context, setFlags: true); + + private static void EmitBic(ArmEmitterContext context, bool setFlags) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand d = context.BitwiseAnd(n, context.BitwiseNot(m)); + + if (setFlags) + { + EmitNZFlagsCheck(context, d); + EmitCVFlagsClear(context); + } + + SetAluD(context, d, setFlags); + } + + public static void Cls(ArmEmitterContext context) + { + OpCodeAlu op = (OpCodeAlu)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + + Operand nHigh = context.ShiftRightUI(n, Const(1)); + + bool is32Bits = op.RegisterSize == RegisterSize.Int32; + + Operand mask = is32Bits ? 
Const(int.MaxValue) : Const(long.MaxValue); + + Operand nLow = context.BitwiseAnd(n, mask); + + Operand res = context.CountLeadingZeros(context.BitwiseExclusiveOr(nHigh, nLow)); + + res = context.Subtract(res, Const(res.Type, 1)); + + SetAluDOrZR(context, res); + } + + public static void Clz(ArmEmitterContext context) + { + OpCodeAlu op = (OpCodeAlu)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + + Operand d = context.CountLeadingZeros(n); + + SetAluDOrZR(context, d); + } + + public static void Eon(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand d = context.BitwiseExclusiveOr(n, context.BitwiseNot(m)); + + SetAluD(context, d); + } + + public static void Eor(ArmEmitterContext context) + { + SetAluD(context, context.BitwiseExclusiveOr(GetAluN(context), GetAluM(context))); + } + + public static void Extr(ArmEmitterContext context) + { + OpCodeAluRs op = (OpCodeAluRs)context.CurrOp; + + Operand res = GetIntOrZR(context, op.Rm); + + if (op.Shift != 0) + { + if (op.Rn == op.Rm) + { + res = context.RotateRight(res, Const(op.Shift)); + } + else + { + res = context.ShiftRightUI(res, Const(op.Shift)); + + Operand n = GetIntOrZR(context, op.Rn); + + int invShift = op.GetBitsCount() - op.Shift; + + res = context.BitwiseOr(res, context.ShiftLeft(n, Const(invShift))); + } + } + + SetAluDOrZR(context, res); + } + + public static void Lslv(ArmEmitterContext context) + { + SetAluDOrZR(context, context.ShiftLeft(GetAluN(context), GetAluMShift(context))); + } + + public static void Lsrv(ArmEmitterContext context) + { + SetAluDOrZR(context, context.ShiftRightUI(GetAluN(context), GetAluMShift(context))); + } + + public static void Sbc(ArmEmitterContext context) => EmitSbc(context, setFlags: false); + public static void Sbcs(ArmEmitterContext context) => EmitSbc(context, setFlags: true); + + private static void EmitSbc(ArmEmitterContext context, bool setFlags) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand d = context.Subtract(n, m); + + Operand borrow = context.BitwiseExclusiveOr(GetFlag(PState.CFlag), Const(1)); + + if (context.CurrOp.RegisterSize == RegisterSize.Int64) + { + borrow = context.ZeroExtend32(OperandType.I64, borrow); + } + + d = context.Subtract(d, borrow); + + if (setFlags) + { + EmitNZFlagsCheck(context, d); + + EmitSbcsCCheck(context, n, m); + EmitSubsVCheck(context, n, m, d); + } + + SetAluDOrZR(context, d); + } + + public static void Sub(ArmEmitterContext context) + { + SetAluD(context, context.Subtract(GetAluN(context), GetAluM(context))); + } + + public static void Subs(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + context.MarkComparison(n, m); + + Operand d = context.Subtract(n, m); + + EmitNZFlagsCheck(context, d); + + EmitSubsCCheck(context, n, m); + EmitSubsVCheck(context, n, m, d); + + SetAluDOrZR(context, d); + } + + public static void Orn(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand d = context.BitwiseOr(n, context.BitwiseNot(m)); + + SetAluD(context, d); + } + + public static void Orr(ArmEmitterContext context) + { + SetAluD(context, context.BitwiseOr(GetAluN(context), GetAluM(context))); + } + + public static void Rbit(ArmEmitterContext context) + { + OpCodeAlu op = (OpCodeAlu)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + Operand d; + + if (op.RegisterSize == RegisterSize.Int32) + { + d = EmitReverseBits32Op(context, n); + } + else + { + d = 
EmitReverseBits64Op(context, n); + } + + SetAluDOrZR(context, d); + } + + private static Operand EmitReverseBits64Op(ArmEmitterContext context, Operand op) + { + Debug.Assert(op.Type == OperandType.I64); + + Operand val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op, Const(0xaaaaaaaaaaaaaaaaul)), Const(1)), + context.ShiftLeft (context.BitwiseAnd(op, Const(0x5555555555555555ul)), Const(1))); + + val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xccccccccccccccccul)), Const(2)), + context.ShiftLeft (context.BitwiseAnd(val, Const(0x3333333333333333ul)), Const(2))); + val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xf0f0f0f0f0f0f0f0ul)), Const(4)), + context.ShiftLeft (context.BitwiseAnd(val, Const(0x0f0f0f0f0f0f0f0ful)), Const(4))); + val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xff00ff00ff00ff00ul)), Const(8)), + context.ShiftLeft (context.BitwiseAnd(val, Const(0x00ff00ff00ff00fful)), Const(8))); + val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xffff0000ffff0000ul)), Const(16)), + context.ShiftLeft (context.BitwiseAnd(val, Const(0x0000ffff0000fffful)), Const(16))); + + return context.BitwiseOr(context.ShiftRightUI(val, Const(32)), context.ShiftLeft(val, Const(32))); + } + + public static void Rev16(ArmEmitterContext context) + { + OpCodeAlu op = (OpCodeAlu)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + Operand d; + + if (op.RegisterSize == RegisterSize.Int32) + { + d = EmitReverseBytes16_32Op(context, n); + } + else + { + d = EmitReverseBytes16_64Op(context, n); + } + + SetAluDOrZR(context, d); + } + + public static void Rev32(ArmEmitterContext context) + { + OpCodeAlu op = (OpCodeAlu)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + Operand d; + + if (op.RegisterSize == RegisterSize.Int32) + { + d = context.ByteSwap(n); + } + else + { + d = EmitReverseBytes32_64Op(context, n); + } + + SetAluDOrZR(context, d); + } + + private static Operand EmitReverseBytes32_64Op(ArmEmitterContext context, Operand op) + { + Debug.Assert(op.Type == OperandType.I64); + + Operand val = EmitReverseBytes16_64Op(context, op); + + return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xffff0000ffff0000ul)), Const(16)), + context.ShiftLeft (context.BitwiseAnd(val, Const(0x0000ffff0000fffful)), Const(16))); + } + + public static void Rev64(ArmEmitterContext context) + { + OpCodeAlu op = (OpCodeAlu)context.CurrOp; + + SetAluDOrZR(context, context.ByteSwap(GetIntOrZR(context, op.Rn))); + } + + public static void Rorv(ArmEmitterContext context) + { + SetAluDOrZR(context, context.RotateRight(GetAluN(context), GetAluMShift(context))); + } + + private static Operand GetAluMShift(ArmEmitterContext context) + { + IOpCodeAluRs op = (IOpCodeAluRs)context.CurrOp; + + Operand m = GetIntOrZR(context, op.Rm); + + if (op.RegisterSize == RegisterSize.Int64) + { + m = context.ConvertI64ToI32(m); + } + + return context.BitwiseAnd(m, Const(context.CurrOp.GetBitsCount() - 1)); + } + + private static void EmitCVFlagsClear(ArmEmitterContext context) + { + SetFlag(context, PState.CFlag, Const(0)); + SetFlag(context, PState.VFlag, Const(0)); + } + + public static void SetAluD(ArmEmitterContext context, Operand d) + { + SetAluD(context, d, x31IsZR: false); + } + + public static void SetAluDOrZR(ArmEmitterContext context, Operand d) + { + SetAluD(context, d, x31IsZR: true); + } + + public static void SetAluD(ArmEmitterContext context, Operand d, bool x31IsZR) + { + 
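+ // When x31IsZR is true (or the opcode is a shifted-register form), encoding 31 is the zero register, so the write is simply discarded; otherwise encoding 31 refers to SP.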
IOpCodeAlu op = (IOpCodeAlu)context.CurrOp; + + if ((x31IsZR || op is IOpCodeAluRs) && op.Rd == RegisterConsts.ZeroIndex) + { + return; + } + + SetIntOrSP(context, op.Rd, d); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitAlu32.cs b/src/ARMeilleure/Instructions/InstEmitAlu32.cs new file mode 100644 index 00000000..584ada7e --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitAlu32.cs @@ -0,0 +1,931 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitAluHelper; +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Add(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.Add(n, m); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + + EmitAddsCCheck(context, n, res); + EmitAddsVCheck(context, n, m, res); + } + + EmitAluStore(context, res); + } + + public static void Adc(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.Add(n, m); + + Operand carry = GetFlag(PState.CFlag); + + res = context.Add(res, carry); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + + EmitAdcsCCheck(context, n, res); + EmitAddsVCheck(context, n, m, res); + } + + EmitAluStore(context, res); + } + + public static void And(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res = context.BitwiseAnd(n, m); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitAluStore(context, res); + } + + public static void Bfc(ArmEmitterContext context) + { + IOpCode32AluBf op = (IOpCode32AluBf)context.CurrOp; + + Operand d = GetIntA32(context, op.Rd); + Operand res = context.BitwiseAnd(d, Const(~op.DestMask)); + + SetIntA32(context, op.Rd, res); + } + + public static void Bfi(ArmEmitterContext context) + { + IOpCode32AluBf op = (IOpCode32AluBf)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand d = GetIntA32(context, op.Rd); + Operand part = context.BitwiseAnd(n, Const(op.SourceMask)); + + if (op.Lsb != 0) + { + part = context.ShiftLeft(part, Const(op.Lsb)); + } + + Operand res = context.BitwiseAnd(d, Const(~op.DestMask)); + res = context.BitwiseOr(res, context.BitwiseAnd(part, Const(op.DestMask))); + + SetIntA32(context, op.Rd, res); + } + + public static void Bic(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res = context.BitwiseAnd(n, context.BitwiseNot(m)); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitAluStore(context, res); + } + + public static void Clz(ArmEmitterContext context) + { + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.CountLeadingZeros(m); + EmitAluStore(context, res); + } + + public static void Cmp(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.Subtract(n, m); + + 
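+ // Cmp only updates the NZCV flags; the subtraction result itself is never written back to a register.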
EmitNZFlagsCheck(context, res); + + EmitSubsCCheck(context, n, res); + EmitSubsVCheck(context, n, m, res); + } + + public static void Cmn(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.Add(n, m); + + EmitNZFlagsCheck(context, res); + + EmitAddsCCheck(context, n, res); + EmitAddsVCheck(context, n, m, res); + } + + public static void Eor(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res = context.BitwiseExclusiveOr(n, m); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitAluStore(context, res); + } + + public static void Mov(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand m = GetAluM(context); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, m); + } + + EmitAluStore(context, m); + } + + public static void Movt(ArmEmitterContext context) + { + IOpCode32AluImm16 op = (IOpCode32AluImm16)context.CurrOp; + + Operand d = GetIntA32(context, op.Rd); + Operand imm = Const(op.Immediate << 16); // Immeditate value as top halfword. + Operand res = context.BitwiseAnd(d, Const(0x0000ffff)); + res = context.BitwiseOr(res, imm); + + EmitAluStore(context, res); + } + + public static void Mul(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res = context.Multiply(n, m); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitAluStore(context, res); + } + + public static void Mvn(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + Operand m = GetAluM(context); + + Operand res = context.BitwiseNot(m); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitAluStore(context, res); + } + + public static void Orr(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res = context.BitwiseOr(n, m); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitAluStore(context, res); + } + + public static void Orn(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res = context.BitwiseOr(n, context.BitwiseNot(m)); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitAluStore(context, res); + } + + public static void Pkh(ArmEmitterContext context) + { + OpCode32AluRsImm op = (OpCode32AluRsImm)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res; + + bool tbform = op.ShiftType == ShiftType.Asr; + if (tbform) + { + res = context.BitwiseOr(context.BitwiseAnd(n, Const(0xFFFF0000)), context.BitwiseAnd(m, Const(0xFFFF))); + } + else + { + res = context.BitwiseOr(context.BitwiseAnd(m, Const(0xFFFF0000)), context.BitwiseAnd(n, Const(0xFFFF))); + } + + EmitAluStore(context, res); + } + + public static void Rbit(ArmEmitterContext context) + { + Operand m = GetAluM(context); + + Operand res = EmitReverseBits32Op(context, m); + + EmitAluStore(context, res); + } + + public static void Rev(ArmEmitterContext context) + { + Operand m = GetAluM(context); + + Operand res = context.ByteSwap(m); + + EmitAluStore(context, res); + } + + public static 
void Rev16(ArmEmitterContext context) + { + Operand m = GetAluM(context); + + Operand res = EmitReverseBytes16_32Op(context, m); + + EmitAluStore(context, res); + } + + public static void Revsh(ArmEmitterContext context) + { + Operand m = GetAluM(context); + + Operand res = EmitReverseBytes16_32Op(context, m); + + EmitAluStore(context, context.SignExtend16(OperandType.I32, res)); + } + + public static void Rsc(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.Subtract(m, n); + + Operand borrow = context.BitwiseExclusiveOr(GetFlag(PState.CFlag), Const(1)); + + res = context.Subtract(res, borrow); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + + EmitSbcsCCheck(context, m, n); + EmitSubsVCheck(context, m, n, res); + } + + EmitAluStore(context, res); + } + + public static void Rsb(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.Subtract(m, n); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + + EmitSubsCCheck(context, m, res); + EmitSubsVCheck(context, m, n, res); + } + + EmitAluStore(context, res); + } + + public static void Sadd8(ArmEmitterContext context) + { + EmitAddSub8(context, add: true, unsigned: false); + } + + public static void Sbc(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.Subtract(n, m); + + Operand borrow = context.BitwiseExclusiveOr(GetFlag(PState.CFlag), Const(1)); + + res = context.Subtract(res, borrow); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + + EmitSbcsCCheck(context, n, m); + EmitSubsVCheck(context, n, m, res); + } + + EmitAluStore(context, res); + } + + public static void Sbfx(ArmEmitterContext context) + { + IOpCode32AluBf op = (IOpCode32AluBf)context.CurrOp; + + var msb = op.Lsb + op.Msb; // For this instruction, the msb is actually a width. 
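+ // Shifting the field's top bit up to bit 31 and then arithmetic-shifting back down both extracts the bitfield and sign-extends it.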
+ + Operand n = GetIntA32(context, op.Rn); + Operand res = context.ShiftRightSI(context.ShiftLeft(n, Const(31 - msb)), Const(31 - op.Msb)); + + SetIntA32(context, op.Rd, res); + } + + public static void Sdiv(ArmEmitterContext context) + { + EmitDiv(context, unsigned: false); + } + + public static void Sel(ArmEmitterContext context) + { + IOpCode32AluReg op = (IOpCode32AluReg)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + + Operand ge0 = context.ZeroExtend8(OperandType.I32, context.Negate(GetFlag(PState.GE0Flag))); + Operand ge1 = context.ZeroExtend8(OperandType.I32, context.Negate(GetFlag(PState.GE1Flag))); + Operand ge2 = context.ZeroExtend8(OperandType.I32, context.Negate(GetFlag(PState.GE2Flag))); + Operand ge3 = context.Negate(GetFlag(PState.GE3Flag)); + + Operand mask = context.BitwiseOr(ge0, context.ShiftLeft(ge1, Const(8))); + mask = context.BitwiseOr(mask, context.ShiftLeft(ge2, Const(16))); + mask = context.BitwiseOr(mask, context.ShiftLeft(ge3, Const(24))); + + Operand res = context.BitwiseOr(context.BitwiseAnd(n, mask), context.BitwiseAnd(m, context.BitwiseNot(mask))); + + SetIntA32(context, op.Rd, res); + } + + public static void Shadd8(ArmEmitterContext context) + { + EmitHadd8(context, unsigned: false); + } + + public static void Shsub8(ArmEmitterContext context) + { + EmitHsub8(context, unsigned: false); + } + + public static void Ssat(ArmEmitterContext context) + { + OpCode32Sat op = (OpCode32Sat)context.CurrOp; + + EmitSat(context, -(1 << op.SatImm), (1 << op.SatImm) - 1); + } + + public static void Ssat16(ArmEmitterContext context) + { + OpCode32Sat16 op = (OpCode32Sat16)context.CurrOp; + + EmitSat16(context, -(1 << op.SatImm), (1 << op.SatImm) - 1); + } + + public static void Ssub8(ArmEmitterContext context) + { + EmitAddSub8(context, add: false, unsigned: false); + } + + public static void Sub(ArmEmitterContext context) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context, setCarry: false); + + Operand res = context.Subtract(n, m); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + + EmitSubsCCheck(context, n, res); + EmitSubsVCheck(context, n, m, res); + } + + EmitAluStore(context, res); + } + + public static void Sxtb(ArmEmitterContext context) + { + EmitSignExtend(context, true, 8); + } + + public static void Sxtb16(ArmEmitterContext context) + { + EmitExtend16(context, true); + } + + public static void Sxth(ArmEmitterContext context) + { + EmitSignExtend(context, true, 16); + } + + public static void Teq(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res = context.BitwiseExclusiveOr(n, m); + + EmitNZFlagsCheck(context, res); + } + + public static void Tst(ArmEmitterContext context) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + Operand res = context.BitwiseAnd(n, m); + EmitNZFlagsCheck(context, res); + } + + public static void Uadd8(ArmEmitterContext context) + { + EmitAddSub8(context, add: true, unsigned: true); + } + + public static void Ubfx(ArmEmitterContext context) + { + IOpCode32AluBf op = (IOpCode32AluBf)context.CurrOp; + + var msb = op.Lsb + op.Msb; // For this instruction, the msb is actually a width. 
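+ // Same extraction trick as Sbfx, but the logical right shift zero-extends the field instead.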
+ + Operand n = GetIntA32(context, op.Rn); + Operand res = context.ShiftRightUI(context.ShiftLeft(n, Const(31 - msb)), Const(31 - op.Msb)); + + SetIntA32(context, op.Rd, res); + } + + public static void Udiv(ArmEmitterContext context) + { + EmitDiv(context, unsigned: true); + } + + public static void Uhadd8(ArmEmitterContext context) + { + EmitHadd8(context, unsigned: true); + } + + public static void Uhsub8(ArmEmitterContext context) + { + EmitHsub8(context, unsigned: true); + } + + public static void Usat(ArmEmitterContext context) + { + OpCode32Sat op = (OpCode32Sat)context.CurrOp; + + EmitSat(context, 0, op.SatImm == 32 ? (int)(~0) : (1 << op.SatImm) - 1); + } + + public static void Usat16(ArmEmitterContext context) + { + OpCode32Sat16 op = (OpCode32Sat16)context.CurrOp; + + EmitSat16(context, 0, (1 << op.SatImm) - 1); + } + + public static void Usub8(ArmEmitterContext context) + { + EmitAddSub8(context, add: false, unsigned: true); + } + + public static void Uxtb(ArmEmitterContext context) + { + EmitSignExtend(context, false, 8); + } + + public static void Uxtb16(ArmEmitterContext context) + { + EmitExtend16(context, false); + } + + public static void Uxth(ArmEmitterContext context) + { + EmitSignExtend(context, false, 16); + } + + private static void EmitSignExtend(ArmEmitterContext context, bool signed, int bits) + { + IOpCode32AluUx op = (IOpCode32AluUx)context.CurrOp; + + Operand m = GetAluM(context); + Operand res; + + if (op.RotateBits == 0) + { + res = m; + } + else + { + Operand rotate = Const(op.RotateBits); + res = context.RotateRight(m, rotate); + } + + switch (bits) + { + case 8: + res = (signed) ? context.SignExtend8(OperandType.I32, res) : context.ZeroExtend8(OperandType.I32, res); + break; + case 16: + res = (signed) ? context.SignExtend16(OperandType.I32, res) : context.ZeroExtend16(OperandType.I32, res); + break; + } + + if (op.Add) + { + res = context.Add(res, GetAluN(context)); + } + + EmitAluStore(context, res); + } + + private static void EmitExtend16(ArmEmitterContext context, bool signed) + { + IOpCode32AluUx op = (IOpCode32AluUx)context.CurrOp; + + Operand m = GetAluM(context); + Operand res; + + if (op.RotateBits == 0) + { + res = m; + } + else + { + Operand rotate = Const(op.RotateBits); + res = context.RotateRight(m, rotate); + } + + Operand low16, high16; + if (signed) + { + low16 = context.SignExtend8(OperandType.I32, res); + high16 = context.SignExtend8(OperandType.I32, context.ShiftRightUI(res, Const(16))); + } + else + { + low16 = context.ZeroExtend8(OperandType.I32, res); + high16 = context.ZeroExtend8(OperandType.I32, context.ShiftRightUI(res, Const(16))); + } + + if (op.Add) + { + Operand n = GetAluN(context); + Operand lowAdd, highAdd; + if (signed) + { + lowAdd = context.SignExtend16(OperandType.I32, n); + highAdd = context.SignExtend16(OperandType.I32, context.ShiftRightUI(n, Const(16))); + } + else + { + lowAdd = context.ZeroExtend16(OperandType.I32, n); + highAdd = context.ZeroExtend16(OperandType.I32, context.ShiftRightUI(n, Const(16))); + } + + low16 = context.Add(low16, lowAdd); + high16 = context.Add(high16, highAdd); + } + + res = context.BitwiseOr( + context.ZeroExtend16(OperandType.I32, low16), + context.ShiftLeft(context.ZeroExtend16(OperandType.I32, high16), Const(16))); + + EmitAluStore(context, res); + } + + private static void EmitDiv(ArmEmitterContext context, bool unsigned) + { + Operand n = GetAluN(context); + Operand m = GetAluM(context); + Operand zero = Const(m.Type, 0); + + Operand divisorIsZero = context.ICompareEqual(m, 
zero); + + Operand lblBadDiv = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblBadDiv, divisorIsZero); + + if (!unsigned) + { + // ARM64 behaviour: If Rn == INT_MIN && Rm == -1, Rd = INT_MIN (overflow). + // TODO: tests to ensure A32 works the same + + Operand intMin = Const(int.MinValue); + Operand minus1 = Const(-1); + + Operand nIsIntMin = context.ICompareEqual(n, intMin); + Operand mIsMinus1 = context.ICompareEqual(m, minus1); + + Operand lblGoodDiv = Label(); + + context.BranchIfFalse(lblGoodDiv, context.BitwiseAnd(nIsIntMin, mIsMinus1)); + + EmitAluStore(context, intMin); + + context.Branch(lblEnd); + + context.MarkLabel(lblGoodDiv); + } + + Operand res = unsigned + ? context.DivideUI(n, m) + : context.Divide(n, m); + + EmitAluStore(context, res); + + context.Branch(lblEnd); + + context.MarkLabel(lblBadDiv); + + EmitAluStore(context, zero); + + context.MarkLabel(lblEnd); + } + + private static void EmitAddSub8(ArmEmitterContext context, bool add, bool unsigned) + { + IOpCode32AluReg op = (IOpCode32AluReg)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + + Operand res = Const(0); + + for (int byteSel = 0; byteSel < 4; byteSel++) + { + Operand shift = Const(byteSel * 8); + + Operand nByte = context.ShiftRightUI(n, shift); + Operand mByte = context.ShiftRightUI(m, shift); + + nByte = unsigned ? context.ZeroExtend8(OperandType.I32, nByte) : context.SignExtend8(OperandType.I32, nByte); + mByte = unsigned ? context.ZeroExtend8(OperandType.I32, mByte) : context.SignExtend8(OperandType.I32, mByte); + + Operand resByte = add ? context.Add(nByte, mByte) : context.Subtract(nByte, mByte); + + res = context.BitwiseOr(res, context.ShiftLeft(context.ZeroExtend8(OperandType.I32, resByte), shift)); + + SetFlag(context, PState.GE0Flag + byteSel, unsigned && add + ? context.ShiftRightUI(resByte, Const(8)) + : context.ShiftRightUI(context.BitwiseNot(resByte), Const(31))); + } + + SetIntA32(context, op.Rd, res); + } + + private static void EmitHadd8(ArmEmitterContext context, bool unsigned) + { + IOpCode32AluReg op = (IOpCode32AluReg)context.CurrOp; + + Operand m = GetIntA32(context, op.Rm); + Operand n = GetIntA32(context, op.Rn); + + Operand xor, res, carry; + + // This relies on the equality x+y == ((x&y) << 1) + (x^y). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x+y)/2, we can instead calculate (x&y) + ((x^y)>>1). + // We mask by 0x7F to remove the LSB so that it doesn't leak into the field below. + + res = context.BitwiseAnd(m, n); + carry = context.BitwiseExclusiveOr(m, n); + xor = context.ShiftRightUI(carry, Const(1)); + xor = context.BitwiseAnd(xor, Const(0x7F7F7F7Fu)); + res = context.Add(res, xor); + + if (!unsigned) + { + // Propagates the sign bit from (x^y)>>1 upwards by one. + carry = context.BitwiseAnd(carry, Const(0x80808080u)); + res = context.BitwiseExclusiveOr(res, carry); + } + + SetIntA32(context, op.Rd, res); + } + + private static void EmitHsub8(ArmEmitterContext context, bool unsigned) + { + IOpCode32AluReg op = (IOpCode32AluReg)context.CurrOp; + + Operand m = GetIntA32(context, op.Rm); + Operand n = GetIntA32(context, op.Rn); + Operand left, right, carry, res; + + // This relies on the equality x-y == (x^y) - (((x^y)&y) << 1). + // Note that x^y always contains the LSB of the result. + // Since we want to calculate (x+y)/2, we can instead calculate ((x^y)>>1) - ((x^y)&y). 
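+ // Worked example with single bytes x = 5 and y = 3: x^y = 6, (x^y)>>1 = 3, (x^y)&y = 2, and 3 - 2 = 1, i.e. (x - y) / 2.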
+ + carry = context.BitwiseExclusiveOr(m, n); + left = context.ShiftRightUI(carry, Const(1)); + right = context.BitwiseAnd(carry, m); + + // We must now perform a partitioned subtraction. + // We can do this because minuend contains 7 bit fields. + // We use the extra bit in minuend as a bit to borrow from; we set this bit. + // We invert this bit at the end as this tells us if that bit was borrowed from. + + res = context.BitwiseOr(left, Const(0x80808080)); + res = context.Subtract(res, right); + res = context.BitwiseExclusiveOr(res, Const(0x80808080)); + + if (!unsigned) + { + // We then sign extend the result into this bit. + carry = context.BitwiseAnd(carry, Const(0x80808080)); + res = context.BitwiseExclusiveOr(res, carry); + } + + SetIntA32(context, op.Rd, res); + } + + private static void EmitSat(ArmEmitterContext context, int intMin, int intMax) + { + OpCode32Sat op = (OpCode32Sat)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + + int shift = DecodeImmShift(op.ShiftType, op.Imm5); + + switch (op.ShiftType) + { + case ShiftType.Lsl: + if (shift == 32) + { + n = Const(0); + } + else + { + n = context.ShiftLeft(n, Const(shift)); + } + break; + case ShiftType.Asr: + if (shift == 32) + { + n = context.ShiftRightSI(n, Const(31)); + } + else + { + n = context.ShiftRightSI(n, Const(shift)); + } + break; + } + + Operand lblCheckLtIntMin = Label(); + Operand lblNoSat = Label(); + Operand lblEnd = Label(); + + context.BranchIfFalse(lblCheckLtIntMin, context.ICompareGreater(n, Const(intMax))); + + SetFlag(context, PState.QFlag, Const(1)); + SetIntA32(context, op.Rd, Const(intMax)); + context.Branch(lblEnd); + + context.MarkLabel(lblCheckLtIntMin); + context.BranchIfFalse(lblNoSat, context.ICompareLess(n, Const(intMin))); + + SetFlag(context, PState.QFlag, Const(1)); + SetIntA32(context, op.Rd, Const(intMin)); + context.Branch(lblEnd); + + context.MarkLabel(lblNoSat); + + SetIntA32(context, op.Rd, n); + + context.MarkLabel(lblEnd); + } + + private static void EmitSat16(ArmEmitterContext context, int intMin, int intMax) + { + OpCode32Sat16 op = (OpCode32Sat16)context.CurrOp; + + void SetD(int part, Operand value) + { + if (part == 0) + { + SetIntA32(context, op.Rd, context.ZeroExtend16(OperandType.I32, value)); + } + else + { + SetIntA32(context, op.Rd, context.BitwiseOr(GetIntA32(context, op.Rd), context.ShiftLeft(value, Const(16)))); + } + } + + Operand n = GetIntA32(context, op.Rn); + + Operand nLow = context.SignExtend16(OperandType.I32, n); + Operand nHigh = context.ShiftRightSI(n, Const(16)); + + for (int part = 0; part < 2; part++) + { + Operand nPart = part == 0 ? 
nLow : nHigh; + + Operand lblCheckLtIntMin = Label(); + Operand lblNoSat = Label(); + Operand lblEnd = Label(); + + context.BranchIfFalse(lblCheckLtIntMin, context.ICompareGreater(nPart, Const(intMax))); + + SetFlag(context, PState.QFlag, Const(1)); + SetD(part, Const(intMax)); + context.Branch(lblEnd); + + context.MarkLabel(lblCheckLtIntMin); + context.BranchIfFalse(lblNoSat, context.ICompareLess(nPart, Const(intMin))); + + SetFlag(context, PState.QFlag, Const(1)); + SetD(part, Const(intMin)); + context.Branch(lblEnd); + + context.MarkLabel(lblNoSat); + + SetD(part, nPart); + + context.MarkLabel(lblEnd); + } + } + + private static void EmitAluStore(ArmEmitterContext context, Operand value) + { + IOpCode32Alu op = (IOpCode32Alu)context.CurrOp; + + EmitGenericAluStoreA32(context, op.Rd, ShouldSetFlags(context), value); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitAluHelper.cs b/src/ARMeilleure/Instructions/InstEmitAluHelper.cs new file mode 100644 index 00000000..994878ad --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitAluHelper.cs @@ -0,0 +1,613 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static class InstEmitAluHelper + { + public static bool ShouldSetFlags(ArmEmitterContext context) + { + IOpCode32HasSetFlags op = (IOpCode32HasSetFlags)context.CurrOp; + + if (op.SetFlags == null) + { + return !context.IsInIfThenBlock; + } + + return op.SetFlags.Value; + } + + public static void EmitNZFlagsCheck(ArmEmitterContext context, Operand d) + { + SetFlag(context, PState.NFlag, context.ICompareLess (d, Const(d.Type, 0))); + SetFlag(context, PState.ZFlag, context.ICompareEqual(d, Const(d.Type, 0))); + } + + public static void EmitAdcsCCheck(ArmEmitterContext context, Operand n, Operand d) + { + // C = (Rd == Rn && CIn) || Rd < Rn + Operand cIn = GetFlag(PState.CFlag); + + Operand cOut = context.BitwiseAnd(context.ICompareEqual(d, n), cIn); + + cOut = context.BitwiseOr(cOut, context.ICompareLessUI(d, n)); + + SetFlag(context, PState.CFlag, cOut); + } + + public static void EmitAddsCCheck(ArmEmitterContext context, Operand n, Operand d) + { + // C = Rd < Rn + SetFlag(context, PState.CFlag, context.ICompareLessUI(d, n)); + } + + public static void EmitAddsVCheck(ArmEmitterContext context, Operand n, Operand m, Operand d) + { + // V = (Rd ^ Rn) & ~(Rn ^ Rm) < 0 + Operand vOut = context.BitwiseExclusiveOr(d, n); + + vOut = context.BitwiseAnd(vOut, context.BitwiseNot(context.BitwiseExclusiveOr(n, m))); + + vOut = context.ICompareLess(vOut, Const(vOut.Type, 0)); + + SetFlag(context, PState.VFlag, vOut); + } + + public static void EmitSbcsCCheck(ArmEmitterContext context, Operand n, Operand m) + { + // C = (Rn == Rm && CIn) || Rn > Rm + Operand cIn = GetFlag(PState.CFlag); + + Operand cOut = context.BitwiseAnd(context.ICompareEqual(n, m), cIn); + + cOut = context.BitwiseOr(cOut, context.ICompareGreaterUI(n, m)); + + SetFlag(context, PState.CFlag, cOut); + } + + public static void EmitSubsCCheck(ArmEmitterContext context, Operand n, Operand m) + { + // C = Rn >= Rm + SetFlag(context, PState.CFlag, context.ICompareGreaterOrEqualUI(n, m)); + } + + public static void EmitSubsVCheck(ArmEmitterContext context, Operand n, Operand m, Operand d) + { + // V = (Rd ^ Rn) & (Rn ^ Rm) < 0 + 
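+ // For a subtraction, signed overflow can only occur when the operands have different signs (Rn ^ Rm negative) and the result's sign differs from Rn's (Rd ^ Rn negative), hence the AND of the two terms.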
Operand vOut = context.BitwiseExclusiveOr(d, n); + + vOut = context.BitwiseAnd(vOut, context.BitwiseExclusiveOr(n, m)); + + vOut = context.ICompareLess(vOut, Const(vOut.Type, 0)); + + SetFlag(context, PState.VFlag, vOut); + } + + public static Operand EmitReverseBits32Op(ArmEmitterContext context, Operand op) + { + Debug.Assert(op.Type == OperandType.I32); + + Operand val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op, Const(0xaaaaaaaau)), Const(1)), + context.ShiftLeft(context.BitwiseAnd(op, Const(0x55555555u)), Const(1))); + + val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xccccccccu)), Const(2)), + context.ShiftLeft(context.BitwiseAnd(val, Const(0x33333333u)), Const(2))); + val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xf0f0f0f0u)), Const(4)), + context.ShiftLeft(context.BitwiseAnd(val, Const(0x0f0f0f0fu)), Const(4))); + val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xff00ff00u)), Const(8)), + context.ShiftLeft(context.BitwiseAnd(val, Const(0x00ff00ffu)), Const(8))); + + return context.BitwiseOr(context.ShiftRightUI(val, Const(16)), context.ShiftLeft(val, Const(16))); + } + + public static Operand EmitReverseBytes16_64Op(ArmEmitterContext context, Operand op) + { + Debug.Assert(op.Type == OperandType.I64); + + return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op, Const(0xff00ff00ff00ff00ul)), Const(8)), + context.ShiftLeft(context.BitwiseAnd(op, Const(0x00ff00ff00ff00fful)), Const(8))); + } + + public static Operand EmitReverseBytes16_32Op(ArmEmitterContext context, Operand op) + { + Debug.Assert(op.Type == OperandType.I32); + + Operand val = EmitReverseBytes16_64Op(context, context.ZeroExtend32(OperandType.I64, op)); + + return context.ConvertI64ToI32(val); + } + + private static void EmitAluWritePc(ArmEmitterContext context, Operand value) + { + Debug.Assert(value.Type == OperandType.I32); + + if (((OpCode32)context.CurrOp).IsThumb) + { + bool isReturn = IsA32Return(context); + if (!isReturn) + { + context.StoreToContext(); + } + + InstEmitFlowHelper.EmitVirtualJump(context, value, isReturn); + } + else + { + EmitBxWritePc(context, value); + } + } + + public static void EmitGenericAluStoreA32(ArmEmitterContext context, int rd, bool setFlags, Operand value) + { + Debug.Assert(value.Type == OperandType.I32); + + if (rd == RegisterAlias.Aarch32Pc && setFlags) + { + if (setFlags) + { + // TODO: Load SPSR etc. + + EmitBxWritePc(context, value); + } + else + { + EmitAluWritePc(context, value); + } + } + else + { + SetIntA32(context, rd, value); + } + } + + public static Operand GetAluN(ArmEmitterContext context) + { + if (context.CurrOp is IOpCodeAlu op) + { + if (op.DataOp == DataOp.Logical || op is IOpCodeAluRs) + { + return GetIntOrZR(context, op.Rn); + } + else + { + return GetIntOrSP(context, op.Rn); + } + } + else if (context.CurrOp is IOpCode32Alu op32) + { + return GetIntA32(context, op32.Rn); + } + else + { + throw InvalidOpCodeType(context.CurrOp); + } + } + + public static Operand GetAluM(ArmEmitterContext context, bool setCarry = true) + { + switch (context.CurrOp) + { + // ARM32. 
+ case IOpCode32AluImm op: + { + if (ShouldSetFlags(context) && op.IsRotated && setCarry) + { + SetFlag(context, PState.CFlag, Const((uint)op.Immediate >> 31)); + } + + return Const(op.Immediate); + } + + case IOpCode32AluImm16 op: return Const(op.Immediate); + + case IOpCode32AluRsImm op: return GetMShiftedByImmediate(context, op, setCarry); + case IOpCode32AluRsReg op: return GetMShiftedByReg(context, op, setCarry); + + case IOpCode32AluReg op: return GetIntA32(context, op.Rm); + + // ARM64. + case IOpCodeAluImm op: + { + if (op.GetOperandType() == OperandType.I32) + { + return Const((int)op.Immediate); + } + else + { + return Const(op.Immediate); + } + } + + case IOpCodeAluRs op: + { + Operand value = GetIntOrZR(context, op.Rm); + + switch (op.ShiftType) + { + case ShiftType.Lsl: value = context.ShiftLeft (value, Const(op.Shift)); break; + case ShiftType.Lsr: value = context.ShiftRightUI(value, Const(op.Shift)); break; + case ShiftType.Asr: value = context.ShiftRightSI(value, Const(op.Shift)); break; + case ShiftType.Ror: value = context.RotateRight (value, Const(op.Shift)); break; + } + + return value; + } + + case IOpCodeAluRx op: + { + Operand value = GetExtendedM(context, op.Rm, op.IntType); + + value = context.ShiftLeft(value, Const(op.Shift)); + + return value; + } + + default: throw InvalidOpCodeType(context.CurrOp); + } + } + + private static Exception InvalidOpCodeType(OpCode opCode) + { + return new InvalidOperationException($"Invalid OpCode type \"{opCode?.GetType().Name ?? "null"}\"."); + } + + // ARM32 helpers. + public static Operand GetMShiftedByImmediate(ArmEmitterContext context, IOpCode32AluRsImm op, bool setCarry) + { + Operand m = GetIntA32(context, op.Rm); + + int shift = op.Immediate; + + if (shift == 0) + { + switch (op.ShiftType) + { + case ShiftType.Lsr: shift = 32; break; + case ShiftType.Asr: shift = 32; break; + case ShiftType.Ror: shift = 1; break; + } + } + + if (shift != 0) + { + setCarry &= ShouldSetFlags(context); + + switch (op.ShiftType) + { + case ShiftType.Lsl: m = GetLslC(context, m, setCarry, shift); break; + case ShiftType.Lsr: m = GetLsrC(context, m, setCarry, shift); break; + case ShiftType.Asr: m = GetAsrC(context, m, setCarry, shift); break; + case ShiftType.Ror: + if (op.Immediate != 0) + { + m = GetRorC(context, m, setCarry, shift); + } + else + { + m = GetRrxC(context, m, setCarry); + } + break; + } + } + + return m; + } + + public static int DecodeImmShift(ShiftType shiftType, int shift) + { + if (shift == 0) + { + switch (shiftType) + { + case ShiftType.Lsr: shift = 32; break; + case ShiftType.Asr: shift = 32; break; + case ShiftType.Ror: shift = 1; break; + } + } + + return shift; + } + + public static Operand GetMShiftedByReg(ArmEmitterContext context, IOpCode32AluRsReg op, bool setCarry) + { + Operand m = GetIntA32(context, op.Rm); + Operand s = context.ZeroExtend8(OperandType.I32, GetIntA32(context, op.Rs)); + Operand shiftIsZero = context.ICompareEqual(s, Const(0)); + + Operand zeroResult = m; + Operand shiftResult = m; + + setCarry &= ShouldSetFlags(context); + + switch (op.ShiftType) + { + case ShiftType.Lsl: shiftResult = EmitLslC(context, m, setCarry, s, shiftIsZero); break; + case ShiftType.Lsr: shiftResult = EmitLsrC(context, m, setCarry, s, shiftIsZero); break; + case ShiftType.Asr: shiftResult = EmitAsrC(context, m, setCarry, s, shiftIsZero); break; + case ShiftType.Ror: shiftResult = EmitRorC(context, m, setCarry, s, shiftIsZero); break; + } + + return context.ConditionalSelect(shiftIsZero, zeroResult, shiftResult); + } + + 
public static void EmitIfHelper(ArmEmitterContext context, Operand boolValue, Action action, bool expected = true) + { + Debug.Assert(boolValue.Type == OperandType.I32); + + Operand endLabel = Label(); + + if (expected) + { + context.BranchIfFalse(endLabel, boolValue); + } + else + { + context.BranchIfTrue(endLabel, boolValue); + } + + action(); + + context.MarkLabel(endLabel); + } + + public static Operand EmitLslC(ArmEmitterContext context, Operand m, bool setCarry, Operand shift, Operand shiftIsZero) + { + Debug.Assert(m.Type == OperandType.I32 && shift.Type == OperandType.I32 && shiftIsZero.Type == OperandType.I32); + + Operand shiftLarge = context.ICompareGreaterOrEqual(shift, Const(32)); + Operand result = context.ShiftLeft(m, shift); + if (setCarry) + { + EmitIfHelper(context, shiftIsZero, () => + { + Operand cOut = context.ShiftRightUI(m, context.Subtract(Const(32), shift)); + + cOut = context.BitwiseAnd(cOut, Const(1)); + cOut = context.ConditionalSelect(context.ICompareGreater(shift, Const(32)), Const(0), cOut); + + SetFlag(context, PState.CFlag, cOut); + }, false); + } + + return context.ConditionalSelect(shiftLarge, Const(0), result); + } + + public static Operand GetLslC(ArmEmitterContext context, Operand m, bool setCarry, int shift) + { + Debug.Assert(m.Type == OperandType.I32); + + if ((uint)shift > 32) + { + return GetShiftByMoreThan32(context, setCarry); + } + else if (shift == 32) + { + if (setCarry) + { + SetCarryMLsb(context, m); + } + + return Const(0); + } + else + { + if (setCarry) + { + Operand cOut = context.ShiftRightUI(m, Const(32 - shift)); + + cOut = context.BitwiseAnd(cOut, Const(1)); + + SetFlag(context, PState.CFlag, cOut); + } + + return context.ShiftLeft(m, Const(shift)); + } + } + + public static Operand EmitLsrC(ArmEmitterContext context, Operand m, bool setCarry, Operand shift, Operand shiftIsZero) + { + Debug.Assert(m.Type == OperandType.I32 && shift.Type == OperandType.I32 && shiftIsZero.Type == OperandType.I32); + + Operand shiftLarge = context.ICompareGreaterOrEqual(shift, Const(32)); + Operand result = context.ShiftRightUI(m, shift); + if (setCarry) + { + EmitIfHelper(context, shiftIsZero, () => + { + Operand cOut = context.ShiftRightUI(m, context.Subtract(shift, Const(1))); + + cOut = context.BitwiseAnd(cOut, Const(1)); + cOut = context.ConditionalSelect(context.ICompareGreater(shift, Const(32)), Const(0), cOut); + + SetFlag(context, PState.CFlag, cOut); + }, false); + } + + return context.ConditionalSelect(shiftLarge, Const(0), result); + } + + public static Operand GetLsrC(ArmEmitterContext context, Operand m, bool setCarry, int shift) + { + Debug.Assert(m.Type == OperandType.I32); + + if ((uint)shift > 32) + { + return GetShiftByMoreThan32(context, setCarry); + } + else if (shift == 32) + { + if (setCarry) + { + SetCarryMMsb(context, m); + } + + return Const(0); + } + else + { + if (setCarry) + { + SetCarryMShrOut(context, m, shift); + } + + return context.ShiftRightUI(m, Const(shift)); + } + } + + private static Operand GetShiftByMoreThan32(ArmEmitterContext context, bool setCarry) + { + if (setCarry) + { + SetFlag(context, PState.CFlag, Const(0)); + } + + return Const(0); + } + + public static Operand EmitAsrC(ArmEmitterContext context, Operand m, bool setCarry, Operand shift, Operand shiftIsZero) + { + Debug.Assert(m.Type == OperandType.I32 && shift.Type == OperandType.I32 && shiftIsZero.Type == OperandType.I32); + + Operand l32Result; + Operand ge32Result; + + Operand less32 = context.ICompareLess(shift, Const(32)); + + ge32Result = 
context.ShiftRightSI(m, Const(31)); + + if (setCarry) + { + EmitIfHelper(context, context.BitwiseOr(less32, shiftIsZero), () => + { + SetCarryMLsb(context, ge32Result); + }, false); + } + + l32Result = context.ShiftRightSI(m, shift); + if (setCarry) + { + EmitIfHelper(context, context.BitwiseAnd(less32, context.BitwiseNot(shiftIsZero)), () => + { + Operand cOut = context.ShiftRightUI(m, context.Subtract(shift, Const(1))); + + cOut = context.BitwiseAnd(cOut, Const(1)); + + SetFlag(context, PState.CFlag, cOut); + }); + } + + return context.ConditionalSelect(less32, l32Result, ge32Result); + } + + public static Operand GetAsrC(ArmEmitterContext context, Operand m, bool setCarry, int shift) + { + Debug.Assert(m.Type == OperandType.I32); + + if ((uint)shift >= 32) + { + m = context.ShiftRightSI(m, Const(31)); + + if (setCarry) + { + SetCarryMLsb(context, m); + } + + return m; + } + else + { + if (setCarry) + { + SetCarryMShrOut(context, m, shift); + } + + return context.ShiftRightSI(m, Const(shift)); + } + } + + public static Operand EmitRorC(ArmEmitterContext context, Operand m, bool setCarry, Operand shift, Operand shiftIsZero) + { + Debug.Assert(m.Type == OperandType.I32 && shift.Type == OperandType.I32 && shiftIsZero.Type == OperandType.I32); + + shift = context.BitwiseAnd(shift, Const(0x1f)); + m = context.RotateRight(m, shift); + + if (setCarry) + { + EmitIfHelper(context, shiftIsZero, () => + { + SetCarryMMsb(context, m); + }, false); + } + + return m; + } + + public static Operand GetRorC(ArmEmitterContext context, Operand m, bool setCarry, int shift) + { + Debug.Assert(m.Type == OperandType.I32); + + shift &= 0x1f; + + m = context.RotateRight(m, Const(shift)); + + if (setCarry) + { + SetCarryMMsb(context, m); + } + + return m; + } + + public static Operand GetRrxC(ArmEmitterContext context, Operand m, bool setCarry) + { + Debug.Assert(m.Type == OperandType.I32); + + // Rotate right by 1 with carry. 
+ Operand cIn = context.Copy(GetFlag(PState.CFlag)); + + if (setCarry) + { + SetCarryMLsb(context, m); + } + + m = context.ShiftRightUI(m, Const(1)); + + m = context.BitwiseOr(m, context.ShiftLeft(cIn, Const(31))); + + return m; + } + + private static void SetCarryMLsb(ArmEmitterContext context, Operand m) + { + Debug.Assert(m.Type == OperandType.I32); + + SetFlag(context, PState.CFlag, context.BitwiseAnd(m, Const(1))); + } + + private static void SetCarryMMsb(ArmEmitterContext context, Operand m) + { + Debug.Assert(m.Type == OperandType.I32); + + SetFlag(context, PState.CFlag, context.ShiftRightUI(m, Const(31))); + } + + private static void SetCarryMShrOut(ArmEmitterContext context, Operand m, int shift) + { + Debug.Assert(m.Type == OperandType.I32); + + Operand cOut = context.ShiftRightUI(m, Const(shift - 1)); + + cOut = context.BitwiseAnd(cOut, Const(1)); + + SetFlag(context, PState.CFlag, cOut); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitBfm.cs b/src/ARMeilleure/Instructions/InstEmitBfm.cs new file mode 100644 index 00000000..46a7dddd --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitBfm.cs @@ -0,0 +1,196 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Bfm(ArmEmitterContext context) + { + OpCodeBfm op = (OpCodeBfm)context.CurrOp; + + Operand d = GetIntOrZR(context, op.Rd); + Operand n = GetIntOrZR(context, op.Rn); + + Operand res; + + if (op.Pos < op.Shift) + { + // BFI. + int shift = op.GetBitsCount() - op.Shift; + + int width = op.Pos + 1; + + long mask = (long)(ulong.MaxValue >> (64 - width)); + + res = context.ShiftLeft(context.BitwiseAnd(n, Const(n.Type, mask)), Const(shift)); + + res = context.BitwiseOr(res, context.BitwiseAnd(d, Const(d.Type, ~(mask << shift)))); + } + else + { + // BFXIL. 
+ int shift = op.Shift; + + int width = op.Pos - shift + 1; + + long mask = (long)(ulong.MaxValue >> (64 - width)); + + res = context.BitwiseAnd(context.ShiftRightUI(n, Const(shift)), Const(n.Type, mask)); + + res = context.BitwiseOr(res, context.BitwiseAnd(d, Const(d.Type, ~mask))); + } + + SetIntOrZR(context, op.Rd, res); + } + + public static void Sbfm(ArmEmitterContext context) + { + OpCodeBfm op = (OpCodeBfm)context.CurrOp; + + int bitsCount = op.GetBitsCount(); + + if (op.Pos + 1 == bitsCount) + { + EmitSbfmShift(context); + } + else if (op.Pos < op.Shift) + { + EmitSbfiz(context); + } + else if (op.Pos == 7 && op.Shift == 0) + { + Operand n = GetIntOrZR(context, op.Rn); + + SetIntOrZR(context, op.Rd, context.SignExtend8(n.Type, n)); + } + else if (op.Pos == 15 && op.Shift == 0) + { + Operand n = GetIntOrZR(context, op.Rn); + + SetIntOrZR(context, op.Rd, context.SignExtend16(n.Type, n)); + } + else if (op.Pos == 31 && op.Shift == 0) + { + Operand n = GetIntOrZR(context, op.Rn); + + SetIntOrZR(context, op.Rd, context.SignExtend32(n.Type, n)); + } + else + { + Operand res = GetIntOrZR(context, op.Rn); + + res = context.ShiftLeft (res, Const(bitsCount - 1 - op.Pos)); + res = context.ShiftRightSI(res, Const(bitsCount - 1)); + res = context.BitwiseAnd (res, Const(res.Type, ~op.TMask)); + + Operand n2 = GetBfmN(context); + + SetIntOrZR(context, op.Rd, context.BitwiseOr(res, n2)); + } + } + + public static void Ubfm(ArmEmitterContext context) + { + OpCodeBfm op = (OpCodeBfm)context.CurrOp; + + if (op.Pos + 1 == op.GetBitsCount()) + { + EmitUbfmShift(context); + } + else if (op.Pos < op.Shift) + { + EmitUbfiz(context); + } + else if (op.Pos + 1 == op.Shift) + { + EmitBfmLsl(context); + } + else if (op.Pos == 7 && op.Shift == 0) + { + Operand n = GetIntOrZR(context, op.Rn); + + SetIntOrZR(context, op.Rd, context.BitwiseAnd(n, Const(n.Type, 0xff))); + } + else if (op.Pos == 15 && op.Shift == 0) + { + Operand n = GetIntOrZR(context, op.Rn); + + SetIntOrZR(context, op.Rd, context.BitwiseAnd(n, Const(n.Type, 0xffff))); + } + else + { + SetIntOrZR(context, op.Rd, GetBfmN(context)); + } + } + + private static void EmitSbfiz(ArmEmitterContext context) => EmitBfiz(context, signed: true); + private static void EmitUbfiz(ArmEmitterContext context) => EmitBfiz(context, signed: false); + + private static void EmitBfiz(ArmEmitterContext context, bool signed) + { + OpCodeBfm op = (OpCodeBfm)context.CurrOp; + + int width = op.Pos + 1; + + Operand res = GetIntOrZR(context, op.Rn); + + res = context.ShiftLeft(res, Const(op.GetBitsCount() - width)); + + res = signed + ? context.ShiftRightSI(res, Const(op.Shift - width)) + : context.ShiftRightUI(res, Const(op.Shift - width)); + + SetIntOrZR(context, op.Rd, res); + } + + private static void EmitSbfmShift(ArmEmitterContext context) + { + EmitBfmShift(context, signed: true); + } + + private static void EmitUbfmShift(ArmEmitterContext context) + { + EmitBfmShift(context, signed: false); + } + + private static void EmitBfmShift(ArmEmitterContext context, bool signed) + { + OpCodeBfm op = (OpCodeBfm)context.CurrOp; + + Operand res = GetIntOrZR(context, op.Rn); + + res = signed + ? 
context.ShiftRightSI(res, Const(op.Shift)) + : context.ShiftRightUI(res, Const(op.Shift)); + + SetIntOrZR(context, op.Rd, res); + } + + private static void EmitBfmLsl(ArmEmitterContext context) + { + OpCodeBfm op = (OpCodeBfm)context.CurrOp; + + Operand res = GetIntOrZR(context, op.Rn); + + int shift = op.GetBitsCount() - op.Shift; + + SetIntOrZR(context, op.Rd, context.ShiftLeft(res, Const(shift))); + } + + private static Operand GetBfmN(ArmEmitterContext context) + { + OpCodeBfm op = (OpCodeBfm)context.CurrOp; + + Operand res = GetIntOrZR(context, op.Rn); + + long mask = op.WMask & op.TMask; + + return context.BitwiseAnd(context.RotateRight(res, Const(op.Shift)), Const(res.Type, mask)); + } + } +}
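The two guarded paths in Bfm above are the BFI and BFXIL aliases: BFI inserts the low bits of Rn into Rd at a given position, while BFXIL extracts a field of Rn into the low bits of Rd. A minimal standalone sketch of those compositions on plain 64-bit integers, not part of this diff (shift and width here are the already-decoded insertion position and field width, not the raw immr/imms fields):

using System;

static class BfmExample
{
    // BFI: insert the low `width` bits of n into d at bit position `shift`.
    static ulong Bfi(ulong d, ulong n, int shift, int width)
    {
        ulong mask = ulong.MaxValue >> (64 - width);
        return ((n & mask) << shift) | (d & ~(mask << shift));
    }

    // BFXIL: extract `width` bits of n starting at `shift` into the low bits of d.
    static ulong Bfxil(ulong d, ulong n, int shift, int width)
    {
        ulong mask = ulong.MaxValue >> (64 - width);
        return ((n >> shift) & mask) | (d & ~mask);
    }

    static void Main()
    {
        Console.WriteLine($"{Bfi(0xFFFF_FFFF_FFFF_FFFF, 0xAB, 8, 8):x}");     // ffffffffffffabff
        Console.WriteLine($"{Bfxil(0xFFFF_FFFF_FFFF_FFFF, 0xAB00, 8, 8):x}"); // ffffffffffffffab
    }
}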
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitCcmp.cs b/src/ARMeilleure/Instructions/InstEmitCcmp.cs new file mode 100644 index 00000000..7f0beb6c --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitCcmp.cs @@ -0,0 +1,61 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitAluHelper; +using static ARMeilleure.Instructions.InstEmitFlowHelper; +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Ccmn(ArmEmitterContext context) => EmitCcmp(context, isNegated: true); + public static void Ccmp(ArmEmitterContext context) => EmitCcmp(context, isNegated: false); + + private static void EmitCcmp(ArmEmitterContext context, bool isNegated) + { + OpCodeCcmp op = (OpCodeCcmp)context.CurrOp; + + Operand lblTrue = Label(); + Operand lblEnd = Label(); + + EmitCondBranch(context, lblTrue, op.Cond); + + SetFlag(context, PState.VFlag, Const((op.Nzcv >> 0) & 1)); + SetFlag(context, PState.CFlag, Const((op.Nzcv >> 1) & 1)); + SetFlag(context, PState.ZFlag, Const((op.Nzcv >> 2) & 1)); + SetFlag(context, PState.NFlag, Const((op.Nzcv >> 3) & 1)); + + context.Branch(lblEnd); + + context.MarkLabel(lblTrue); + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + + if (isNegated) + { + Operand d = context.Add(n, m); + + EmitNZFlagsCheck(context, d); + + EmitAddsCCheck(context, n, d); + EmitAddsVCheck(context, n, m, d); + } + else + { + Operand d = context.Subtract(n, m); + + EmitNZFlagsCheck(context, d); + + EmitSubsCCheck(context, n, m); + EmitSubsVCheck(context, n, m, d); + } + + context.MarkLabel(lblEnd); + } + } +}
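Ccmp/Ccmn above either perform the real compare (when the condition passes) or load the flags straight from the NZCV immediate, whose bits are unpacked as N = bit 3, Z = bit 2, C = bit 1, V = bit 0. A minimal standalone sketch of that behaviour for a 32-bit CCMP on plain integers, not part of this diff:

using System;

static class CcmpExample
{
    static (int n, int z, int c, int v) Ccmp(uint rn, uint rm, int nzcv, bool condHolds)
    {
        if (!condHolds)
        {
            // Condition failed: flags come directly from the immediate.
            return ((nzcv >> 3) & 1, (nzcv >> 2) & 1, (nzcv >> 1) & 1, nzcv & 1);
        }

        uint d = rn - rm;
        int n = (int)(d >> 31);
        int z = d == 0 ? 1 : 0;
        int c = rn >= rm ? 1 : 0;                           // carry = no borrow
        int v = (int)(((rn ^ rm) & (rn ^ d)) >> 31);        // signed overflow of the subtraction
        return (n, z, c, v);
    }

    static void Main()
    {
        Console.WriteLine(Ccmp(1, 2, 0b0100, condHolds: false)); // (0, 1, 0, 0) from the immediate
        Console.WriteLine(Ccmp(1, 2, 0b0100, condHolds: true));  // (1, 0, 0, 0) from the compare
    }
}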
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitCsel.cs b/src/ARMeilleure/Instructions/InstEmitCsel.cs new file mode 100644 index 00000000..926b9a9e --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitCsel.cs @@ -0,0 +1,53 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitFlowHelper; +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + private enum CselOperation + { + None, + Increment, + Invert, + Negate + } + + public static void Csel(ArmEmitterContext context) => EmitCsel(context, CselOperation.None); + public static void Csinc(ArmEmitterContext context) => EmitCsel(context, CselOperation.Increment); + public static void Csinv(ArmEmitterContext context) => EmitCsel(context, CselOperation.Invert); + public static void Csneg(ArmEmitterContext context) => EmitCsel(context, CselOperation.Negate); + + private static void EmitCsel(ArmEmitterContext context, CselOperation cselOp) + { + OpCodeCsel op = (OpCodeCsel)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + Operand m = GetIntOrZR(context, op.Rm); + + if (cselOp == CselOperation.Increment) + { + m = context.Add(m, Const(m.Type, 1)); + } + else if (cselOp == CselOperation.Invert) + { + m = context.BitwiseNot(m); + } + else if (cselOp == CselOperation.Negate) + { + m = context.Negate(m); + } + + Operand condTrue = GetCondTrue(context, op.Cond); + + Operand d = context.ConditionalSelect(condTrue, n, m); + + SetIntOrZR(context, op.Rd, d); + } + } +}
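The four conditional-select emitters above share a single select; they differ only in the transform applied to the second source when the condition fails. A minimal standalone sketch, not part of this diff:

using System;

static class CselExample
{
    static long Csel (bool cond, long n, long m) => cond ? n : m;
    static long Csinc(bool cond, long n, long m) => cond ? n : m + 1;
    static long Csinv(bool cond, long n, long m) => cond ? n : ~m;
    static long Csneg(bool cond, long n, long m) => cond ? n : -m;

    static void Main()
    {
        // With both sources zero, CSINC yields 0 or 1 depending on the condition;
        // that is how the CSET alias is built (with the condition inverted).
        Console.WriteLine(Csinc(false, 0, 0)); // 1
        Console.WriteLine(Csneg(false, 0, 5)); // -5
    }
}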
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitDiv.cs b/src/ARMeilleure/Instructions/InstEmitDiv.cs new file mode 100644 index 00000000..39a5c32e --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitDiv.cs @@ -0,0 +1,67 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Sdiv(ArmEmitterContext context) => EmitDiv(context, unsigned: false); + public static void Udiv(ArmEmitterContext context) => EmitDiv(context, unsigned: true); + + private static void EmitDiv(ArmEmitterContext context, bool unsigned) + { + OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; + + // If Rm == 0, Rd = 0 (division by zero). + Operand n = GetIntOrZR(context, op.Rn); + Operand m = GetIntOrZR(context, op.Rm); + + Operand divisorIsZero = context.ICompareEqual(m, Const(m.Type, 0)); + + Operand lblBadDiv = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblBadDiv, divisorIsZero); + + if (!unsigned) + { + // If Rn == INT_MIN && Rm == -1, Rd = INT_MIN (overflow). + bool is32Bits = op.RegisterSize == RegisterSize.Int32; + + Operand intMin = is32Bits ? Const(int.MinValue) : Const(long.MinValue); + Operand minus1 = is32Bits ? Const(-1) : Const(-1L); + + Operand nIsIntMin = context.ICompareEqual(n, intMin); + Operand mIsMinus1 = context.ICompareEqual(m, minus1); + + Operand lblGoodDiv = Label(); + + context.BranchIfFalse(lblGoodDiv, context.BitwiseAnd(nIsIntMin, mIsMinus1)); + + SetAluDOrZR(context, intMin); + + context.Branch(lblEnd); + + context.MarkLabel(lblGoodDiv); + } + + Operand d = unsigned + ? 
context.DivideUI(n, m) + : context.Divide (n, m); + + SetAluDOrZR(context, d); + + context.Branch(lblEnd); + + context.MarkLabel(lblBadDiv); + + SetAluDOrZR(context, Const(op.GetOperandType(), 0)); + + context.MarkLabel(lblEnd); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitException.cs b/src/ARMeilleure/Instructions/InstEmitException.cs new file mode 100644 index 00000000..0baaa87d --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitException.cs @@ -0,0 +1,55 @@ +using ARMeilleure.Decoders; +using ARMeilleure.Translation; + +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Brk(ArmEmitterContext context) + { + OpCodeException op = (OpCodeException)context.CurrOp; + + string name = nameof(NativeInterface.Break); + + context.StoreToContext(); + + context.Call(typeof(NativeInterface).GetMethod(name), Const(op.Address), Const(op.Id)); + + context.LoadFromContext(); + + context.Return(Const(op.Address)); + } + + public static void Svc(ArmEmitterContext context) + { + OpCodeException op = (OpCodeException)context.CurrOp; + + string name = nameof(NativeInterface.SupervisorCall); + + context.StoreToContext(); + + context.Call(typeof(NativeInterface).GetMethod(name), Const(op.Address), Const(op.Id)); + + context.LoadFromContext(); + + Translator.EmitSynchronization(context); + } + + public static void Und(ArmEmitterContext context) + { + OpCode op = context.CurrOp; + + string name = nameof(NativeInterface.Undefined); + + context.StoreToContext(); + + context.Call(typeof(NativeInterface).GetMethod(name), Const(op.Address), Const(op.RawOpCode)); + + context.LoadFromContext(); + + context.Return(Const(op.Address)); + } + } +}
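Back in InstEmitDiv.cs above, the two guarded paths exist because AArch64 integer division never traps: dividing by zero yields zero, and the single signed overflow case (INT_MIN / -1) yields INT_MIN, whereas the host divide instruction would fault on both. A minimal standalone sketch of the guest semantics for the 32-bit signed case, not part of this diff:

using System;

static class SdivExample
{
    static int Sdiv32(int n, int m)
    {
        if (m == 0) return 0;                        // division by zero -> 0
        if (n == int.MinValue && m == -1) return n;  // overflow -> INT_MIN
        return n / m;                                // otherwise truncate toward zero
    }

    static void Main()
    {
        Console.WriteLine(Sdiv32(7, 0));             // 0
        Console.WriteLine(Sdiv32(int.MinValue, -1)); // -2147483648
        Console.WriteLine(Sdiv32(7, -2));            // -3
    }
}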
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitException32.cs b/src/ARMeilleure/Instructions/InstEmitException32.cs new file mode 100644 index 00000000..ec0c32bf --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitException32.cs @@ -0,0 +1,39 @@ +using ARMeilleure.Decoders; +using ARMeilleure.Translation; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Svc(ArmEmitterContext context) + { + IOpCode32Exception op = (IOpCode32Exception)context.CurrOp; + + string name = nameof(NativeInterface.SupervisorCall); + + context.StoreToContext(); + + context.Call(typeof(NativeInterface).GetMethod(name), Const(((IOpCode)op).Address), Const(op.Id)); + + context.LoadFromContext(); + + Translator.EmitSynchronization(context); + } + + public static void Trap(ArmEmitterContext context) + { + IOpCode32Exception op = (IOpCode32Exception)context.CurrOp; + + string name = nameof(NativeInterface.Break); + + context.StoreToContext(); + + context.Call(typeof(NativeInterface).GetMethod(name), Const(((IOpCode)op).Address), Const(op.Id)); + + context.LoadFromContext(); + + context.Return(Const(context.CurrOp.Address)); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitFlow.cs b/src/ARMeilleure/Instructions/InstEmitFlow.cs new file mode 100644 index 00000000..c40eb55c --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitFlow.cs @@ -0,0 +1,107 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitFlowHelper; +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void B(ArmEmitterContext context) + { + OpCodeBImmAl op = (OpCodeBImmAl)context.CurrOp; + + context.Branch(context.GetLabel((ulong)op.Immediate)); + } + + public static void B_Cond(ArmEmitterContext context) + { + OpCodeBImmCond op = (OpCodeBImmCond)context.CurrOp; + + EmitBranch(context, op.Cond); + } + + public static void Bl(ArmEmitterContext context) + { + OpCodeBImmAl op = (OpCodeBImmAl)context.CurrOp; + + context.Copy(GetIntOrZR(context, RegisterAlias.Lr), Const(op.Address + 4)); + + EmitCall(context, (ulong)op.Immediate); + } + + public static void Blr(ArmEmitterContext context) + { + OpCodeBReg op = (OpCodeBReg)context.CurrOp; + + Operand n = context.Copy(GetIntOrZR(context, op.Rn)); + + context.Copy(GetIntOrZR(context, RegisterAlias.Lr), Const(op.Address + 4)); + + EmitVirtualCall(context, n); + } + + public static void Br(ArmEmitterContext context) + { + OpCodeBReg op = (OpCodeBReg)context.CurrOp; + + EmitVirtualJump(context, GetIntOrZR(context, op.Rn), op.Rn == RegisterAlias.Lr); + } + + public static void Cbnz(ArmEmitterContext context) => EmitCb(context, onNotZero: true); + public static void Cbz(ArmEmitterContext context) => EmitCb(context, onNotZero: false); + + private static void EmitCb(ArmEmitterContext context, bool onNotZero) + { + OpCodeBImmCmp op = (OpCodeBImmCmp)context.CurrOp; + + EmitBranch(context, GetIntOrZR(context, op.Rt), onNotZero); + } + + public static void Ret(ArmEmitterContext context) + { + OpCodeBReg op = (OpCodeBReg)context.CurrOp; + + context.Return(GetIntOrZR(context, op.Rn)); + } + + public static void Tbnz(ArmEmitterContext context) => EmitTb(context, onNotZero: 
true); + public static void Tbz(ArmEmitterContext context) => EmitTb(context, onNotZero: false); + + private static void EmitTb(ArmEmitterContext context, bool onNotZero) + { + OpCodeBImmTest op = (OpCodeBImmTest)context.CurrOp; + + Operand value = context.BitwiseAnd(GetIntOrZR(context, op.Rt), Const(1L << op.Bit)); + + EmitBranch(context, value, onNotZero); + } + + private static void EmitBranch(ArmEmitterContext context, Condition cond) + { + OpCodeBImm op = (OpCodeBImm)context.CurrOp; + + EmitCondBranch(context, context.GetLabel((ulong)op.Immediate), cond); + } + + private static void EmitBranch(ArmEmitterContext context, Operand value, bool onNotZero) + { + OpCodeBImm op = (OpCodeBImm)context.CurrOp; + + Operand lblTarget = context.GetLabel((ulong)op.Immediate); + + if (onNotZero) + { + context.BranchIfTrue(lblTarget, value); + } + else + { + context.BranchIfFalse(lblTarget, value); + } + } + } +}
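The compare-and-branch and test-and-branch emitters above reduce to two predicates: CBZ/CBNZ test the whole register against zero, while TBZ/TBNZ isolate a single bit with a mask of 1 << bit. A minimal standalone sketch, not part of this diff:

using System;

static class CompareBranchExample
{
    static bool CbnzTaken(long rt) => rt != 0;
    static bool TbnzTaken(long rt, int bit) => (rt & (1L << bit)) != 0;

    static void Main()
    {
        Console.WriteLine(CbnzTaken(0));       // False
        Console.WriteLine(TbnzTaken(0x10, 4)); // True
    }
}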
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitFlow32.cs b/src/ARMeilleure/Instructions/InstEmitFlow32.cs new file mode 100644 index 00000000..3a7707ee --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitFlow32.cs @@ -0,0 +1,136 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitFlowHelper; +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void B(ArmEmitterContext context) + { + IOpCode32BImm op = (IOpCode32BImm)context.CurrOp; + + context.Branch(context.GetLabel((ulong)op.Immediate)); + } + + public static void Bl(ArmEmitterContext context) + { + Blx(context, x: false); + } + + public static void Blx(ArmEmitterContext context) + { + Blx(context, x: true); + } + + private static void Blx(ArmEmitterContext context, bool x) + { + IOpCode32BImm op = (IOpCode32BImm)context.CurrOp; + + uint pc = op.GetPc(); + + bool isThumb = ((OpCode32)context.CurrOp).IsThumb; + + uint currentPc = isThumb + ? pc | 1 + : pc - 4; + + SetIntA32(context, GetBankedRegisterAlias(context.Mode, RegisterAlias.Aarch32Lr), Const(currentPc)); + + // If x is true, then this is a branch with link and exchange. + // In this case we need to swap the mode between Arm <-> Thumb. + if (x) + { + SetFlag(context, PState.TFlag, Const(isThumb ? 0 : 1)); + } + + EmitCall(context, (ulong)op.Immediate); + } + + public static void Blxr(ArmEmitterContext context) + { + IOpCode32BReg op = (IOpCode32BReg)context.CurrOp; + + uint pc = op.GetPc(); + + Operand addr = context.Copy(GetIntA32(context, op.Rm)); + Operand bitOne = context.BitwiseAnd(addr, Const(1)); + + bool isThumb = ((OpCode32)context.CurrOp).IsThumb; + + uint currentPc = isThumb + ? 
(pc - 2) | 1 + : pc - 4; + + SetIntA32(context, GetBankedRegisterAlias(context.Mode, RegisterAlias.Aarch32Lr), Const(currentPc)); + + SetFlag(context, PState.TFlag, bitOne); + + EmitBxWritePc(context, addr); + } + + public static void Bx(ArmEmitterContext context) + { + IOpCode32BReg op = (IOpCode32BReg)context.CurrOp; + + EmitBxWritePc(context, GetIntA32(context, op.Rm), op.Rm); + } + + public static void Cbnz(ArmEmitterContext context) => EmitCb(context, onNotZero: true); + public static void Cbz(ArmEmitterContext context) => EmitCb(context, onNotZero: false); + + private static void EmitCb(ArmEmitterContext context, bool onNotZero) + { + OpCodeT16BImmCmp op = (OpCodeT16BImmCmp)context.CurrOp; + + Operand value = GetIntA32(context, op.Rn); + Operand lblTarget = context.GetLabel((ulong)op.Immediate); + + if (onNotZero) + { + context.BranchIfTrue(lblTarget, value); + } + else + { + context.BranchIfFalse(lblTarget, value); + } + } + + public static void It(ArmEmitterContext context) + { + OpCodeT16IfThen op = (OpCodeT16IfThen)context.CurrOp; + + context.SetIfThenBlockState(op.IfThenBlockConds); + } + + public static void Tbb(ArmEmitterContext context) => EmitTb(context, halfword: false); + public static void Tbh(ArmEmitterContext context) => EmitTb(context, halfword: true); + + private static void EmitTb(ArmEmitterContext context, bool halfword) + { + OpCodeT32Tb op = (OpCodeT32Tb)context.CurrOp; + + Operand halfwords; + + if (halfword) + { + Operand address = context.Add(GetIntA32(context, op.Rn), context.ShiftLeft(GetIntA32(context, op.Rm), Const(1))); + halfwords = InstEmitMemoryHelper.EmitReadInt(context, address, 1); + } + else + { + Operand address = context.Add(GetIntA32(context, op.Rn), GetIntA32(context, op.Rm)); + halfwords = InstEmitMemoryHelper.EmitReadIntAligned(context, address, 0); + } + + Operand targetAddress = context.Add(Const((int)op.GetPc()), context.ShiftLeft(halfwords, Const(1))); + + EmitVirtualJump(context, targetAddress, isReturn: false); + } + } +}
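In InstEmitFlow32.cs above, the link value written to LR is derived from the A32 PC read value (instruction address + 8 in Arm state, + 4 in Thumb state), and its low bit records the Thumb state so a later BX LR resumes in the right instruction set. A minimal standalone sketch of the two cases, not part of this diff:

using System;

static class BranchLinkExample
{
    // pc is the A32 "PC read" value, as returned by op.GetPc() in the code above.
    static uint LinkValueForBlImm(uint pc, bool thumb) => thumb ? pc | 1 : pc - 4;
    static uint LinkValueForBlxReg(uint pc, bool thumb) => thumb ? (pc - 2) | 1 : pc - 4;

    static void Main()
    {
        uint armLink = LinkValueForBlImm(0x1008, thumb: false);   // BL at 0x1000 in Arm state
        uint thumbLink = LinkValueForBlxReg(0x1004, thumb: true); // 16-bit BLX at 0x1000 in Thumb state
        Console.WriteLine($"{armLink:x} {thumbLink:x}");          // 1004 1003
    }
}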
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitFlowHelper.cs b/src/ARMeilleure/Instructions/InstEmitFlowHelper.cs new file mode 100644 index 00000000..6ac32908 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitFlowHelper.cs @@ -0,0 +1,240 @@ +using ARMeilleure.CodeGen.Linking; +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using ARMeilleure.Translation.PTC; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static class InstEmitFlowHelper + { + public static void EmitCondBranch(ArmEmitterContext context, Operand target, Condition cond) + { + if (cond != Condition.Al) + { + context.BranchIfTrue(target, GetCondTrue(context, cond)); + } + else + { + context.Branch(target); + } + } + + public static Operand GetCondTrue(ArmEmitterContext context, Condition condition) + { + Operand cmpResult = context.TryGetComparisonResult(condition); + + if (cmpResult != default) + { + return cmpResult; + } + + Operand value = Const(1); + + Operand Inverse(Operand val) + { + return context.BitwiseExclusiveOr(val, Const(1)); + } + + switch (condition) + { + case Condition.Eq: + value = GetFlag(PState.ZFlag); + break; + + case Condition.Ne: + value = Inverse(GetFlag(PState.ZFlag)); + break; + + case Condition.GeUn: + value = GetFlag(PState.CFlag); + break; + + case Condition.LtUn: + value = Inverse(GetFlag(PState.CFlag)); + break; + + case Condition.Mi: + value = GetFlag(PState.NFlag); + break; + + case Condition.Pl: + value = Inverse(GetFlag(PState.NFlag)); + break; + + case Condition.Vs: + value = GetFlag(PState.VFlag); + break; + + case Condition.Vc: + value = Inverse(GetFlag(PState.VFlag)); + break; + + case Condition.GtUn: + { + Operand c = GetFlag(PState.CFlag); + Operand z = GetFlag(PState.ZFlag); + + value = context.BitwiseAnd(c, Inverse(z)); + + break; + } + + case Condition.LeUn: + { + Operand c = GetFlag(PState.CFlag); + Operand z = GetFlag(PState.ZFlag); + + value = context.BitwiseOr(Inverse(c), z); + + break; + } + + case Condition.Ge: + { + Operand n = GetFlag(PState.NFlag); + Operand v = GetFlag(PState.VFlag); + + value = context.ICompareEqual(n, v); + + break; + } + + case Condition.Lt: + { + Operand n = GetFlag(PState.NFlag); + Operand v = GetFlag(PState.VFlag); + + value = context.ICompareNotEqual(n, v); + + break; + } + + case Condition.Gt: + { + Operand n = GetFlag(PState.NFlag); + Operand z = GetFlag(PState.ZFlag); + Operand v = GetFlag(PState.VFlag); + + value = context.BitwiseAnd(Inverse(z), context.ICompareEqual(n, v)); + + break; + } + + case Condition.Le: + { + Operand n = GetFlag(PState.NFlag); + Operand z = GetFlag(PState.ZFlag); + Operand v = GetFlag(PState.VFlag); + + value = context.BitwiseOr(z, context.ICompareNotEqual(n, v)); + + break; + } + } + + return value; + } + + public static void EmitCall(ArmEmitterContext context, ulong immediate) + { + bool isRecursive = immediate == context.EntryAddress; + + if (isRecursive) + { + context.Branch(context.GetLabel(immediate)); + } + else + { + EmitTableBranch(context, Const(immediate), isJump: false); + } + } + + public static void EmitVirtualCall(ArmEmitterContext context, Operand target) + { + EmitTableBranch(context, target, isJump: false); + } + + public static void EmitVirtualJump(ArmEmitterContext context, Operand target, bool isReturn) + { + if (isReturn) + { + if (target.Type == 
OperandType.I32) + { + target = context.ZeroExtend32(OperandType.I64, target); + } + + context.Return(target); + } + else + { + EmitTableBranch(context, target, isJump: true); + } + } + + private static void EmitTableBranch(ArmEmitterContext context, Operand guestAddress, bool isJump) + { + context.StoreToContext(); + + if (guestAddress.Type == OperandType.I32) + { + guestAddress = context.ZeroExtend32(OperandType.I64, guestAddress); + } + + // Store the target guest address into the native context. The stubs uses this address to dispatch into the + // next translation. + Operand nativeContext = context.LoadArgument(OperandType.I64, 0); + Operand dispAddressAddr = context.Add(nativeContext, Const((ulong)NativeContext.GetDispatchAddressOffset())); + context.Store(dispAddressAddr, guestAddress); + + Operand hostAddress; + + // If address is mapped onto the function table, we can skip the table walk. Otherwise we fallback + // onto the dispatch stub. + if (guestAddress.Kind == OperandKind.Constant && context.FunctionTable.IsValid(guestAddress.Value)) + { + Operand hostAddressAddr = !context.HasPtc ? + Const(ref context.FunctionTable.GetValue(guestAddress.Value)) : + Const(ref context.FunctionTable.GetValue(guestAddress.Value), new Symbol(SymbolType.FunctionTable, guestAddress.Value)); + + hostAddress = context.Load(OperandType.I64, hostAddressAddr); + } + else + { + hostAddress = !context.HasPtc ? + Const((long)context.Stubs.DispatchStub) : + Const((long)context.Stubs.DispatchStub, Ptc.DispatchStubSymbol); + } + + if (isJump) + { + context.Tailcall(hostAddress, nativeContext); + } + else + { + OpCode op = context.CurrOp; + + Operand returnAddress = context.Call(hostAddress, OperandType.I64, nativeContext); + + context.LoadFromContext(); + + // Note: The return value of a translated function is always an Int64 with the address execution has + // returned to. We expect this address to be immediately after the current instruction, if it isn't we + // keep returning until we reach the dispatcher. + Operand nextAddr = Const((long)op.Address + op.OpCodeSizeInBytes); + + // Try to continue within this block. + // If the return address isn't to our next instruction, we need to return so the JIT can figure out + // what to do. 
+ Operand lblContinue = context.GetLabel(nextAddr.Value); + context.BranchIf(lblContinue, returnAddress, nextAddr, Comparison.Equal, BasicBlockFrequency.Cold); + + context.Return(returnAddress); + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitHash.cs b/src/ARMeilleure/Instructions/InstEmitHash.cs new file mode 100644 index 00000000..82b3e353 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitHash.cs @@ -0,0 +1,69 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHashHelper; +using static ARMeilleure.Instructions.InstEmitHelper; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + private const int ByteSizeLog2 = 0; + private const int HWordSizeLog2 = 1; + private const int WordSizeLog2 = 2; + private const int DWordSizeLog2 = 3; + + public static void Crc32b(ArmEmitterContext context) + { + EmitCrc32Call(context, ByteSizeLog2, false); + } + + public static void Crc32h(ArmEmitterContext context) + { + EmitCrc32Call(context, HWordSizeLog2, false); + } + + public static void Crc32w(ArmEmitterContext context) + { + EmitCrc32Call(context, WordSizeLog2, false); + } + + public static void Crc32x(ArmEmitterContext context) + { + EmitCrc32Call(context, DWordSizeLog2, false); + } + + public static void Crc32cb(ArmEmitterContext context) + { + EmitCrc32Call(context, ByteSizeLog2, true); + } + + public static void Crc32ch(ArmEmitterContext context) + { + EmitCrc32Call(context, HWordSizeLog2, true); + } + + public static void Crc32cw(ArmEmitterContext context) + { + EmitCrc32Call(context, WordSizeLog2, true); + } + + public static void Crc32cx(ArmEmitterContext context) + { + EmitCrc32Call(context, DWordSizeLog2, true); + } + + private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c) + { + OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + Operand m = GetIntOrZR(context, op.Rm); + + Operand d = EmitCrc32(context, n, m, size, c); + + SetIntOrZR(context, op.Rd, d); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitHash32.cs b/src/ARMeilleure/Instructions/InstEmitHash32.cs new file mode 100644 index 00000000..5d39f8af --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitHash32.cs @@ -0,0 +1,53 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using static ARMeilleure.Instructions.InstEmitHashHelper; +using static ARMeilleure.Instructions.InstEmitHelper; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Crc32b(ArmEmitterContext context) + { + EmitCrc32Call(context, ByteSizeLog2, false); + } + + public static void Crc32h(ArmEmitterContext context) + { + EmitCrc32Call(context, HWordSizeLog2, false); + } + + public static void Crc32w(ArmEmitterContext context) + { + EmitCrc32Call(context, WordSizeLog2, false); + } + + public static void Crc32cb(ArmEmitterContext context) + { + EmitCrc32Call(context, ByteSizeLog2, true); + } + + public static void Crc32ch(ArmEmitterContext context) + { + EmitCrc32Call(context, HWordSizeLog2, true); + } + + public static void Crc32cw(ArmEmitterContext context) + { + EmitCrc32Call(context, WordSizeLog2, true); + } + + private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c) + { + IOpCode32AluReg op = (IOpCode32AluReg)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, 
op.Rm); + + Operand d = EmitCrc32(context, n, m, size, c); + + EmitAluStore(context, d); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitHashHelper.cs b/src/ARMeilleure/Instructions/InstEmitHashHelper.cs new file mode 100644 index 00000000..55a03a4f --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitHashHelper.cs @@ -0,0 +1,118 @@ +// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static class InstEmitHashHelper + { + public const uint Crc32RevPoly = 0xedb88320; + public const uint Crc32cRevPoly = 0x82f63b78; + + public static Operand EmitCrc32(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli) + { + Debug.Assert(crc.Type.IsInteger() && value.Type.IsInteger()); + Debug.Assert(size >= 0 && size < 4); + Debug.Assert((size < 3) || (value.Type == OperandType.I64)); + + if (castagnoli && Optimizations.UseSse42) + { + // The CRC32 instruction does not have an immediate variant, so ensure both inputs are in registers. + value = (value.Kind == OperandKind.Constant) ? context.Copy(value) : value; + crc = (crc.Kind == OperandKind.Constant) ? context.Copy(crc) : crc; + + Intrinsic op = size switch + { + 0 => Intrinsic.X86Crc32_8, + 1 => Intrinsic.X86Crc32_16, + _ => Intrinsic.X86Crc32, + }; + + return (size == 3) ? context.ConvertI64ToI32(context.AddIntrinsicLong(op, crc, value)) : context.AddIntrinsicInt(op, crc, value); + } + else if (Optimizations.UsePclmulqdq) + { + return size switch + { + 3 => EmitCrc32Optimized64(context, crc, value, castagnoli), + _ => EmitCrc32Optimized(context, crc, value, castagnoli, size), + }; + } + else + { + string name = (size, castagnoli) switch + { + (0, false) => nameof(SoftFallback.Crc32b), + (1, false) => nameof(SoftFallback.Crc32h), + (2, false) => nameof(SoftFallback.Crc32w), + (3, false) => nameof(SoftFallback.Crc32x), + (0, true) => nameof(SoftFallback.Crc32cb), + (1, true) => nameof(SoftFallback.Crc32ch), + (2, true) => nameof(SoftFallback.Crc32cw), + (3, true) => nameof(SoftFallback.Crc32cx), + _ => throw new ArgumentOutOfRangeException(nameof(size)) + }; + + return context.Call(typeof(SoftFallback).GetMethod(name), crc, value); + } + } + + private static Operand EmitCrc32Optimized(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli, int size) + { + long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' + long polynomial = castagnoli ? 
0x105EC76F0 : 0x1DB710641; // P'(x) << 1 + + crc = context.VectorInsert(context.VectorZero(), crc, 0); + + switch (size) + { + case 0: data = context.VectorInsert8(context.VectorZero(), data, 0); break; + case 1: data = context.VectorInsert16(context.VectorZero(), data, 0); break; + case 2: data = context.VectorInsert(context.VectorZero(), data, 0); break; + } + + int bitsize = 8 << size; + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); + tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + if (bitsize < 32) + { + crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8)); + tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc); + } + + return context.VectorExtract(OperandType.I32, tmp, 2); + } + + private static Operand EmitCrc32Optimized64(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli) + { + long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' + long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1 + + crc = context.VectorInsert(context.VectorZero(), crc, 0); + data = context.VectorInsert(context.VectorZero(), data, 0); + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); + Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res); + tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + return context.VectorExtract(OperandType.I32, tmp, 2); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitHelper.cs b/src/ARMeilleure/Instructions/InstEmitHelper.cs new file mode 100644 index 00000000..a22bb3fb --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitHelper.cs @@ -0,0 +1,264 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static class InstEmitHelper + { + public static Operand GetExtendedM(ArmEmitterContext context, int rm, IntType type) + { + Operand value = GetIntOrZR(context, rm); + + switch (type) + { + case IntType.UInt8: value = context.ZeroExtend8 (value.Type, value); break; + case IntType.UInt16: value = context.ZeroExtend16(value.Type, value); break; + case IntType.UInt32: value = context.ZeroExtend32(value.Type, value); break; + + case IntType.Int8: value = context.SignExtend8 (value.Type, value); break; + case IntType.Int16: value = context.SignExtend16(value.Type, value); break; + case IntType.Int32: value = context.SignExtend32(value.Type, value); break; + } + + return value; + } + + public static Operand GetIntA32(ArmEmitterContext context, int regIndex) + { + if (regIndex == RegisterAlias.Aarch32Pc) + { + OpCode32 op = (OpCode32)context.CurrOp; + + return Const((int)op.GetPc()); + } + else + { + return 
Register(GetRegisterAlias(context.Mode, regIndex), RegisterType.Integer, OperandType.I32); + } + } + + public static Operand GetIntA32AlignedPC(ArmEmitterContext context, int regIndex) + { + if (regIndex == RegisterAlias.Aarch32Pc) + { + OpCode32 op = (OpCode32)context.CurrOp; + + return Const((int)(op.GetPc() & 0xfffffffc)); + } + else + { + return Register(GetRegisterAlias(context.Mode, regIndex), RegisterType.Integer, OperandType.I32); + } + } + + public static Operand GetVecA32(int regIndex) + { + return Register(regIndex, RegisterType.Vector, OperandType.V128); + } + + public static void SetIntA32(ArmEmitterContext context, int regIndex, Operand value) + { + if (regIndex == RegisterAlias.Aarch32Pc) + { + if (!IsA32Return(context)) + { + context.StoreToContext(); + } + + EmitBxWritePc(context, value); + } + else + { + if (value.Type == OperandType.I64) + { + value = context.ConvertI64ToI32(value); + } + Operand reg = Register(GetRegisterAlias(context.Mode, regIndex), RegisterType.Integer, OperandType.I32); + + context.Copy(reg, value); + } + } + + public static int GetRegisterAlias(Aarch32Mode mode, int regIndex) + { + // Only registers >= 8 are banked, + // with registers in the range [8, 12] being + // banked for the FIQ mode, and registers + // 13 and 14 being banked for all modes. + if ((uint)regIndex < 8) + { + return regIndex; + } + + return GetBankedRegisterAlias(mode, regIndex); + } + + public static int GetBankedRegisterAlias(Aarch32Mode mode, int regIndex) + { + switch (regIndex) + { + case 8: return mode == Aarch32Mode.Fiq + ? RegisterAlias.R8Fiq + : RegisterAlias.R8Usr; + + case 9: return mode == Aarch32Mode.Fiq + ? RegisterAlias.R9Fiq + : RegisterAlias.R9Usr; + + case 10: return mode == Aarch32Mode.Fiq + ? RegisterAlias.R10Fiq + : RegisterAlias.R10Usr; + + case 11: return mode == Aarch32Mode.Fiq + ? RegisterAlias.R11Fiq + : RegisterAlias.R11Usr; + + case 12: return mode == Aarch32Mode.Fiq + ? RegisterAlias.R12Fiq + : RegisterAlias.R12Usr; + + case 13: + switch (mode) + { + case Aarch32Mode.User: + case Aarch32Mode.System: return RegisterAlias.SpUsr; + case Aarch32Mode.Fiq: return RegisterAlias.SpFiq; + case Aarch32Mode.Irq: return RegisterAlias.SpIrq; + case Aarch32Mode.Supervisor: return RegisterAlias.SpSvc; + case Aarch32Mode.Abort: return RegisterAlias.SpAbt; + case Aarch32Mode.Hypervisor: return RegisterAlias.SpHyp; + case Aarch32Mode.Undefined: return RegisterAlias.SpUnd; + + default: throw new ArgumentException(nameof(mode)); + } + + case 14: + switch (mode) + { + case Aarch32Mode.User: + case Aarch32Mode.Hypervisor: + case Aarch32Mode.System: return RegisterAlias.LrUsr; + case Aarch32Mode.Fiq: return RegisterAlias.LrFiq; + case Aarch32Mode.Irq: return RegisterAlias.LrIrq; + case Aarch32Mode.Supervisor: return RegisterAlias.LrSvc; + case Aarch32Mode.Abort: return RegisterAlias.LrAbt; + case Aarch32Mode.Undefined: return RegisterAlias.LrUnd; + + default: throw new ArgumentException(nameof(mode)); + } + + default: throw new ArgumentOutOfRangeException(nameof(regIndex)); + } + } + + public static bool IsA32Return(ArmEmitterContext context) + { + switch (context.CurrOp) + { + case IOpCode32MemMult op: + return true; // Setting PC using LDM is nearly always a return. 
+ case OpCode32AluRsImm op: + return op.Rm == RegisterAlias.Aarch32Lr; + case OpCode32AluRsReg op: + return op.Rm == RegisterAlias.Aarch32Lr; + case OpCode32AluReg op: + return op.Rm == RegisterAlias.Aarch32Lr; + case OpCode32Mem op: + return op.Rn == RegisterAlias.Aarch32Sp && op.WBack && !op.Index; // Setting PC to an address stored on the stack is nearly always a return. + } + return false; + } + + public static void EmitBxWritePc(ArmEmitterContext context, Operand pc, int sourceRegister = 0) + { + bool isReturn = sourceRegister == RegisterAlias.Aarch32Lr || IsA32Return(context); + Operand mode = context.BitwiseAnd(pc, Const(1)); + + SetFlag(context, PState.TFlag, mode); + + Operand addr = context.ConditionalSelect(mode, context.BitwiseAnd(pc, Const(~1)), context.BitwiseAnd(pc, Const(~3))); + + InstEmitFlowHelper.EmitVirtualJump(context, addr, isReturn); + } + + public static Operand GetIntOrZR(ArmEmitterContext context, int regIndex) + { + if (regIndex == RegisterConsts.ZeroIndex) + { + OperandType type = context.CurrOp.GetOperandType(); + + return type == OperandType.I32 ? Const(0) : Const(0L); + } + else + { + return GetIntOrSP(context, regIndex); + } + } + + public static void SetIntOrZR(ArmEmitterContext context, int regIndex, Operand value) + { + if (regIndex == RegisterConsts.ZeroIndex) + { + return; + } + + SetIntOrSP(context, regIndex, value); + } + + public static Operand GetIntOrSP(ArmEmitterContext context, int regIndex) + { + Operand value = Register(regIndex, RegisterType.Integer, OperandType.I64); + + if (context.CurrOp.RegisterSize == RegisterSize.Int32) + { + value = context.ConvertI64ToI32(value); + } + + return value; + } + + public static void SetIntOrSP(ArmEmitterContext context, int regIndex, Operand value) + { + Operand reg = Register(regIndex, RegisterType.Integer, OperandType.I64); + + if (value.Type == OperandType.I32) + { + value = context.ZeroExtend32(OperandType.I64, value); + } + + context.Copy(reg, value); + } + + public static Operand GetVec(int regIndex) + { + return Register(regIndex, RegisterType.Vector, OperandType.V128); + } + + public static Operand GetFlag(PState stateFlag) + { + return Register((int)stateFlag, RegisterType.Flag, OperandType.I32); + } + + public static Operand GetFpFlag(FPState stateFlag) + { + return Register((int)stateFlag, RegisterType.FpFlag, OperandType.I32); + } + + public static void SetFlag(ArmEmitterContext context, PState stateFlag, Operand value) + { + context.Copy(GetFlag(stateFlag), value); + + context.MarkFlagSet(stateFlag); + } + + public static void SetFpFlag(ArmEmitterContext context, FPState stateFlag, Operand value) + { + context.Copy(GetFpFlag(stateFlag), value); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitMemory.cs b/src/ARMeilleure/Instructions/InstEmitMemory.cs new file mode 100644 index 00000000..7baed14c --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMemory.cs @@ -0,0 +1,184 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitMemoryHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Adr(ArmEmitterContext context) + { + OpCodeAdr op = (OpCodeAdr)context.CurrOp; + + SetIntOrZR(context, op.Rd, Const(op.Address + (ulong)op.Immediate)); + } + + public static void Adrp(ArmEmitterContext context) + { + 
OpCodeAdr op = (OpCodeAdr)context.CurrOp; + + ulong address = (op.Address & ~0xfffUL) + ((ulong)op.Immediate << 12); + + SetIntOrZR(context, op.Rd, Const(address)); + } + + public static void Ldr(ArmEmitterContext context) => EmitLdr(context, signed: false); + public static void Ldrs(ArmEmitterContext context) => EmitLdr(context, signed: true); + + private static void EmitLdr(ArmEmitterContext context, bool signed) + { + OpCodeMem op = (OpCodeMem)context.CurrOp; + + Operand address = GetAddress(context); + + if (signed && op.Extend64) + { + EmitLoadSx64(context, address, op.Rt, op.Size); + } + else if (signed) + { + EmitLoadSx32(context, address, op.Rt, op.Size); + } + else + { + EmitLoadZx(context, address, op.Rt, op.Size); + } + + EmitWBackIfNeeded(context, address); + } + + public static void Ldr_Literal(ArmEmitterContext context) + { + IOpCodeLit op = (IOpCodeLit)context.CurrOp; + + if (op.Prefetch) + { + return; + } + + if (op.Signed) + { + EmitLoadSx64(context, Const(op.Immediate), op.Rt, op.Size); + } + else + { + EmitLoadZx(context, Const(op.Immediate), op.Rt, op.Size); + } + } + + public static void Ldp(ArmEmitterContext context) + { + OpCodeMemPair op = (OpCodeMemPair)context.CurrOp; + + void EmitLoad(int rt, Operand ldAddr) + { + if (op.Extend64) + { + EmitLoadSx64(context, ldAddr, rt, op.Size); + } + else + { + EmitLoadZx(context, ldAddr, rt, op.Size); + } + } + + Operand address = GetAddress(context); + Operand address2 = GetAddress(context, 1L << op.Size); + + EmitLoad(op.Rt, address); + EmitLoad(op.Rt2, address2); + + EmitWBackIfNeeded(context, address); + } + + public static void Str(ArmEmitterContext context) + { + OpCodeMem op = (OpCodeMem)context.CurrOp; + + Operand address = GetAddress(context); + + EmitStore(context, address, op.Rt, op.Size); + + EmitWBackIfNeeded(context, address); + } + + public static void Stp(ArmEmitterContext context) + { + OpCodeMemPair op = (OpCodeMemPair)context.CurrOp; + + Operand address = GetAddress(context); + Operand address2 = GetAddress(context, 1L << op.Size); + + EmitStore(context, address, op.Rt, op.Size); + EmitStore(context, address2, op.Rt2, op.Size); + + EmitWBackIfNeeded(context, address); + } + + private static Operand GetAddress(ArmEmitterContext context, long addend = 0) + { + Operand address = default; + + switch (context.CurrOp) + { + case OpCodeMemImm op: + { + address = context.Copy(GetIntOrSP(context, op.Rn)); + + // Pre-indexing. + if (!op.PostIdx) + { + address = context.Add(address, Const(op.Immediate + addend)); + } + else if (addend != 0) + { + address = context.Add(address, Const(addend)); + } + + break; + } + + case OpCodeMemReg op: + { + Operand n = GetIntOrSP(context, op.Rn); + + Operand m = GetExtendedM(context, op.Rm, op.IntType); + + if (op.Shift) + { + m = context.ShiftLeft(m, Const(op.Size)); + } + + address = context.Add(n, m); + + if (addend != 0) + { + address = context.Add(address, Const(addend)); + } + + break; + } + } + + return address; + } + + private static void EmitWBackIfNeeded(ArmEmitterContext context, Operand address) + { + // Check whenever the current OpCode has post-indexed write back, if so write it. + if (context.CurrOp is OpCodeMemImm op && op.WBack) + { + if (op.PostIdx) + { + address = context.Add(address, Const(op.Immediate)); + } + + SetIntOrSP(context, op.Rn, address); + } + } + } +}
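The Adrp emitter above forms a page-relative address: the low 12 bits of the instruction address are cleared and the immediate is scaled by the 4 KiB page size. A minimal standalone sketch, not part of this diff:

using System;

static class AdrpExample
{
    static ulong Adrp(ulong instrAddress, long imm) =>
        (instrAddress & ~0xfffUL) + ((ulong)imm << 12);

    static void Main()
    {
        Console.WriteLine($"{Adrp(0x0000_0000_0040_0ABC, 3):x}"); // 403000
    }
}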
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitMemory32.cs b/src/ARMeilleure/Instructions/InstEmitMemory32.cs new file mode 100644 index 00000000..17ec97aa --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMemory32.cs @@ -0,0 +1,265 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitMemoryHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + private const int ByteSizeLog2 = 0; + private const int HWordSizeLog2 = 1; + private const int WordSizeLog2 = 2; + private const int DWordSizeLog2 = 3; + + [Flags] + enum AccessType + { + Store = 0, + Signed = 1, + Load = 2, + Ordered = 4, + Exclusive = 8, + + LoadZx = Load, + LoadSx = Load | Signed, + } + + public static void Ldm(ArmEmitterContext context) + { + IOpCode32MemMult op = (IOpCode32MemMult)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + + Operand baseAddress = context.Add(n, Const(op.Offset)); + + bool writesToPc = (op.RegisterMask & (1 << RegisterAlias.Aarch32Pc)) != 0; + + bool writeBack = op.PostOffset != 0 && (op.Rn != RegisterAlias.Aarch32Pc || !writesToPc); + + if (writeBack) + { + SetIntA32(context, op.Rn, context.Add(n, Const(op.PostOffset))); + } + + int mask = op.RegisterMask; + int offset = 0; + + for (int register = 0; mask != 0; mask >>= 1, register++) + { + if ((mask & 1) != 0) + { + Operand address = context.Add(baseAddress, Const(offset)); + + EmitLoadZx(context, address, register, WordSizeLog2); + + offset += 4; + } + } + } + + public static void Ldr(ArmEmitterContext context) + { + EmitLoadOrStore(context, WordSizeLog2, AccessType.LoadZx); + } + + public static void Ldrb(ArmEmitterContext context) + { + EmitLoadOrStore(context, ByteSizeLog2, AccessType.LoadZx); + } + + public static void Ldrd(ArmEmitterContext context) + { + EmitLoadOrStore(context, DWordSizeLog2, AccessType.LoadZx); + } + + public static void Ldrh(ArmEmitterContext context) + { + EmitLoadOrStore(context, HWordSizeLog2, AccessType.LoadZx); + } + + public static void Ldrsb(ArmEmitterContext context) + { + EmitLoadOrStore(context, ByteSizeLog2, AccessType.LoadSx); + } + + public static void Ldrsh(ArmEmitterContext context) + { + EmitLoadOrStore(context, HWordSizeLog2, AccessType.LoadSx); + } + + public static void Stm(ArmEmitterContext context) + { + IOpCode32MemMult op = (IOpCode32MemMult)context.CurrOp; + + Operand n = context.Copy(GetIntA32(context, op.Rn)); + + Operand baseAddress = context.Add(n, Const(op.Offset)); + + int mask = op.RegisterMask; + int offset = 0; + + for (int register = 0; mask != 0; mask >>= 1, register++) + { + if ((mask & 1) != 0) + { + Operand address = context.Add(baseAddress, Const(offset)); + + EmitStore(context, address, register, WordSizeLog2); + + // Note: If Rn is also specified on the register list, + // and Rn is the first register on this list, then the + // value that is written to memory is the unmodified value, + // before the write back. If it is on the list, but it's + // not the first one, then the value written to memory + // varies between CPUs. + if (offset == 0 && op.PostOffset != 0) + { + // Emit write back after the first write. 
+ SetIntA32(context, op.Rn, context.Add(n, Const(op.PostOffset))); + } + + offset += 4; + } + } + } + + public static void Str(ArmEmitterContext context) + { + EmitLoadOrStore(context, WordSizeLog2, AccessType.Store); + } + + public static void Strb(ArmEmitterContext context) + { + EmitLoadOrStore(context, ByteSizeLog2, AccessType.Store); + } + + public static void Strd(ArmEmitterContext context) + { + EmitLoadOrStore(context, DWordSizeLog2, AccessType.Store); + } + + public static void Strh(ArmEmitterContext context) + { + EmitLoadOrStore(context, HWordSizeLog2, AccessType.Store); + } + + private static void EmitLoadOrStore(ArmEmitterContext context, int size, AccessType accType) + { + IOpCode32Mem op = (IOpCode32Mem)context.CurrOp; + + Operand n = context.Copy(GetIntA32AlignedPC(context, op.Rn)); + Operand m = GetMemM(context, setCarry: false); + + Operand temp = default; + + if (op.Index || op.WBack) + { + temp = op.Add + ? context.Add (n, m) + : context.Subtract(n, m); + } + + if (op.WBack) + { + SetIntA32(context, op.Rn, temp); + } + + Operand address; + + if (op.Index) + { + address = temp; + } + else + { + address = n; + } + + if ((accType & AccessType.Load) != 0) + { + void Load(int rt, int offs, int loadSize) + { + Operand addr = context.Add(address, Const(offs)); + + if ((accType & AccessType.Signed) != 0) + { + EmitLoadSx32(context, addr, rt, loadSize); + } + else + { + EmitLoadZx(context, addr, rt, loadSize); + } + } + + if (size == DWordSizeLog2) + { + Operand lblBigEndian = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblBigEndian, GetFlag(PState.EFlag)); + + Load(op.Rt, 0, WordSizeLog2); + Load(op.Rt2, 4, WordSizeLog2); + + context.Branch(lblEnd); + + context.MarkLabel(lblBigEndian); + + Load(op.Rt2, 0, WordSizeLog2); + Load(op.Rt, 4, WordSizeLog2); + + context.MarkLabel(lblEnd); + } + else + { + Load(op.Rt, 0, size); + } + } + else + { + void Store(int rt, int offs, int storeSize) + { + Operand addr = context.Add(address, Const(offs)); + + EmitStore(context, addr, rt, storeSize); + } + + if (size == DWordSizeLog2) + { + Operand lblBigEndian = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblBigEndian, GetFlag(PState.EFlag)); + + Store(op.Rt, 0, WordSizeLog2); + Store(op.Rt2, 4, WordSizeLog2); + + context.Branch(lblEnd); + + context.MarkLabel(lblBigEndian); + + Store(op.Rt2, 0, WordSizeLog2); + Store(op.Rt, 4, WordSizeLog2); + + context.MarkLabel(lblEnd); + } + else + { + Store(op.Rt, 0, size); + } + } + } + + public static void Adr(ArmEmitterContext context) + { + IOpCode32Adr op = (IOpCode32Adr)context.CurrOp; + SetIntA32(context, op.Rd, Const(op.Immediate)); + } + } +}
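The Ldm/Stm emitters above both walk the register mask the same way: each set bit, counted from r0 upwards, transfers one word at the next offset from the base address. A minimal standalone sketch of that iteration, not part of this diff:

using System;

static class RegisterListExample
{
    static void ForEachRegister(int mask, Action<int, int> visit)
    {
        int offset = 0;
        for (int register = 0; mask != 0; mask >>= 1, register++)
        {
            if ((mask & 1) != 0)
            {
                visit(register, offset);
                offset += 4;
            }
        }
    }

    static void Main()
    {
        // LDM/STM {r0, r2, lr}: r0 at +0, r2 at +4, r14 at +8.
        ForEachRegister((1 << 0) | (1 << 2) | (1 << 14),
            (reg, offs) => Console.WriteLine($"r{reg} at +{offs}"));
    }
}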
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitMemoryEx.cs b/src/ARMeilleure/Instructions/InstEmitMemoryEx.cs new file mode 100644 index 00000000..c7ed01e3 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMemoryEx.cs @@ -0,0 +1,178 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitMemoryExHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + [Flags] + private enum AccessType + { + None = 0, + Ordered = 1, + Exclusive = 2, + OrderedEx = Ordered | Exclusive + } + + public static void Clrex(ArmEmitterContext context) + { + EmitClearExclusive(context); + } + + public static void Csdb(ArmEmitterContext context) + { + // Execute as no-op. + } + + public static void Dmb(ArmEmitterContext context) => EmitBarrier(context); + public static void Dsb(ArmEmitterContext context) => EmitBarrier(context); + + public static void Ldar(ArmEmitterContext context) => EmitLdr(context, AccessType.Ordered); + public static void Ldaxr(ArmEmitterContext context) => EmitLdr(context, AccessType.OrderedEx); + public static void Ldxr(ArmEmitterContext context) => EmitLdr(context, AccessType.Exclusive); + public static void Ldxp(ArmEmitterContext context) => EmitLdp(context, AccessType.Exclusive); + public static void Ldaxp(ArmEmitterContext context) => EmitLdp(context, AccessType.OrderedEx); + + private static void EmitLdr(ArmEmitterContext context, AccessType accType) + { + EmitLoadEx(context, accType, pair: false); + } + + private static void EmitLdp(ArmEmitterContext context, AccessType accType) + { + EmitLoadEx(context, accType, pair: true); + } + + private static void EmitLoadEx(ArmEmitterContext context, AccessType accType, bool pair) + { + OpCodeMemEx op = (OpCodeMemEx)context.CurrOp; + + bool ordered = (accType & AccessType.Ordered) != 0; + bool exclusive = (accType & AccessType.Exclusive) != 0; + + if (ordered) + { + EmitBarrier(context); + } + + Operand address = context.Copy(GetIntOrSP(context, op.Rn)); + + if (pair) + { + // Exclusive loads should be atomic. For pairwise loads, we need to + // read all the data at once. For a 32-bits pairwise load, we do a + // simple 64-bits load, for a 128-bits load, we need to call a special + // method to read 128-bits atomically. + if (op.Size == 2) + { + Operand value = EmitLoadExclusive(context, address, exclusive, 3); + + Operand valueLow = context.ConvertI64ToI32(value); + + valueLow = context.ZeroExtend32(OperandType.I64, valueLow); + + Operand valueHigh = context.ShiftRightUI(value, Const(32)); + + SetIntOrZR(context, op.Rt, valueLow); + SetIntOrZR(context, op.Rt2, valueHigh); + } + else if (op.Size == 3) + { + Operand value = EmitLoadExclusive(context, address, exclusive, 4); + + Operand valueLow = context.VectorExtract(OperandType.I64, value, 0); + Operand valueHigh = context.VectorExtract(OperandType.I64, value, 1); + + SetIntOrZR(context, op.Rt, valueLow); + SetIntOrZR(context, op.Rt2, valueHigh); + } + else + { + throw new InvalidOperationException($"Invalid load size of {1 << op.Size} bytes."); + } + } + else + { + // 8, 16, 32 or 64-bits (non-pairwise) load. 
+ Operand value = EmitLoadExclusive(context, address, exclusive, op.Size); + + SetIntOrZR(context, op.Rt, value); + } + } + + public static void Prfm(ArmEmitterContext context) + { + // Memory Prefetch, execute as no-op. + } + + public static void Stlr(ArmEmitterContext context) => EmitStr(context, AccessType.Ordered); + public static void Stlxr(ArmEmitterContext context) => EmitStr(context, AccessType.OrderedEx); + public static void Stxr(ArmEmitterContext context) => EmitStr(context, AccessType.Exclusive); + public static void Stxp(ArmEmitterContext context) => EmitStp(context, AccessType.Exclusive); + public static void Stlxp(ArmEmitterContext context) => EmitStp(context, AccessType.OrderedEx); + + private static void EmitStr(ArmEmitterContext context, AccessType accType) + { + EmitStoreEx(context, accType, pair: false); + } + + private static void EmitStp(ArmEmitterContext context, AccessType accType) + { + EmitStoreEx(context, accType, pair: true); + } + + private static void EmitStoreEx(ArmEmitterContext context, AccessType accType, bool pair) + { + OpCodeMemEx op = (OpCodeMemEx)context.CurrOp; + + bool ordered = (accType & AccessType.Ordered) != 0; + bool exclusive = (accType & AccessType.Exclusive) != 0; + + Operand address = context.Copy(GetIntOrSP(context, op.Rn)); + + Operand t = GetIntOrZR(context, op.Rt); + + if (pair) + { + Debug.Assert(op.Size == 2 || op.Size == 3, "Invalid size for pairwise store."); + + Operand t2 = GetIntOrZR(context, op.Rt2); + + Operand value; + + if (op.Size == 2) + { + value = context.BitwiseOr(t, context.ShiftLeft(t2, Const(32))); + } + else /* if (op.Size == 3) */ + { + value = context.VectorInsert(context.VectorZero(), t, 0); + value = context.VectorInsert(value, t2, 1); + } + + EmitStoreExclusive(context, address, value, exclusive, op.Size + 1, op.Rs, a32: false); + } + else + { + EmitStoreExclusive(context, address, t, exclusive, op.Size, op.Rs, a32: false); + } + + if (ordered) + { + EmitBarrier(context); + } + } + + private static void EmitBarrier(ArmEmitterContext context) + { + context.MemoryBarrier(); + } + } +}
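In the pairwise path above, EmitLoadEx keeps the access atomic by reading the whole region once and only then splitting it into Rt/Rt2. A minimal standalone sketch of the Size == 2 (32-bit pair) decomposition, not part of this commit and using hypothetical names:

using System;

static class PairwiseLoadSketch
{
    // Splits one atomically loaded 64-bit value into the low word (Rt) and the high word (Rt2),
    // mirroring the ZeroExtend32 / ShiftRightUI sequence emitted above.
    public static (ulong Rt, ulong Rt2) SplitPair(ulong value)
    {
        ulong low  = value & 0xffffffffUL; // ZeroExtend32 of the low half.
        ulong high = value >> 32;          // Logical shift right by 32.

        return (low, high);
    }

    static void Main()
    {
        (ulong rt, ulong rt2) = SplitPair(0x1122334455667788UL);

        Console.WriteLine($"Rt = 0x{rt:x}, Rt2 = 0x{rt2:x}"); // Rt = 0x55667788, Rt2 = 0x11223344
    }
}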
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitMemoryEx32.cs b/src/ARMeilleure/Instructions/InstEmitMemoryEx32.cs new file mode 100644 index 00000000..c0b6fc39 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMemoryEx32.cs @@ -0,0 +1,237 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitMemoryExHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Clrex(ArmEmitterContext context) + { + EmitClearExclusive(context); + } + + public static void Csdb(ArmEmitterContext context) + { + // Execute as no-op. + } + + public static void Dmb(ArmEmitterContext context) => EmitBarrier(context); + + public static void Dsb(ArmEmitterContext context) => EmitBarrier(context); + + public static void Ldrex(ArmEmitterContext context) + { + EmitExLoadOrStore(context, WordSizeLog2, AccessType.LoadZx | AccessType.Exclusive); + } + + public static void Ldrexb(ArmEmitterContext context) + { + EmitExLoadOrStore(context, ByteSizeLog2, AccessType.LoadZx | AccessType.Exclusive); + } + + public static void Ldrexd(ArmEmitterContext context) + { + EmitExLoadOrStore(context, DWordSizeLog2, AccessType.LoadZx | AccessType.Exclusive); + } + + public static void Ldrexh(ArmEmitterContext context) + { + EmitExLoadOrStore(context, HWordSizeLog2, AccessType.LoadZx | AccessType.Exclusive); + } + + public static void Lda(ArmEmitterContext context) + { + EmitExLoadOrStore(context, WordSizeLog2, AccessType.LoadZx | AccessType.Ordered); + } + + public static void Ldab(ArmEmitterContext context) + { + EmitExLoadOrStore(context, ByteSizeLog2, AccessType.LoadZx | AccessType.Ordered); + } + + public static void Ldaex(ArmEmitterContext context) + { + EmitExLoadOrStore(context, WordSizeLog2, AccessType.LoadZx | AccessType.Exclusive | AccessType.Ordered); + } + + public static void Ldaexb(ArmEmitterContext context) + { + EmitExLoadOrStore(context, ByteSizeLog2, AccessType.LoadZx | AccessType.Exclusive | AccessType.Ordered); + } + + public static void Ldaexd(ArmEmitterContext context) + { + EmitExLoadOrStore(context, DWordSizeLog2, AccessType.LoadZx | AccessType.Exclusive | AccessType.Ordered); + } + + public static void Ldaexh(ArmEmitterContext context) + { + EmitExLoadOrStore(context, HWordSizeLog2, AccessType.LoadZx | AccessType.Exclusive | AccessType.Ordered); + } + + public static void Ldah(ArmEmitterContext context) + { + EmitExLoadOrStore(context, HWordSizeLog2, AccessType.LoadZx | AccessType.Ordered); + } + + // Stores. 
+ + public static void Strex(ArmEmitterContext context) + { + EmitExLoadOrStore(context, WordSizeLog2, AccessType.Store | AccessType.Exclusive); + } + + public static void Strexb(ArmEmitterContext context) + { + EmitExLoadOrStore(context, ByteSizeLog2, AccessType.Store | AccessType.Exclusive); + } + + public static void Strexd(ArmEmitterContext context) + { + EmitExLoadOrStore(context, DWordSizeLog2, AccessType.Store | AccessType.Exclusive); + } + + public static void Strexh(ArmEmitterContext context) + { + EmitExLoadOrStore(context, HWordSizeLog2, AccessType.Store | AccessType.Exclusive); + } + + public static void Stl(ArmEmitterContext context) + { + EmitExLoadOrStore(context, WordSizeLog2, AccessType.Store | AccessType.Ordered); + } + + public static void Stlb(ArmEmitterContext context) + { + EmitExLoadOrStore(context, ByteSizeLog2, AccessType.Store | AccessType.Ordered); + } + + public static void Stlex(ArmEmitterContext context) + { + EmitExLoadOrStore(context, WordSizeLog2, AccessType.Store | AccessType.Exclusive | AccessType.Ordered); + } + + public static void Stlexb(ArmEmitterContext context) + { + EmitExLoadOrStore(context, ByteSizeLog2, AccessType.Store | AccessType.Exclusive | AccessType.Ordered); + } + + public static void Stlexd(ArmEmitterContext context) + { + EmitExLoadOrStore(context, DWordSizeLog2, AccessType.Store | AccessType.Exclusive | AccessType.Ordered); + } + + public static void Stlexh(ArmEmitterContext context) + { + EmitExLoadOrStore(context, HWordSizeLog2, AccessType.Store | AccessType.Exclusive | AccessType.Ordered); + } + + public static void Stlh(ArmEmitterContext context) + { + EmitExLoadOrStore(context, HWordSizeLog2, AccessType.Store | AccessType.Ordered); + } + + private static void EmitExLoadOrStore(ArmEmitterContext context, int size, AccessType accType) + { + IOpCode32MemEx op = (IOpCode32MemEx)context.CurrOp; + + Operand address = context.Copy(GetIntA32(context, op.Rn)); + + var exclusive = (accType & AccessType.Exclusive) != 0; + var ordered = (accType & AccessType.Ordered) != 0; + + if ((accType & AccessType.Load) != 0) + { + if (ordered) + { + EmitBarrier(context); + } + + if (size == DWordSizeLog2) + { + // Keep loads atomic - make the call to get the whole region and then decompose it into parts + // for the registers. 
+ + Operand value = EmitLoadExclusive(context, address, exclusive, size); + + Operand valueLow = context.ConvertI64ToI32(value); + + valueLow = context.ZeroExtend32(OperandType.I64, valueLow); + + Operand valueHigh = context.ShiftRightUI(value, Const(32)); + + Operand lblBigEndian = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblBigEndian, GetFlag(PState.EFlag)); + + SetIntA32(context, op.Rt, valueLow); + SetIntA32(context, op.Rt2, valueHigh); + + context.Branch(lblEnd); + + context.MarkLabel(lblBigEndian); + + SetIntA32(context, op.Rt2, valueLow); + SetIntA32(context, op.Rt, valueHigh); + + context.MarkLabel(lblEnd); + } + else + { + SetIntA32(context, op.Rt, EmitLoadExclusive(context, address, exclusive, size)); + } + } + else + { + if (size == DWordSizeLog2) + { + // Split the result into 2 words (based on endianness) + + Operand lo = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Rt)); + Operand hi = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Rt2)); + + Operand lblBigEndian = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblBigEndian, GetFlag(PState.EFlag)); + + Operand leResult = context.BitwiseOr(lo, context.ShiftLeft(hi, Const(32))); + EmitStoreExclusive(context, address, leResult, exclusive, size, op.Rd, a32: true); + + context.Branch(lblEnd); + + context.MarkLabel(lblBigEndian); + + Operand beResult = context.BitwiseOr(hi, context.ShiftLeft(lo, Const(32))); + EmitStoreExclusive(context, address, beResult, exclusive, size, op.Rd, a32: true); + + context.MarkLabel(lblEnd); + } + else + { + Operand value = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Rt)); + EmitStoreExclusive(context, address, value, exclusive, size, op.Rd, a32: true); + } + + if (ordered) + { + EmitBarrier(context); + } + } + } + + private static void EmitBarrier(ArmEmitterContext context) + { + // Note: This barrier is most likely not necessary, and probably + // doesn't make any difference since we need to do a ton of stuff + // (software MMU emulation) to read or write anything anyway. + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitMemoryExHelper.cs b/src/ARMeilleure/Instructions/InstEmitMemoryExHelper.cs new file mode 100644 index 00000000..9a69442a --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMemoryExHelper.cs @@ -0,0 +1,174 @@ +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static class InstEmitMemoryExHelper + { + private const int ErgSizeLog2 = 4; + + public static Operand EmitLoadExclusive(ArmEmitterContext context, Operand address, bool exclusive, int size) + { + if (exclusive) + { + Operand value; + + if (size == 4) + { + // Only 128-bit CAS is guaranteed to have a atomic load. 
+ Operand physAddr = InstEmitMemoryHelper.EmitPtPointerLoad(context, address, default, write: false, 4); + + Operand zero = context.VectorZero(); + + value = context.CompareAndSwap(physAddr, zero, zero); + } + else + { + value = InstEmitMemoryHelper.EmitReadIntAligned(context, address, size); + } + + Operand arg0 = context.LoadArgument(OperandType.I64, 0); + + Operand exAddrPtr = context.Add(arg0, Const((long)NativeContext.GetExclusiveAddressOffset())); + Operand exValuePtr = context.Add(arg0, Const((long)NativeContext.GetExclusiveValueOffset())); + + context.Store(exAddrPtr, context.BitwiseAnd(address, Const(address.Type, GetExclusiveAddressMask()))); + + // Make sure the unused higher bits of the value are cleared. + if (size < 3) + { + context.Store(exValuePtr, Const(0UL)); + } + if (size < 4) + { + context.Store(context.Add(exValuePtr, Const(exValuePtr.Type, 8L)), Const(0UL)); + } + + // Store the new exclusive value. + context.Store(exValuePtr, value); + + return value; + } + else + { + return InstEmitMemoryHelper.EmitReadIntAligned(context, address, size); + } + } + + public static void EmitStoreExclusive( + ArmEmitterContext context, + Operand address, + Operand value, + bool exclusive, + int size, + int rs, + bool a32) + { + if (size < 3) + { + value = context.ConvertI64ToI32(value); + } + + if (exclusive) + { + // We overwrite one of the register (Rs), + // keep a copy of the values to ensure we are working with the correct values. + address = context.Copy(address); + value = context.Copy(value); + + void SetRs(Operand value) + { + if (a32) + { + SetIntA32(context, rs, value); + } + else + { + SetIntOrZR(context, rs, value); + } + } + + Operand arg0 = context.LoadArgument(OperandType.I64, 0); + + Operand exAddrPtr = context.Add(arg0, Const((long)NativeContext.GetExclusiveAddressOffset())); + Operand exAddr = context.Load(address.Type, exAddrPtr); + + // STEP 1: Check if we have exclusive access to this memory region. If not, fail and skip store. + Operand maskedAddress = context.BitwiseAnd(address, Const(address.Type, GetExclusiveAddressMask())); + + Operand exFailed = context.ICompareNotEqual(exAddr, maskedAddress); + + Operand lblExit = Label(); + + SetRs(Const(1)); + + context.BranchIfTrue(lblExit, exFailed); + + // STEP 2: We have exclusive access and the address is valid, attempt the store using CAS. + Operand physAddr = InstEmitMemoryHelper.EmitPtPointerLoad(context, address, default, write: true, size); + + Operand exValuePtr = context.Add(arg0, Const((long)NativeContext.GetExclusiveValueOffset())); + Operand exValue = size switch + { + 0 => context.Load8(exValuePtr), + 1 => context.Load16(exValuePtr), + 2 => context.Load(OperandType.I32, exValuePtr), + 3 => context.Load(OperandType.I64, exValuePtr), + _ => context.Load(OperandType.V128, exValuePtr) + }; + + Operand currValue = size switch + { + 0 => context.CompareAndSwap8(physAddr, exValue, value), + 1 => context.CompareAndSwap16(physAddr, exValue, value), + _ => context.CompareAndSwap(physAddr, exValue, value) + }; + + // STEP 3: Check if we succeeded by comparing expected and in-memory values. 
+ Operand storeFailed; + + if (size == 4) + { + Operand currValueLow = context.VectorExtract(OperandType.I64, currValue, 0); + Operand currValueHigh = context.VectorExtract(OperandType.I64, currValue, 1); + + Operand exValueLow = context.VectorExtract(OperandType.I64, exValue, 0); + Operand exValueHigh = context.VectorExtract(OperandType.I64, exValue, 1); + + storeFailed = context.BitwiseOr( + context.ICompareNotEqual(currValueLow, exValueLow), + context.ICompareNotEqual(currValueHigh, exValueHigh)); + } + else + { + storeFailed = context.ICompareNotEqual(currValue, exValue); + } + + SetRs(storeFailed); + + context.MarkLabel(lblExit); + } + else + { + InstEmitMemoryHelper.EmitWriteIntAligned(context, address, value, size); + } + } + + public static void EmitClearExclusive(ArmEmitterContext context) + { + Operand arg0 = context.LoadArgument(OperandType.I64, 0); + + Operand exAddrPtr = context.Add(arg0, Const((long)NativeContext.GetExclusiveAddressOffset())); + + // We store ULONG max to force any exclusive address checks to fail, + // since this value is not aligned to the ERG mask. + context.Store(exAddrPtr, Const(ulong.MaxValue)); + } + + private static long GetExclusiveAddressMask() => ~((4L << ErgSizeLog2) - 1); + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitMemoryHelper.cs b/src/ARMeilleure/Instructions/InstEmitMemoryHelper.cs new file mode 100644 index 00000000..f97e395c --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMemoryHelper.cs @@ -0,0 +1,648 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Memory; +using ARMeilleure.Translation; +using ARMeilleure.Translation.PTC; +using System; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static class InstEmitMemoryHelper + { + private const int PageBits = 12; + private const int PageMask = (1 << PageBits) - 1; + + private enum Extension + { + Zx, + Sx32, + Sx64 + } + + public static void EmitLoadZx(ArmEmitterContext context, Operand address, int rt, int size) + { + EmitLoad(context, address, Extension.Zx, rt, size); + } + + public static void EmitLoadSx32(ArmEmitterContext context, Operand address, int rt, int size) + { + EmitLoad(context, address, Extension.Sx32, rt, size); + } + + public static void EmitLoadSx64(ArmEmitterContext context, Operand address, int rt, int size) + { + EmitLoad(context, address, Extension.Sx64, rt, size); + } + + private static void EmitLoad(ArmEmitterContext context, Operand address, Extension ext, int rt, int size) + { + bool isSimd = IsSimd(context); + + if ((uint)size > (isSimd ? 4 : 3)) + { + throw new ArgumentOutOfRangeException(nameof(size)); + } + + if (isSimd) + { + EmitReadVector(context, address, context.VectorZero(), rt, 0, size); + } + else + { + EmitReadInt(context, address, rt, size); + } + + if (!isSimd && !(context.CurrOp is OpCode32 && rt == State.RegisterAlias.Aarch32Pc)) + { + Operand value = GetInt(context, rt); + + if (ext == Extension.Sx32 || ext == Extension.Sx64) + { + OperandType destType = ext == Extension.Sx64 ? 
OperandType.I64 : OperandType.I32; + + switch (size) + { + case 0: value = context.SignExtend8 (destType, value); break; + case 1: value = context.SignExtend16(destType, value); break; + case 2: value = context.SignExtend32(destType, value); break; + } + } + + SetInt(context, rt, value); + } + } + + public static void EmitLoadSimd( + ArmEmitterContext context, + Operand address, + Operand vector, + int rt, + int elem, + int size) + { + EmitReadVector(context, address, vector, rt, elem, size); + } + + public static void EmitStore(ArmEmitterContext context, Operand address, int rt, int size) + { + bool isSimd = IsSimd(context); + + if ((uint)size > (isSimd ? 4 : 3)) + { + throw new ArgumentOutOfRangeException(nameof(size)); + } + + if (isSimd) + { + EmitWriteVector(context, address, rt, 0, size); + } + else + { + EmitWriteInt(context, address, rt, size); + } + } + + public static void EmitStoreSimd( + ArmEmitterContext context, + Operand address, + int rt, + int elem, + int size) + { + EmitWriteVector(context, address, rt, elem, size); + } + + private static bool IsSimd(ArmEmitterContext context) + { + return context.CurrOp is IOpCodeSimd && + !(context.CurrOp is OpCodeSimdMemMs || + context.CurrOp is OpCodeSimdMemSs); + } + + public static Operand EmitReadInt(ArmEmitterContext context, Operand address, int size) + { + Operand temp = context.AllocateLocal(size == 3 ? OperandType.I64 : OperandType.I32); + + Operand lblSlowPath = Label(); + Operand lblEnd = Label(); + + Operand physAddr = EmitPtPointerLoad(context, address, lblSlowPath, write: false, size); + + Operand value = default; + + switch (size) + { + case 0: value = context.Load8 (physAddr); break; + case 1: value = context.Load16(physAddr); break; + case 2: value = context.Load (OperandType.I32, physAddr); break; + case 3: value = context.Load (OperandType.I64, physAddr); break; + } + + context.Copy(temp, value); + + if (!context.Memory.Type.IsHostMapped()) + { + context.Branch(lblEnd); + + context.MarkLabel(lblSlowPath, BasicBlockFrequency.Cold); + + context.Copy(temp, EmitReadIntFallback(context, address, size)); + + context.MarkLabel(lblEnd); + } + + return temp; + } + + private static void EmitReadInt(ArmEmitterContext context, Operand address, int rt, int size) + { + Operand lblSlowPath = Label(); + Operand lblEnd = Label(); + + Operand physAddr = EmitPtPointerLoad(context, address, lblSlowPath, write: false, size); + + Operand value = default; + + switch (size) + { + case 0: value = context.Load8 (physAddr); break; + case 1: value = context.Load16(physAddr); break; + case 2: value = context.Load (OperandType.I32, physAddr); break; + case 3: value = context.Load (OperandType.I64, physAddr); break; + } + + SetInt(context, rt, value); + + if (!context.Memory.Type.IsHostMapped()) + { + context.Branch(lblEnd); + + context.MarkLabel(lblSlowPath, BasicBlockFrequency.Cold); + + EmitReadIntFallback(context, address, rt, size); + + context.MarkLabel(lblEnd); + } + } + + public static Operand EmitReadIntAligned(ArmEmitterContext context, Operand address, int size) + { + if ((uint)size > 4) + { + throw new ArgumentOutOfRangeException(nameof(size)); + } + + Operand physAddr = EmitPtPointerLoad(context, address, default, write: false, size); + + return size switch + { + 0 => context.Load8(physAddr), + 1 => context.Load16(physAddr), + 2 => context.Load(OperandType.I32, physAddr), + 3 => context.Load(OperandType.I64, physAddr), + _ => context.Load(OperandType.V128, physAddr) + }; + } + + private static void EmitReadVector( + ArmEmitterContext 
context, + Operand address, + Operand vector, + int rt, + int elem, + int size) + { + Operand lblSlowPath = Label(); + Operand lblEnd = Label(); + + Operand physAddr = EmitPtPointerLoad(context, address, lblSlowPath, write: false, size); + + Operand value = default; + + switch (size) + { + case 0: value = context.VectorInsert8 (vector, context.Load8(physAddr), elem); break; + case 1: value = context.VectorInsert16(vector, context.Load16(physAddr), elem); break; + case 2: value = context.VectorInsert (vector, context.Load(OperandType.I32, physAddr), elem); break; + case 3: value = context.VectorInsert (vector, context.Load(OperandType.I64, physAddr), elem); break; + case 4: value = context.Load (OperandType.V128, physAddr); break; + } + + context.Copy(GetVec(rt), value); + + if (!context.Memory.Type.IsHostMapped()) + { + context.Branch(lblEnd); + + context.MarkLabel(lblSlowPath, BasicBlockFrequency.Cold); + + EmitReadVectorFallback(context, address, vector, rt, elem, size); + + context.MarkLabel(lblEnd); + } + } + + private static Operand VectorCreate(ArmEmitterContext context, Operand value) + { + return context.VectorInsert(context.VectorZero(), value, 0); + } + + private static void EmitWriteInt(ArmEmitterContext context, Operand address, int rt, int size) + { + Operand lblSlowPath = Label(); + Operand lblEnd = Label(); + + Operand physAddr = EmitPtPointerLoad(context, address, lblSlowPath, write: true, size); + + Operand value = GetInt(context, rt); + + if (size < 3 && value.Type == OperandType.I64) + { + value = context.ConvertI64ToI32(value); + } + + switch (size) + { + case 0: context.Store8 (physAddr, value); break; + case 1: context.Store16(physAddr, value); break; + case 2: context.Store (physAddr, value); break; + case 3: context.Store (physAddr, value); break; + } + + if (!context.Memory.Type.IsHostMapped()) + { + context.Branch(lblEnd); + + context.MarkLabel(lblSlowPath, BasicBlockFrequency.Cold); + + EmitWriteIntFallback(context, address, rt, size); + + context.MarkLabel(lblEnd); + } + } + + public static void EmitWriteIntAligned(ArmEmitterContext context, Operand address, Operand value, int size) + { + if ((uint)size > 4) + { + throw new ArgumentOutOfRangeException(nameof(size)); + } + + Operand physAddr = EmitPtPointerLoad(context, address, default, write: true, size); + + if (size < 3 && value.Type == OperandType.I64) + { + value = context.ConvertI64ToI32(value); + } + + if (size == 0) + { + context.Store8(physAddr, value); + } + else if (size == 1) + { + context.Store16(physAddr, value); + } + else + { + context.Store(physAddr, value); + } + } + + private static void EmitWriteVector( + ArmEmitterContext context, + Operand address, + int rt, + int elem, + int size) + { + Operand lblSlowPath = Label(); + Operand lblEnd = Label(); + + Operand physAddr = EmitPtPointerLoad(context, address, lblSlowPath, write: true, size); + + Operand value = GetVec(rt); + + switch (size) + { + case 0: context.Store8 (physAddr, context.VectorExtract8(value, elem)); break; + case 1: context.Store16(physAddr, context.VectorExtract16(value, elem)); break; + case 2: context.Store (physAddr, context.VectorExtract(OperandType.I32, value, elem)); break; + case 3: context.Store (physAddr, context.VectorExtract(OperandType.I64, value, elem)); break; + case 4: context.Store (physAddr, value); break; + } + + if (!context.Memory.Type.IsHostMapped()) + { + context.Branch(lblEnd); + + context.MarkLabel(lblSlowPath, BasicBlockFrequency.Cold); + + EmitWriteVectorFallback(context, address, rt, elem, size); + + 
context.MarkLabel(lblEnd); + } + } + + public static Operand EmitPtPointerLoad(ArmEmitterContext context, Operand address, Operand lblSlowPath, bool write, int size) + { + if (context.Memory.Type.IsHostMapped()) + { + return EmitHostMappedPointer(context, address); + } + + int ptLevelBits = context.Memory.AddressSpaceBits - PageBits; + int ptLevelSize = 1 << ptLevelBits; + int ptLevelMask = ptLevelSize - 1; + + Operand addrRotated = size != 0 ? context.RotateRight(address, Const(size)) : address; + Operand addrShifted = context.ShiftRightUI(addrRotated, Const(PageBits - size)); + + Operand pte = !context.HasPtc + ? Const(context.Memory.PageTablePointer.ToInt64()) + : Const(context.Memory.PageTablePointer.ToInt64(), Ptc.PageTableSymbol); + + Operand pteOffset = context.BitwiseAnd(addrShifted, Const(addrShifted.Type, ptLevelMask)); + + if (pteOffset.Type == OperandType.I32) + { + pteOffset = context.ZeroExtend32(OperandType.I64, pteOffset); + } + + pte = context.Load(OperandType.I64, context.Add(pte, context.ShiftLeft(pteOffset, Const(3)))); + + if (addrShifted.Type == OperandType.I32) + { + addrShifted = context.ZeroExtend32(OperandType.I64, addrShifted); + } + + // If the VA is out of range, or not aligned to the access size, force PTE to 0 by masking it. + pte = context.BitwiseAnd(pte, context.ShiftRightSI(context.Add(addrShifted, Const(-(long)ptLevelSize)), Const(63))); + + if (lblSlowPath != default) + { + if (write) + { + context.BranchIf(lblSlowPath, pte, Const(0L), Comparison.LessOrEqual); + pte = context.BitwiseAnd(pte, Const(0xffffffffffffUL)); // Ignore any software protection bits. (they are still used by C# memory access) + } + else + { + pte = context.ShiftLeft(pte, Const(1)); + context.BranchIf(lblSlowPath, pte, Const(0L), Comparison.LessOrEqual); + pte = context.ShiftRightUI(pte, Const(1)); + } + } + else + { + // When no label is provided to jump to a slow path if the address is invalid, + // we do the validation ourselves, and throw if needed. + + Operand lblNotWatched = Label(); + + // Is the page currently being tracked for read/write? If so we need to call SignalMemoryTracking. + context.BranchIf(lblNotWatched, pte, Const(0L), Comparison.GreaterOrEqual, BasicBlockFrequency.Cold); + + // Signal memory tracking. Size here doesn't matter as address is assumed to be size aligned here. + context.Call(typeof(NativeInterface).GetMethod(nameof(NativeInterface.SignalMemoryTracking)), address, Const(1UL), Const(write ? 1 : 0)); + context.MarkLabel(lblNotWatched); + + pte = context.BitwiseAnd(pte, Const(0xffffffffffffUL)); // Ignore any software protection bits. (they are still used by C# memory access) + + Operand lblNonNull = Label(); + + // Skip exception if the PTE address is non-null (not zero). + context.BranchIfTrue(lblNonNull, pte, BasicBlockFrequency.Cold); + + // The call is not expected to return (it should throw). 
+ context.Call(typeof(NativeInterface).GetMethod(nameof(NativeInterface.ThrowInvalidMemoryAccess)), address); + context.MarkLabel(lblNonNull); + } + + Operand pageOffset = context.BitwiseAnd(address, Const(address.Type, PageMask)); + + if (pageOffset.Type == OperandType.I32) + { + pageOffset = context.ZeroExtend32(OperandType.I64, pageOffset); + } + + return context.Add(pte, pageOffset); + } + + public static Operand EmitHostMappedPointer(ArmEmitterContext context, Operand address) + { + if (address.Type == OperandType.I32) + { + address = context.ZeroExtend32(OperandType.I64, address); + } + + if (context.Memory.Type == MemoryManagerType.HostMapped) + { + Operand mask = Const(ulong.MaxValue >> (64 - context.Memory.AddressSpaceBits)); + address = context.BitwiseAnd(address, mask); + } + + Operand baseAddr = !context.HasPtc + ? Const(context.Memory.PageTablePointer.ToInt64()) + : Const(context.Memory.PageTablePointer.ToInt64(), Ptc.PageTableSymbol); + + return context.Add(baseAddr, address); + } + + private static void EmitReadIntFallback(ArmEmitterContext context, Operand address, int rt, int size) + { + SetInt(context, rt, EmitReadIntFallback(context, address, size)); + } + + private static Operand EmitReadIntFallback(ArmEmitterContext context, Operand address, int size) + { + MethodInfo info = null; + + switch (size) + { + case 0: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadByte)); break; + case 1: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadUInt16)); break; + case 2: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadUInt32)); break; + case 3: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadUInt64)); break; + } + + return context.Call(info, address); + } + + private static void EmitReadVectorFallback( + ArmEmitterContext context, + Operand address, + Operand vector, + int rt, + int elem, + int size) + { + MethodInfo info = null; + + switch (size) + { + case 0: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadByte)); break; + case 1: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadUInt16)); break; + case 2: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadUInt32)); break; + case 3: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadUInt64)); break; + case 4: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.ReadVector128)); break; + } + + Operand value = context.Call(info, address); + + switch (size) + { + case 0: value = context.VectorInsert8 (vector, value, elem); break; + case 1: value = context.VectorInsert16(vector, value, elem); break; + case 2: value = context.VectorInsert (vector, value, elem); break; + case 3: value = context.VectorInsert (vector, value, elem); break; + } + + context.Copy(GetVec(rt), value); + } + + private static void EmitWriteIntFallback(ArmEmitterContext context, Operand address, int rt, int size) + { + MethodInfo info = null; + + switch (size) + { + case 0: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteByte)); break; + case 1: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteUInt16)); break; + case 2: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteUInt32)); break; + case 3: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteUInt64)); break; + } + + Operand value = GetInt(context, rt); + + if (size < 3 && value.Type == OperandType.I64) + { + value = context.ConvertI64ToI32(value); + } + + context.Call(info, 
address, value); + } + + private static void EmitWriteVectorFallback( + ArmEmitterContext context, + Operand address, + int rt, + int elem, + int size) + { + MethodInfo info = null; + + switch (size) + { + case 0: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteByte)); break; + case 1: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteUInt16)); break; + case 2: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteUInt32)); break; + case 3: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteUInt64)); break; + case 4: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.WriteVector128)); break; + } + + Operand value = default; + + if (size < 4) + { + switch (size) + { + case 0: value = context.VectorExtract8 (GetVec(rt), elem); break; + case 1: value = context.VectorExtract16(GetVec(rt), elem); break; + case 2: value = context.VectorExtract (OperandType.I32, GetVec(rt), elem); break; + case 3: value = context.VectorExtract (OperandType.I64, GetVec(rt), elem); break; + } + } + else + { + value = GetVec(rt); + } + + context.Call(info, address, value); + } + + private static Operand GetInt(ArmEmitterContext context, int rt) + { + return context.CurrOp is OpCode32 ? GetIntA32(context, rt) : GetIntOrZR(context, rt); + } + + private static void SetInt(ArmEmitterContext context, int rt, Operand value) + { + if (context.CurrOp is OpCode32) + { + SetIntA32(context, rt, value); + } + else + { + SetIntOrZR(context, rt, value); + } + } + + // ARM32 helpers. + public static Operand GetMemM(ArmEmitterContext context, bool setCarry = true) + { + switch (context.CurrOp) + { + case IOpCode32MemRsImm op: return GetMShiftedByImmediate(context, op, setCarry); + + case IOpCode32MemReg op: return GetIntA32(context, op.Rm); + + case IOpCode32Mem op: return Const(op.Immediate); + + case OpCode32SimdMemImm op: return Const(op.Immediate); + + default: throw InvalidOpCodeType(context.CurrOp); + } + } + + private static Exception InvalidOpCodeType(OpCode opCode) + { + return new InvalidOperationException($"Invalid OpCode type \"{opCode?.GetType().Name ?? 
"null"}\"."); + } + + public static Operand GetMShiftedByImmediate(ArmEmitterContext context, IOpCode32MemRsImm op, bool setCarry) + { + Operand m = GetIntA32(context, op.Rm); + + int shift = op.Immediate; + + if (shift == 0) + { + switch (op.ShiftType) + { + case ShiftType.Lsr: shift = 32; break; + case ShiftType.Asr: shift = 32; break; + case ShiftType.Ror: shift = 1; break; + } + } + + if (shift != 0) + { + setCarry &= false; + + switch (op.ShiftType) + { + case ShiftType.Lsl: m = InstEmitAluHelper.GetLslC(context, m, setCarry, shift); break; + case ShiftType.Lsr: m = InstEmitAluHelper.GetLsrC(context, m, setCarry, shift); break; + case ShiftType.Asr: m = InstEmitAluHelper.GetAsrC(context, m, setCarry, shift); break; + case ShiftType.Ror: + if (op.Immediate != 0) + { + m = InstEmitAluHelper.GetRorC(context, m, setCarry, shift); + } + else + { + m = InstEmitAluHelper.GetRrxC(context, m, setCarry); + } + break; + } + } + + return m; + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitMove.cs b/src/ARMeilleure/Instructions/InstEmitMove.cs new file mode 100644 index 00000000..d551bf2d --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMove.cs @@ -0,0 +1,41 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Movk(ArmEmitterContext context) + { + OpCodeMov op = (OpCodeMov)context.CurrOp; + + OperandType type = op.GetOperandType(); + + Operand res = GetIntOrZR(context, op.Rd); + + res = context.BitwiseAnd(res, Const(type, ~(0xffffL << op.Bit))); + + res = context.BitwiseOr(res, Const(type, op.Immediate)); + + SetIntOrZR(context, op.Rd, res); + } + + public static void Movn(ArmEmitterContext context) + { + OpCodeMov op = (OpCodeMov)context.CurrOp; + + SetIntOrZR(context, op.Rd, Const(op.GetOperandType(), ~op.Immediate)); + } + + public static void Movz(ArmEmitterContext context) + { + OpCodeMov op = (OpCodeMov)context.CurrOp; + + SetIntOrZR(context, op.Rd, Const(op.GetOperandType(), op.Immediate)); + } + } +}
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitMul.cs b/src/ARMeilleure/Instructions/InstEmitMul.cs new file mode 100644 index 00000000..65d11b30 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMul.cs @@ -0,0 +1,100 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.Instructions.InstEmitHelper; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Madd(ArmEmitterContext context) => EmitMul(context, isAdd: true); + public static void Msub(ArmEmitterContext context) => EmitMul(context, isAdd: false); + + private static void EmitMul(ArmEmitterContext context, bool isAdd) + { + OpCodeMul op = (OpCodeMul)context.CurrOp; + + Operand a = GetIntOrZR(context, op.Ra); + Operand n = GetIntOrZR(context, op.Rn); + Operand m = GetIntOrZR(context, op.Rm); + + Operand res = context.Multiply(n, m); + + res = isAdd ? context.Add(a, res) : context.Subtract(a, res); + + SetIntOrZR(context, op.Rd, res); + } + + public static void Smaddl(ArmEmitterContext context) => EmitMull(context, MullFlags.SignedAdd); + public static void Smsubl(ArmEmitterContext context) => EmitMull(context, MullFlags.SignedSubtract); + public static void Umaddl(ArmEmitterContext context) => EmitMull(context, MullFlags.Add); + public static void Umsubl(ArmEmitterContext context) => EmitMull(context, MullFlags.Subtract); + + [Flags] + private enum MullFlags + { + Subtract = 0, + Add = 1 << 0, + Signed = 1 << 1, + + SignedAdd = Signed | Add, + SignedSubtract = Signed | Subtract + } + + private static void EmitMull(ArmEmitterContext context, MullFlags flags) + { + OpCodeMul op = (OpCodeMul)context.CurrOp; + + Operand GetExtendedRegister32(int index) + { + Operand value = GetIntOrZR(context, index); + + if ((flags & MullFlags.Signed) != 0) + { + return context.SignExtend32(value.Type, value); + } + else + { + return context.ZeroExtend32(value.Type, value); + } + } + + Operand a = GetIntOrZR(context, op.Ra); + + Operand n = GetExtendedRegister32(op.Rn); + Operand m = GetExtendedRegister32(op.Rm); + + Operand res = context.Multiply(n, m); + + res = (flags & MullFlags.Add) != 0 ? context.Add(a, res) : context.Subtract(a, res); + + SetIntOrZR(context, op.Rd, res); + } + + public static void Smulh(ArmEmitterContext context) + { + OpCodeMul op = (OpCodeMul)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + Operand m = GetIntOrZR(context, op.Rm); + + Operand d = context.Multiply64HighSI(n, m); + + SetIntOrZR(context, op.Rd, d); + } + + public static void Umulh(ArmEmitterContext context) + { + OpCodeMul op = (OpCodeMul)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + Operand m = GetIntOrZR(context, op.Rm); + + Operand d = context.Multiply64HighUI(n, m); + + SetIntOrZR(context, op.Rd, d); + } + } +}
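EmitMull above widens both 32-bit operands to 64 bits (sign- or zero-extended depending on the instruction) before multiplying and accumulating. A minimal standalone sketch of that widening multiply-accumulate, not part of this commit and using hypothetical names:

using System;

static class MullSketch
{
    // SMADDL: sign-extended 32x32 -> 64 multiply, added to the 64-bit accumulator.
    public static long Smaddl(long ra, int wn, int wm)
    {
        return ra + (long)wn * wm;
    }

    // UMADDL: zero-extended 32x32 -> 64 multiply, added to the 64-bit accumulator.
    public static ulong Umaddl(ulong ra, uint wn, uint wm)
    {
        return ra + (ulong)wn * wm;
    }

    static void Main()
    {
        Console.WriteLine(Smaddl(100, -3, 7));                  // 79
        Console.WriteLine($"0x{Umaddl(100, 0xffffffff, 2):x}"); // 0x200000062
    }
}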
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitMul32.cs b/src/ARMeilleure/Instructions/InstEmitMul32.cs new file mode 100644 index 00000000..0822f92c --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitMul32.cs @@ -0,0 +1,379 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.Instructions.InstEmitAluHelper; +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + [Flags] + private enum MullFlags + { + Subtract = 1, + Add = 1 << 1, + Signed = 1 << 2, + + SignedAdd = Signed | Add, + SignedSubtract = Signed | Subtract + } + + public static void Mla(ArmEmitterContext context) + { + IOpCode32AluMla op = (IOpCode32AluMla)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + Operand a = GetIntA32(context, op.Ra); + + Operand res = context.Add(a, context.Multiply(n, m)); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitAluStore(context, res); + } + + public static void Mls(ArmEmitterContext context) + { + IOpCode32AluMla op = (IOpCode32AluMla)context.CurrOp; + + Operand n = GetAluN(context); + Operand m = GetAluM(context); + Operand a = GetIntA32(context, op.Ra); + + Operand res = context.Subtract(a, context.Multiply(n, m)); + + EmitAluStore(context, res); + } + + public static void Smmla(ArmEmitterContext context) + { + EmitSmmul(context, MullFlags.SignedAdd); + } + + public static void Smmls(ArmEmitterContext context) + { + EmitSmmul(context, MullFlags.SignedSubtract); + } + + public static void Smmul(ArmEmitterContext context) + { + EmitSmmul(context, MullFlags.Signed); + } + + private static void EmitSmmul(ArmEmitterContext context, MullFlags flags) + { + IOpCode32AluMla op = (IOpCode32AluMla)context.CurrOp; + + Operand n = context.SignExtend32(OperandType.I64, GetIntA32(context, op.Rn)); + Operand m = context.SignExtend32(OperandType.I64, GetIntA32(context, op.Rm)); + + Operand res = context.Multiply(n, m); + + if (flags.HasFlag(MullFlags.Add) && op.Ra != 0xf) + { + res = context.Add(context.ShiftLeft(context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Ra)), Const(32)), res); + } + else if (flags.HasFlag(MullFlags.Subtract)) + { + res = context.Subtract(context.ShiftLeft(context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Ra)), Const(32)), res); + } + + if (op.R) + { + res = context.Add(res, Const(0x80000000L)); + } + + Operand hi = context.ConvertI64ToI32(context.ShiftRightSI(res, Const(32))); + + EmitGenericAluStoreA32(context, op.Rd, false, hi); + } + + public static void Smla__(ArmEmitterContext context) + { + IOpCode32AluMla op = (IOpCode32AluMla)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + Operand a = GetIntA32(context, op.Ra); + + if (op.NHigh) + { + n = context.SignExtend16(OperandType.I64, context.ShiftRightUI(n, Const(16))); + } + else + { + n = context.SignExtend16(OperandType.I64, n); + } + + if (op.MHigh) + { + m = context.SignExtend16(OperandType.I64, context.ShiftRightUI(m, Const(16))); + } + else + { + m = context.SignExtend16(OperandType.I64, m); + } + + Operand res = context.Multiply(n, m); + + Operand toAdd = context.SignExtend32(OperandType.I64, a); + res = context.Add(res, toAdd); + Operand q = context.ICompareNotEqual(res, 
context.SignExtend32(OperandType.I64, res)); + res = context.ConvertI64ToI32(res); + + UpdateQFlag(context, q); + + EmitGenericAluStoreA32(context, op.Rd, false, res); + } + + public static void Smlal(ArmEmitterContext context) + { + EmitMlal(context, true); + } + + public static void Smlal__(ArmEmitterContext context) + { + IOpCode32AluUmull op = (IOpCode32AluUmull)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + + if (op.NHigh) + { + n = context.SignExtend16(OperandType.I64, context.ShiftRightUI(n, Const(16))); + } + else + { + n = context.SignExtend16(OperandType.I64, n); + } + + if (op.MHigh) + { + m = context.SignExtend16(OperandType.I64, context.ShiftRightUI(m, Const(16))); + } + else + { + m = context.SignExtend16(OperandType.I64, m); + } + + Operand res = context.Multiply(n, m); + + Operand toAdd = context.ShiftLeft(context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.RdHi)), Const(32)); + toAdd = context.BitwiseOr(toAdd, context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.RdLo))); + res = context.Add(res, toAdd); + + Operand hi = context.ConvertI64ToI32(context.ShiftRightUI(res, Const(32))); + Operand lo = context.ConvertI64ToI32(res); + + EmitGenericAluStoreA32(context, op.RdHi, false, hi); + EmitGenericAluStoreA32(context, op.RdLo, false, lo); + } + + public static void Smlaw_(ArmEmitterContext context) + { + IOpCode32AluMla op = (IOpCode32AluMla)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + Operand a = GetIntA32(context, op.Ra); + + if (op.MHigh) + { + m = context.SignExtend16(OperandType.I64, context.ShiftRightUI(m, Const(16))); + } + else + { + m = context.SignExtend16(OperandType.I64, m); + } + + Operand res = context.Multiply(context.SignExtend32(OperandType.I64, n), m); + + Operand toAdd = context.ShiftLeft(context.SignExtend32(OperandType.I64, a), Const(16)); + res = context.Add(res, toAdd); + res = context.ShiftRightSI(res, Const(16)); + Operand q = context.ICompareNotEqual(res, context.SignExtend32(OperandType.I64, res)); + res = context.ConvertI64ToI32(res); + + UpdateQFlag(context, q); + + EmitGenericAluStoreA32(context, op.Rd, false, res); + } + + public static void Smul__(ArmEmitterContext context) + { + IOpCode32AluMla op = (IOpCode32AluMla)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + + if (op.NHigh) + { + n = context.ShiftRightSI(n, Const(16)); + } + else + { + n = context.SignExtend16(OperandType.I32, n); + } + + if (op.MHigh) + { + m = context.ShiftRightSI(m, Const(16)); + } + else + { + m = context.SignExtend16(OperandType.I32, m); + } + + Operand res = context.Multiply(n, m); + + EmitGenericAluStoreA32(context, op.Rd, false, res); + } + + public static void Smull(ArmEmitterContext context) + { + IOpCode32AluUmull op = (IOpCode32AluUmull)context.CurrOp; + + Operand n = context.SignExtend32(OperandType.I64, GetIntA32(context, op.Rn)); + Operand m = context.SignExtend32(OperandType.I64, GetIntA32(context, op.Rm)); + + Operand res = context.Multiply(n, m); + + Operand hi = context.ConvertI64ToI32(context.ShiftRightUI(res, Const(32))); + Operand lo = context.ConvertI64ToI32(res); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitGenericAluStoreA32(context, op.RdHi, ShouldSetFlags(context), hi); + EmitGenericAluStoreA32(context, op.RdLo, ShouldSetFlags(context), lo); + } + + public static void Smulw_(ArmEmitterContext context) + { + IOpCode32AluMla op = 
(IOpCode32AluMla)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + + if (op.MHigh) + { + m = context.SignExtend16(OperandType.I64, context.ShiftRightUI(m, Const(16))); + } + else + { + m = context.SignExtend16(OperandType.I64, m); + } + + Operand res = context.Multiply(context.SignExtend32(OperandType.I64, n), m); + + res = context.ShiftRightUI(res, Const(16)); + res = context.ConvertI64ToI32(res); + + EmitGenericAluStoreA32(context, op.Rd, false, res); + } + + public static void Umaal(ArmEmitterContext context) + { + IOpCode32AluUmull op = (IOpCode32AluUmull)context.CurrOp; + + Operand n = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Rn)); + Operand m = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Rm)); + Operand dHi = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.RdHi)); + Operand dLo = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.RdLo)); + + Operand res = context.Multiply(n, m); + res = context.Add(res, dHi); + res = context.Add(res, dLo); + + Operand hi = context.ConvertI64ToI32(context.ShiftRightUI(res, Const(32))); + Operand lo = context.ConvertI64ToI32(res); + + EmitGenericAluStoreA32(context, op.RdHi, false, hi); + EmitGenericAluStoreA32(context, op.RdLo, false, lo); + } + + public static void Umlal(ArmEmitterContext context) + { + EmitMlal(context, false); + } + + public static void Umull(ArmEmitterContext context) + { + IOpCode32AluUmull op = (IOpCode32AluUmull)context.CurrOp; + + Operand n = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Rn)); + Operand m = context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.Rm)); + + Operand res = context.Multiply(n, m); + + Operand hi = context.ConvertI64ToI32(context.ShiftRightUI(res, Const(32))); + Operand lo = context.ConvertI64ToI32(res); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitGenericAluStoreA32(context, op.RdHi, ShouldSetFlags(context), hi); + EmitGenericAluStoreA32(context, op.RdLo, ShouldSetFlags(context), lo); + } + + private static void EmitMlal(ArmEmitterContext context, bool signed) + { + IOpCode32AluUmull op = (IOpCode32AluUmull)context.CurrOp; + + Operand n = GetIntA32(context, op.Rn); + Operand m = GetIntA32(context, op.Rm); + + if (signed) + { + n = context.SignExtend32(OperandType.I64, n); + m = context.SignExtend32(OperandType.I64, m); + } + else + { + n = context.ZeroExtend32(OperandType.I64, n); + m = context.ZeroExtend32(OperandType.I64, m); + } + + Operand res = context.Multiply(n, m); + + Operand toAdd = context.ShiftLeft(context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.RdHi)), Const(32)); + toAdd = context.BitwiseOr(toAdd, context.ZeroExtend32(OperandType.I64, GetIntA32(context, op.RdLo))); + res = context.Add(res, toAdd); + + Operand hi = context.ConvertI64ToI32(context.ShiftRightUI(res, Const(32))); + Operand lo = context.ConvertI64ToI32(res); + + if (ShouldSetFlags(context)) + { + EmitNZFlagsCheck(context, res); + } + + EmitGenericAluStoreA32(context, op.RdHi, ShouldSetFlags(context), hi); + EmitGenericAluStoreA32(context, op.RdLo, ShouldSetFlags(context), lo); + } + + private static void UpdateQFlag(ArmEmitterContext context, Operand q) + { + Operand lblSkipSetQ = Label(); + + context.BranchIfFalse(lblSkipSetQ, q); + + SetFlag(context, PState.QFlag, Const(1)); + + context.MarkLabel(lblSkipSetQ); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/src/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs new 
file mode 100644 index 00000000..7e7f26b1 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs @@ -0,0 +1,5224 @@ +// https://github.com/intel/ARM_NEON_2_x86_SSE/blob/master/NEON_2_SSE.h +// https://www.agner.org/optimize/#vectorclass @ vectori128.h + +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper32; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + using Func2I = Func<Operand, Operand, Operand>; + + static partial class InstEmit + { + public static void Abs_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOp(context, Intrinsic.Arm64AbsS); + } + else + { + EmitScalarUnaryOpSx(context, (op1) => EmitAbs(context, op1)); + } + } + + public static void Abs_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64AbsV); + } + else + { + EmitVectorUnaryOpSx(context, (op1) => EmitAbs(context, op1)); + } + } + + public static void Add_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOp(context, Intrinsic.Arm64AddS); + } + else + { + EmitScalarBinaryOpZx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Add_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64AddV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + Operand res = context.AddIntrinsic(addInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Addhn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64AddhnV); + } + else + { + EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: false); + } + } + + public static void Addp_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOp(context, Intrinsic.Arm64AddpS); + } + else + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand ne0 = EmitVectorExtractZx(context, op.Rn, 0, op.Size); + Operand ne1 = EmitVectorExtractZx(context, op.Rn, 1, op.Size); + + Operand res = context.Add(ne0, ne1); + + context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, op.Size)); + } + } + + public static void Addp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64AddpV); + } + else if (Optimizations.UseSsse3) + { + EmitSsse3VectorPairwiseOp(context, X86PaddInstruction); + } + else + { + EmitVectorPairwiseOpZx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Addv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + 
InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64AddvV); + } + else + { + EmitVectorAcrossVectorOpZx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Cls_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64ClsV); + } + else + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + int eSize = 8 << op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingSigns)), ne, Const(eSize)); + + res = EmitVectorInsert(context, res, de, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Clz_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64ClzV); + } + else + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int eSize = 8 << op.Size; + + Operand res = eSize switch { + 8 => Clz_V_I8 (context, GetVec(op.Rn)), + 16 => Clz_V_I16(context, GetVec(op.Rn)), + 32 => Clz_V_I32(context, GetVec(op.Rn)), + _ => default + }; + + if (res != default) + { + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + } + else + { + int elems = op.GetBytesCount() >> op.Size; + + res = context.VectorZero(); + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize)); + + res = EmitVectorInsert(context, res, de, index, op.Size); + } + } + + context.Copy(GetVec(op.Rd), res); + } + } + + private static Operand Clz_V_I8(ArmEmitterContext context, Operand arg) + { + if (!Optimizations.UseSsse3) + { + return default; + } + + // CLZ nibble table. + Operand clzTable = X86GetScalar(context, 0x01_01_01_01_02_02_03_04); + + Operand maskLow = X86GetAllElements(context, 0x0f_0f_0f_0f); + Operand c04 = X86GetAllElements(context, 0x04_04_04_04); + + // CLZ of low 4 bits of elements in arg. + Operand loClz = context.AddIntrinsic(Intrinsic.X86Pshufb, clzTable, arg); + + // Get the high 4 bits of elements in arg. + Operand hiArg = context.AddIntrinsic(Intrinsic.X86Psrlw, arg, Const(4)); + hiArg = context.AddIntrinsic(Intrinsic.X86Pand, hiArg, maskLow); + + // CLZ of high 4 bits of elements in arg. + Operand hiClz = context.AddIntrinsic(Intrinsic.X86Pshufb, clzTable, hiArg); + + // If high 4 bits are not all zero, we discard the CLZ of the low 4 bits. + Operand mask = context.AddIntrinsic(Intrinsic.X86Pcmpeqb, hiClz, c04); + loClz = context.AddIntrinsic(Intrinsic.X86Pand, loClz, mask); + + return context.AddIntrinsic(Intrinsic.X86Paddb, loClz, hiClz); + } + + private static Operand Clz_V_I16(ArmEmitterContext context, Operand arg) + { + if (!Optimizations.UseSsse3) + { + return default; + } + + Operand maskSwap = X86GetElements(context, 0x80_0f_80_0d_80_0b_80_09, 0x80_07_80_05_80_03_80_01); + Operand maskLow = X86GetAllElements(context, 0x00ff_00ff); + Operand c0008 = X86GetAllElements(context, 0x0008_0008); + + // CLZ pair of high 8 and low 8 bits of elements in arg. + Operand hiloClz = Clz_V_I8(context, arg); + // Get CLZ of low 8 bits in each pair. 
+ Operand loClz = context.AddIntrinsic(Intrinsic.X86Pand, hiloClz, maskLow); + // Get CLZ of high 8 bits in each pair. + Operand hiClz = context.AddIntrinsic(Intrinsic.X86Pshufb, hiloClz, maskSwap); + + // If high 8 bits are not all zero, we discard the CLZ of the low 8 bits. + Operand mask = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, hiClz, c0008); + loClz = context.AddIntrinsic(Intrinsic.X86Pand, loClz, mask); + + return context.AddIntrinsic(Intrinsic.X86Paddw, loClz, hiClz); + } + + private static Operand Clz_V_I32(ArmEmitterContext context, Operand arg) + { + // TODO: Use vplzcntd when AVX-512 is supported. + if (!Optimizations.UseSse2) + { + return default; + } + + Operand AddVectorI32(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Paddd, op0, op1); + Operand SubVectorI32(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Psubd, op0, op1); + Operand ShiftRightVectorUI32(Operand op0, int imm8) => context.AddIntrinsic(Intrinsic.X86Psrld, op0, Const(imm8)); + Operand OrVector(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Por, op0, op1); + Operand AndVector(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Pand, op0, op1); + Operand NotVector(Operand op0) => context.AddIntrinsic(Intrinsic.X86Pandn, op0, context.VectorOne()); + + Operand c55555555 = X86GetAllElements(context, 0x55555555); + Operand c33333333 = X86GetAllElements(context, 0x33333333); + Operand c0f0f0f0f = X86GetAllElements(context, 0x0f0f0f0f); + Operand c0000003f = X86GetAllElements(context, 0x0000003f); + + Operand tmp0; + Operand tmp1; + Operand res; + + // Set all bits after highest set bit to 1. + res = OrVector(ShiftRightVectorUI32(arg, 1), arg); + res = OrVector(ShiftRightVectorUI32(res, 2), res); + res = OrVector(ShiftRightVectorUI32(res, 4), res); + res = OrVector(ShiftRightVectorUI32(res, 8), res); + res = OrVector(ShiftRightVectorUI32(res, 16), res); + + // Make leading 0s into leading 1s. + res = NotVector(res); + + // Count leading 1s, which is the population count. + tmp0 = ShiftRightVectorUI32(res, 1); + tmp0 = AndVector(tmp0, c55555555); + res = SubVectorI32(res, tmp0); + + tmp0 = ShiftRightVectorUI32(res, 2); + tmp0 = AndVector(tmp0, c33333333); + tmp1 = AndVector(res, c33333333); + res = AddVectorI32(tmp0, tmp1); + + tmp0 = ShiftRightVectorUI32(res, 4); + tmp0 = AddVectorI32(tmp0, res); + res = AndVector(tmp0, c0f0f0f0f); + + tmp0 = ShiftRightVectorUI32(res, 8); + res = AddVectorI32(tmp0, res); + + tmp0 = ShiftRightVectorUI32(res, 16); + res = AddVectorI32(tmp0, res); + + res = AndVector(res, c0000003f); + + return res; + } + + public static void Cnt_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64CntV); + } + else + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.RegisterSize == RegisterSize.Simd128 ? 
16 : 8; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, 0); + + Operand de; + + if (Optimizations.UsePopCnt) + { + de = context.AddIntrinsicLong(Intrinsic.X86Popcnt, ne); + } + else + { + de = EmitCountSetBits8(context, ne); + } + + res = EmitVectorInsert(context, res, de, index, 0); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Fabd_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FabdS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand res = context.AddIntrinsic(Intrinsic.X86Subss, GetVec(op.Rn), GetVec(op.Rm)); + + res = EmitFloatAbs(context, res, true, false); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + else /* if (sizeF == 1) */ + { + Operand res = context.AddIntrinsic(Intrinsic.X86Subsd, GetVec(op.Rn), GetVec(op.Rm)); + + res = EmitFloatAbs(context, res, false, false); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, op2); + + return EmitUnaryMathCall(context, nameof(Math.Abs), res); + }); + } + } + + public static void Fabd_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FabdV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand res = context.AddIntrinsic(Intrinsic.X86Subps, GetVec(op.Rn), GetVec(op.Rm)); + + res = EmitFloatAbs(context, res, true, true); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + Operand res = context.AddIntrinsic(Intrinsic.X86Subpd, GetVec(op.Rn), GetVec(op.Rm)); + + res = EmitFloatAbs(context, res, false, true); + + context.Copy(GetVec(op.Rd), res); + } + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, op2); + + return EmitUnaryMathCall(context, nameof(Math.Abs), res); + }); + } + } + + public static void Fabs_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FabsS); + } + else if (Optimizations.UseSse2) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + if (op.Size == 0) + { + Operand res = EmitFloatAbs(context, GetVec(op.Rn), true, false); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + else /* if (op.Size == 1) */ + { + Operand res = EmitFloatAbs(context, GetVec(op.Rn), false, false); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitUnaryMathCall(context, nameof(Math.Abs), op1); + }); + } + } + + public static void Fabs_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FabsV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand res = EmitFloatAbs(context, 
GetVec(op.Rn), true, true); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + Operand res = EmitFloatAbs(context, GetVec(op.Rn), false, true); + + context.Copy(GetVec(op.Rd), res); + } + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitUnaryMathCall(context, nameof(Math.Abs), op1); + }); + } + } + + public static void Fadd_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FaddS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarBinaryOpF(context, Intrinsic.X86Addss, Intrinsic.X86Addsd); + } + else if (Optimizations.FastFP) + { + EmitScalarBinaryOpF(context, (op1, op2) => context.Add(op1, op2)); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2); + }); + } + } + + public static void Fadd_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FaddV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF(context, Intrinsic.X86Addps, Intrinsic.X86Addpd); + } + else if (Optimizations.FastFP) + { + EmitVectorBinaryOpF(context, (op1, op2) => context.Add(op1, op2)); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2); + }); + } + } + + public static void Faddp_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FaddpS); + } + else if (Optimizations.FastFP && Optimizations.UseSse3) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + if ((op.Size & 1) == 0) + { + Operand res = context.AddIntrinsic(Intrinsic.X86Haddps, GetVec(op.Rn), GetVec(op.Rn)); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + else /* if ((op.Size & 1) == 1) */ + { + Operand res = context.AddIntrinsic(Intrinsic.X86Haddpd, GetVec(op.Rn), GetVec(op.Rn)); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2); + }); + } + } + + public static void Faddp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FaddpV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + Intrinsic addInst = (op.Size & 1) == 0 ? 
Intrinsic.X86Addps : Intrinsic.X86Addpd; + + return context.AddIntrinsic(addInst, op1, op2); + }, scalar: false, op1, op2); + }); + } + else + { + EmitVectorPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2); + }); + } + } + + public static void Fdiv_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FdivS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarBinaryOpF(context, Intrinsic.X86Divss, Intrinsic.X86Divsd); + } + else if (Optimizations.FastFP) + { + EmitScalarBinaryOpF(context, (op1, op2) => context.Divide(op1, op2)); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv), op1, op2); + }); + } + } + + public static void Fdiv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FdivV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF(context, Intrinsic.X86Divps, Intrinsic.X86Divpd); + } + else if (Optimizations.FastFP) + { + EmitVectorBinaryOpF(context, (op1, op2) => context.Divide(op1, op2)); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv), op1, op2); + }); + } + } + + public static void Fmadd_S(ArmEmitterContext context) // Fused. + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarTernaryOpF(context, Intrinsic.Arm64FmaddS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand a = GetVec(op.Ra); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res; + + if (op.Size == 0) + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, a, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addss, a, res); + } + + context.Copy(d, context.VectorZeroUpper96(res)); + } + else /* if (op.Size == 1) */ + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, a, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res); + } + + context.Copy(d, context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarTernaryRaOpF(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3); + }); + } + } + + public static void Fmax_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FmaxS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); + }, scalar: true); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2); + }); + } + } + + public static void Fmax_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmaxV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, 
op1, op2, isMax: true); + }, scalar: false); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2); + }); + } + } + + public static void Fmaxnm_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FmaxnmS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: true); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2); + }); + } + } + + public static void Fmaxnm_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmaxnmV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2); + }); + } + } + + public static void Fmaxnmp_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FmaxnmpS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2ScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: true, op1, op2); + }); + } + else + { + EmitScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2); + }); + } + } + + public static void Fmaxnmp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmaxnmpV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false, op1, op2); + }); + } + else + { + EmitVectorPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2); + }); + } + } + + public static void Fmaxnmv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FmaxnmvV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false, op1, op2); + }); + } + else + { + EmitVectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2); + }); + } + } + + public static void Fmaxp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmaxpV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); + }, scalar: false, op1, op2); + }); + } + else + { + EmitVectorPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2); + }); + } + } + + public static void Fmaxv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, 
Intrinsic.Arm64FmaxvV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); + }, scalar: false, op1, op2); + }); + } + else + { + EmitVectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2); + }); + } + } + + public static void Fmin_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FminS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); + }, scalar: true); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2); + }); + } + } + + public static void Fmin_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FminV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); + }, scalar: false); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2); + }); + } + } + + public static void Fminnm_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FminnmS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: true); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2); + }); + } + } + + public static void Fminnm_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FminnmV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2); + }); + } + } + + public static void Fminnmp_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FminnmpS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2ScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: true, op1, op2); + }); + } + else + { + EmitScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2); + }); + } + } + + public static void Fminnmp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FminnmpV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false, op1, op2); + }); + } + else + { + EmitVectorPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2); 
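Illustrative aside (not part of the commit): a simplified scalar sketch of why the fallbacks above call either FPMax/FPMin or FPMaxNum/FPMinNum. The plain variants propagate a NaN operand, while the *nm variants keep the numeric operand when only one input is NaN; this is also why the Sse41 paths route operands through EmitSse41ProcessNaNsOpF first, since bare maxps/minps simply return the second source when a NaN is involved. The real SoftFloat32 helpers additionally handle signalling NaNs, status flags and signed zeros, which this sketch ignores.

    using System;

    static class MaxNumDemo
    {
        // FMAX-style: any NaN input makes the result NaN.
        static float FpMax(float a, float b)
            => float.IsNaN(a) || float.IsNaN(b) ? float.NaN : MathF.Max(a, b);

        // FMAXNM-style: a single NaN input is ignored in favour of the number.
        static float FpMaxNum(float a, float b)
        {
            if (float.IsNaN(a)) return float.IsNaN(b) ? float.NaN : b;
            if (float.IsNaN(b)) return a;

            return MathF.Max(a, b);
        }

        static void Main()
        {
            Console.WriteLine(FpMax(1f, float.NaN));    // NaN
            Console.WriteLine(FpMaxNum(1f, float.NaN)); // 1
        }
    }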
+ }); + } + } + + public static void Fminnmv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FminnmvV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false, op1, op2); + }); + } + else + { + EmitVectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2); + }); + } + } + + public static void Fminp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FminpV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorPairwiseOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); + }, scalar: false, op1, op2); + }); + } + else + { + EmitVectorPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2); + }); + } + } + + public static void Fminv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FminvV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2VectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); + }, scalar: false, op1, op2); + }); + } + else + { + EmitVectorAcrossVectorOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2); + }); + } + } + + public static void Fmla_Se(ArmEmitterContext context) // Fused. + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlaSe); + } + else if (Optimizations.UseFma) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, d, n, res); + + context.Copy(d, context.VectorZeroUpper96(res)); + } + else /* if (sizeF == 1) */ + { + int shuffleMask = op.Index | op.Index << 1; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, d, n, res); + + context.Copy(d, context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarTernaryOpByElemF(context, (op1, op2, op3) => + { + return context.Add(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Fmla_V(ArmEmitterContext context) // Fused. 
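Illustrative aside (not part of the commit): the shuffleMask arithmetic in Fmla_Se above packs op.Index into every 2-bit field of the Shufps immediate, which broadcasts one lane of m across the whole vector before the fused multiply-add. A minimal standalone check with .NET hardware intrinsics, using index 2 (immediate 0xAA):

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static class BroadcastDemo
    {
        static void Main()
        {
            if (!Sse.IsSupported) return;

            Vector128<float> m = Vector128.Create(10f, 20f, 30f, 40f);

            // index | index << 2 | index << 4 | index << 6 with index == 2 gives 0xAA,
            // so every destination lane selects lane 2 of m.
            Vector128<float> broadcast = Sse.Shuffle(m, m, 0xAA);

            Console.WriteLine(broadcast); // <30, 30, 30, 30>
        }
    }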
+ { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpFRd(context, Intrinsic.Arm64FmlaV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + Operand res; + + if (sizeF == 0) + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addps, d, res); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else /* if (sizeF == 1) */ + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res); + } + + context.Copy(d, res); + } + } + else + { + EmitVectorTernaryOpF(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3); + }); + } + } + + public static void Fmla_Ve(ArmEmitterContext context) // Fused. + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpFRdByElem(context, Intrinsic.Arm64FmlaVe); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res); + res = context.AddIntrinsic(Intrinsic.X86Addps, d, res); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else /* if (sizeF == 1) */ + { + int shuffleMask = op.Index | op.Index << 1; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res); + res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res); + } + + context.Copy(d, res); + } + } + else + { + EmitVectorTernaryOpByElemF(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3); + }); + } + } + + public static void Fmls_Se(ArmEmitterContext context) // Fused. 
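Illustrative aside (not part of the commit): the reason these emitters are marked "// Fused." and prefer Vfmadd231* whenever Optimizations.UseFma is available. A fused multiply-add rounds once; the Mul followed by Add fallback rounds twice and can differ in the last bits. A small standalone demonstration:

    using System;

    static class FmaDemo
    {
        static void Main()
        {
            double n = 1e8 + 1;
            double m = 1e8 - 1;
            double a = -1e16;

            // Two roundings: n * m rounds up to 1e16, then the add cancels to 0.
            Console.WriteLine(n * m + a);                      // 0

            // Single rounding, as a fused multiply-add performs: exact n*m + a.
            Console.WriteLine(Math.FusedMultiplyAdd(n, m, a)); // -1
        }
    }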
+ { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlsSe); + } + else if (Optimizations.UseFma) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, d, n, res); + + context.Copy(d, context.VectorZeroUpper96(res)); + } + else /* if (sizeF == 1) */ + { + int shuffleMask = op.Index | op.Index << 1; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, d, n, res); + + context.Copy(d, context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarTernaryOpByElemF(context, (op1, op2, op3) => + { + return context.Subtract(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Fmls_V(ArmEmitterContext context) // Fused. + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpFRd(context, Intrinsic.Arm64FmlsV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + Operand res; + + if (sizeF == 0) + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subps, d, res); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else /* if (sizeF == 1) */ + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res); + } + + context.Copy(d, res); + } + } + else + { + EmitVectorTernaryOpF(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3); + }); + } + } + + public static void Fmls_Ve(ArmEmitterContext context) // Fused. 
+ { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpFRdByElem(context, Intrinsic.Arm64FmlsVe); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res); + res = context.AddIntrinsic(Intrinsic.X86Subps, d, res); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else /* if (sizeF == 1) */ + { + int shuffleMask = op.Index | op.Index << 1; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res); + res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res); + } + + context.Copy(d, res); + } + } + else + { + EmitVectorTernaryOpByElemF(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3); + }); + } + } + + public static void Fmsub_S(ArmEmitterContext context) // Fused. + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarTernaryOpF(context, Intrinsic.Arm64FmsubS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand a = GetVec(op.Ra); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res; + + if (op.Size == 0) + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, a, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subss, a, res); + } + + context.Copy(d, context.VectorZeroUpper96(res)); + } + else /* if (op.Size == 1) */ + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, a, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res); + } + + context.Copy(d, context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarTernaryRaOpF(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3); + }); + } + } + + public static void Fmul_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FmulS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarBinaryOpF(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd); + } + else if (Optimizations.FastFP) + { + EmitScalarBinaryOpF(context, (op1, op2) => context.Multiply(op1, op2)); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2); + }); + } + } + + public static void Fmul_Se(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpFByElem(context, Intrinsic.Arm64FmulSe); + } + else + { 
+ EmitScalarBinaryOpByElemF(context, (op1, op2) => context.Multiply(op1, op2)); + } + } + + public static void Fmul_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmulV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd); + } + else if (Optimizations.FastFP) + { + EmitVectorBinaryOpF(context, (op1, op2) => context.Multiply(op1, op2)); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2); + }); + } + } + + public static void Fmul_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpFByElem(context, Intrinsic.Arm64FmulVe); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + int shuffleMask = op.Index | op.Index << 1; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res); + + context.Copy(GetVec(op.Rd), res); + } + } + else if (Optimizations.FastFP) + { + EmitVectorBinaryOpByElemF(context, (op1, op2) => context.Multiply(op1, op2)); + } + else + { + EmitVectorBinaryOpByElemF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2); + }); + } + } + + public static void Fmulx_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FmulxS); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX), op1, op2); + }); + } + } + + public static void Fmulx_Se(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpFByElem(context, Intrinsic.Arm64FmulxSe); + } + else + { + EmitScalarBinaryOpByElemF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX), op1, op2); + }); + } + } + + public static void Fmulx_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmulxV); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX), op1, op2); + }); + } + } + + public static void Fmulx_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpFByElem(context, Intrinsic.Arm64FmulxVe); + } + else + { + EmitVectorBinaryOpByElemF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX), op1, op2); + }); + } + } + + public static void Fneg_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FnegS); + } + else if 
(Optimizations.UseSse2) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + if (op.Size == 0) + { + Operand mask = X86GetScalar(context, -0f); + + Operand res = context.AddIntrinsic(Intrinsic.X86Xorps, mask, GetVec(op.Rn)); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + else /* if (op.Size == 1) */ + { + Operand mask = X86GetScalar(context, -0d); + + Operand res = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, GetVec(op.Rn)); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarUnaryOpF(context, (op1) => context.Negate(op1)); + } + } + + public static void Fneg_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FnegV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand mask = X86GetAllElements(context, -0f); + + Operand res = context.AddIntrinsic(Intrinsic.X86Xorps, mask, GetVec(op.Rn)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + Operand mask = X86GetAllElements(context, -0d); + + Operand res = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, GetVec(op.Rn)); + + context.Copy(GetVec(op.Rd), res); + } + } + else + { + EmitVectorUnaryOpF(context, (op1) => context.Negate(op1)); + } + } + + public static void Fnmadd_S(ArmEmitterContext context) // Fused. + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarTernaryOpF(context, Intrinsic.Arm64FnmaddS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand a = GetVec(op.Ra); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res; + + if (op.Size == 0) + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231ss, a, n, m); + } + else + { + Operand mask = X86GetScalar(context, -0f); + Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a); + + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res); + } + + context.Copy(d, context.VectorZeroUpper96(res)); + } + else /* if (op.Size == 1) */ + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231sd, a, n, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a); + + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res); + } + + context.Copy(d, context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarTernaryRaOpF(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3); + }); + } + } + + public static void Fnmsub_S(ArmEmitterContext context) // Fused. 
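Illustrative aside (not part of the commit): Fneg_S and Fneg_V above negate by Xorps/Xorpd with a -0.0 mask instead of subtracting from zero. Flipping only the sign bit is branch-free and behaves correctly for signed zeros and NaN payloads; Fnmadd_S reuses the same mask to negate the accumulator in its non-FMA path. A scalar model of the trick:

    using System;

    static class SignFlipDemo
    {
        // Flip only the IEEE sign bit, as Xorps with -0.0 does.
        static float NegateBitwise(float x)
        {
            int bits = BitConverter.SingleToInt32Bits(x) ^ int.MinValue;

            return BitConverter.Int32BitsToSingle(bits);
        }

        static void Main()
        {
            Console.WriteLine(NegateBitwise(1.5f)); // -1.5
            Console.WriteLine(NegateBitwise(0f));   // -0 (0f - x would give +0)
            Console.WriteLine(NegateBitwise(-0f));  // 0
        }
    }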
+ { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarTernaryOpF(context, Intrinsic.Arm64FnmsubS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand a = GetVec(op.Ra); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res; + + if (op.Size == 0) + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmsub231ss, a, n, m); + } + else + { + Operand mask = X86GetScalar(context, -0f); + Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a); + + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res); + } + + context.Copy(d, context.VectorZeroUpper96(res)); + } + else /* if (op.Size == 1) */ + { + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmsub231sd, a, n, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a); + + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res); + } + + context.Copy(d, context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarTernaryRaOpF(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3); + }); + } + } + + public static void Fnmul_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FnmulS); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => context.Negate(context.Multiply(op1, op2))); + } + } + + public static void Frecpe_S(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrecpeS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0) + { + Operand res = EmitSse41Round32Exp8OpF(context, context.AddIntrinsic(Intrinsic.X86Rcpss, GetVec(op.Rn)), scalar: true); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipEstimate), op1); + }); + } + } + + public static void Frecpe_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrecpeV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0) + { + Operand res = EmitSse41Round32Exp8OpF(context, context.AddIntrinsic(Intrinsic.X86Rcpps, GetVec(op.Rn)), scalar: false); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipEstimate), op1); + }); + } + } + + public static void Frecps_S(ArmEmitterContext context) // Fused. 
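Illustrative aside (not part of the commit): Frecpe above only yields a coarse reciprocal estimate, and the FRECPS step emitted next computes 2 - n*m so that callers can refine that estimate with Newton-Raphson iterations. A scalar sketch of the refinement, with made-up sample values:

    using System;

    static class RecipStepDemo
    {
        static void Main()
        {
            double x = 3.0;
            double y = 0.3; // coarse estimate of 1/3, standing in for FRECPE's output

            for (int i = 0; i < 3; i++)
            {
                double step = 2.0 - x * y; // the quantity FRECPS produces
                y *= step;                 // one Newton-Raphson refinement

                Console.WriteLine(y);      // roughly 0.33, 0.3333, 0.33333333
            }
        }
    }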
+ { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FrecpsS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + Operand res; + + if (sizeF == 0) + { + Operand mask = X86GetScalar(context, 2f); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, mask, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res); + } + + res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + else /* if (sizeF == 1) */ + { + Operand mask = X86GetScalar(context, 2d); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, mask, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res); + } + + res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStepFused), op1, op2); + }); + } + } + + public static void Frecps_V(ArmEmitterContext context) // Fused. + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FrecpsV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + Operand res; + + if (sizeF == 0) + { + Operand mask = X86GetAllElements(context, 2f); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, mask, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res); + } + + res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + Operand mask = X86GetAllElements(context, 2d); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, mask, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res); + } + + res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF); + + context.Copy(GetVec(op.Rd), res); + } + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStepFused), op1, op2); + }); + } + } + + public static void Frecpx_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FrecpxS); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecpX), op1); + }); + } + } + + public static void Frinta_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintaS); + } + else if (Optimizations.UseSse41) + { + 
EmitSse41ScalarRoundOpF(context, FPRoundingMode.ToNearestAway); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1); + }); + } + } + + public static void Frinta_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintaV); + } + else if (Optimizations.UseSse41) + { + EmitSse41VectorRoundOpF(context, FPRoundingMode.ToNearestAway); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1); + }); + } + } + + public static void Frinti_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintiS); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitRoundByRMode(context, op1); + }); + } + } + + public static void Frinti_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintiV); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitRoundByRMode(context, op1); + }); + } + } + + public static void Frintm_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintmS); + } + else if (Optimizations.UseSse41) + { + EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsMinusInfinity); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitUnaryMathCall(context, nameof(Math.Floor), op1); + }); + } + } + + public static void Frintm_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintmV); + } + else if (Optimizations.UseSse41) + { + EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsMinusInfinity); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitUnaryMathCall(context, nameof(Math.Floor), op1); + }); + } + } + + public static void Frintn_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintnS); + } + else if (Optimizations.UseSse41) + { + EmitSse41ScalarRoundOpF(context, FPRoundingMode.ToNearest); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitRoundMathCall(context, MidpointRounding.ToEven, op1); + }); + } + } + + public static void Frintn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintnV); + } + else if (Optimizations.UseSse41) + { + EmitSse41VectorRoundOpF(context, FPRoundingMode.ToNearest); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitRoundMathCall(context, MidpointRounding.ToEven, op1); + }); + } + } + + public static void Frintp_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintpS); + } + else if (Optimizations.UseSse41) + { + EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsPlusInfinity); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitUnaryMathCall(context, nameof(Math.Ceiling), op1); + }); + } + } + + public static void Frintp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintpV); + } + else if 
(Optimizations.UseSse41) + { + EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsPlusInfinity); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitUnaryMathCall(context, nameof(Math.Ceiling), op1); + }); + } + } + + public static void Frintx_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintxS); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitRoundByRMode(context, op1); + }); + } + } + + public static void Frintx_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintxV); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitRoundByRMode(context, op1); + }); + } + } + + public static void Frintz_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintzS); + } + else if (Optimizations.UseSse41) + { + EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsZero); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitUnaryMathCall(context, nameof(Math.Truncate), op1); + }); + } + } + + public static void Frintz_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintzV); + } + else if (Optimizations.UseSse41) + { + EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsZero); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitUnaryMathCall(context, nameof(Math.Truncate), op1); + }); + } + } + + public static void Frsqrte_S(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrsqrteS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0) + { + Operand res = EmitSse41Round32Exp8OpF(context, context.AddIntrinsic(Intrinsic.X86Rsqrtss, GetVec(op.Rn)), scalar: true); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtEstimate), op1); + }); + } + } + + public static void Frsqrte_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrsqrteV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0) + { + Operand res = EmitSse41Round32Exp8OpF(context, context.AddIntrinsic(Intrinsic.X86Rsqrtps, GetVec(op.Rn)), scalar: false); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtEstimate), op1); + }); + } + } + + public static void Frsqrts_S(ArmEmitterContext context) // Fused. 
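Illustrative aside (not part of the commit): the Frint* fallbacks above map the ARM rounding modes directly onto System.Math calls (AwayFromZero for Frinta, ToEven for Frintn, Floor for Frintm, Ceiling for Frintp, Truncate for Frintz). A quick standalone reference for the midpoint case:

    using System;

    static class RoundingDemo
    {
        static void Main()
        {
            double x = 2.5;

            Console.WriteLine(Math.Round(x, MidpointRounding.ToEven));       // 2 (Frintn)
            Console.WriteLine(Math.Round(x, MidpointRounding.AwayFromZero)); // 3 (Frinta)
            Console.WriteLine(Math.Floor(x));                                // 2 (Frintm)
            Console.WriteLine(Math.Ceiling(x));                              // 3 (Frintp)
            Console.WriteLine(Math.Truncate(x));                             // 2 (Frintz)
        }
    }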
+ { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FrsqrtsS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + Operand res; + + if (sizeF == 0) + { + Operand maskHalf = X86GetScalar(context, 0.5f); + Operand maskThree = X86GetScalar(context, 3f); + Operand maskOneHalf = X86GetScalar(context, 1.5f); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, maskThree, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res); + } + + res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res); + res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + else /* if (sizeF == 1) */ + { + Operand maskHalf = X86GetScalar(context, 0.5d); + Operand maskThree = X86GetScalar(context, 3d); + Operand maskOneHalf = X86GetScalar(context, 1.5d); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, maskThree, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res); + } + + res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res); + res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res)); + } + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtStepFused), op1, op2); + }); + } + } + + public static void Frsqrts_V(ArmEmitterContext context) // Fused. 
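Illustrative aside (not part of the commit): Frsqrts_S above builds the reciprocal-square-root step (3 - n*m) * 0.5, which is one Newton-Raphson refinement of an FRSQRTE estimate. A scalar sketch with made-up sample values:

    using System;

    static class RsqrtStepDemo
    {
        static void Main()
        {
            double x = 2.0;
            double y = 0.7; // coarse estimate of 1/sqrt(2), standing in for FRSQRTE's output

            for (int i = 0; i < 3; i++)
            {
                double step = (3.0 - x * y * y) * 0.5; // the quantity FRSQRTS produces
                y *= step;                             // one Newton-Raphson refinement

                Console.WriteLine(y);                  // converges towards 0.7071067811...
            }
        }
    }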
+ { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FrsqrtsV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + Operand res; + + if (sizeF == 0) + { + Operand maskHalf = X86GetAllElements(context, 0.5f); + Operand maskThree = X86GetAllElements(context, 3f); + Operand maskOneHalf = X86GetAllElements(context, 1.5f); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, maskThree, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res); + } + + res = context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res); + res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + Operand maskHalf = X86GetAllElements(context, 0.5d); + Operand maskThree = X86GetAllElements(context, 3d); + Operand maskOneHalf = X86GetAllElements(context, 1.5d); + + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, maskThree, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res); + } + + res = context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res); + res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF); + + context.Copy(GetVec(op.Rd), res); + } + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtStepFused), op1, op2); + }); + } + } + + public static void Fsqrt_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FsqrtS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarUnaryOpF(context, Intrinsic.X86Sqrtss, Intrinsic.X86Sqrtsd); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt), op1); + }); + } + } + + public static void Fsqrt_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FsqrtV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorUnaryOpF(context, Intrinsic.X86Sqrtps, Intrinsic.X86Sqrtpd); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt), op1); + }); + } + } + + public static void Fsub_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FsubS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarBinaryOpF(context, Intrinsic.X86Subss, Intrinsic.X86Subsd); + } + else if (Optimizations.FastFP) + { + EmitScalarBinaryOpF(context, (op1, op2) => context.Subtract(op1, op2)); + } + else + { + EmitScalarBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, op2); + }); + } + } + + public static void Fsub_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, 
Intrinsic.Arm64FsubV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF(context, Intrinsic.X86Subps, Intrinsic.X86Subpd); + } + else if (Optimizations.FastFP) + { + EmitVectorBinaryOpF(context, (op1, op2) => context.Subtract(op1, op2)); + } + else + { + EmitVectorBinaryOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, op2); + }); + } + } + + public static void Mla_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64MlaV); + } + else if (Optimizations.UseSse41) + { + EmitSse41VectorMul_AddSub(context, AddSub.Add); + } + else + { + EmitVectorTernaryOpZx(context, (op1, op2, op3) => + { + return context.Add(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Mla_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64MlaVe); + } + else + { + EmitVectorTernaryOpByElemZx(context, (op1, op2, op3) => + { + return context.Add(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Mls_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64MlsV); + } + else if (Optimizations.UseSse41) + { + EmitSse41VectorMul_AddSub(context, AddSub.Subtract); + } + else + { + EmitVectorTernaryOpZx(context, (op1, op2, op3) => + { + return context.Subtract(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Mls_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64MlsVe); + } + else + { + EmitVectorTernaryOpByElemZx(context, (op1, op2, op3) => + { + return context.Subtract(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Mul_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64MulV); + } + else if (Optimizations.UseSse41) + { + EmitSse41VectorMul_AddSub(context, AddSub.None); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => context.Multiply(op1, op2)); + } + } + + public static void Mul_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpByElem(context, Intrinsic.Arm64MulVe); + } + else + { + EmitVectorBinaryOpByElemZx(context, (op1, op2) => context.Multiply(op1, op2)); + } + } + + public static void Neg_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOp(context, Intrinsic.Arm64NegS); + } + else + { + EmitScalarUnaryOpSx(context, (op1) => context.Negate(op1)); + } + } + + public static void Neg_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64NegV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Intrinsic subInst = X86PsubInstruction[op.Size]; + + Operand res = context.AddIntrinsic(subInst, context.VectorZero(), GetVec(op.Rn)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorUnaryOpSx(context, (op1) => context.Negate(op1)); + } + } + + public static void Pmull_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + 
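+ // Polynomial (carry-less) multiply. The fastest paths use the native PMULL or, for the
+ // 64x64->128 form (op.Size == 3), a single PCLMULQDQ whose imm8 selects the low or high
+ // input halves. The SSE4.1 fallback synthesises the product with a shift-and-XOR loop over
+ // each bit of the multiplier; otherwise per-element software helpers are used.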
+ if (Optimizations.UseArm64Pmull) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64PmullV); + } + else if (Optimizations.UsePclmulqdq && op.Size == 3) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int imm8 = op.RegisterSize == RegisterSize.Simd64 ? 0b0000_0000 : 0b0001_0001; + + Operand res = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, n, m, Const(imm8)); + + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseSse41) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd64) + { + n = context.VectorZeroUpper64(n); + m = context.VectorZeroUpper64(m); + } + else /* if (op.RegisterSize == RegisterSize.Simd128) */ + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Operand res = context.VectorZero(); + + if (op.Size == 0) + { + n = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, n); + m = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, m); + + for (int i = 0; i < 8; i++) + { + Operand mask = context.AddIntrinsic(Intrinsic.X86Psllw, n, Const(15 - i)); + mask = context.AddIntrinsic(Intrinsic.X86Psraw, mask, Const(15)); + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Psllw, m, Const(i)); + tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask); + + res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp); + } + } + else /* if (op.Size == 3) */ + { + Operand zero = context.VectorZero(); + + for (int i = 0; i < 64; i++) + { + Operand mask = context.AddIntrinsic(Intrinsic.X86Movlhps, n, n); + mask = context.AddIntrinsic(Intrinsic.X86Psllq, mask, Const(63 - i)); + mask = context.AddIntrinsic(Intrinsic.X86Psrlq, mask, Const(63)); + mask = context.AddIntrinsic(Intrinsic.X86Psubq, zero, mask); + + Operand tmp = EmitSse2Sll_128(context, m, i); + tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask); + + res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp); + } + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res; + + if (op.Size == 0) + { + res = context.VectorZero(); + + int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 8; + + for (int index = 0; index < 8; index++) + { + Operand ne = context.VectorExtract8(n, part + index); + Operand me = context.VectorExtract8(m, part + index); + + Operand de = EmitPolynomialMultiply(context, ne, me, 8); + + res = EmitVectorInsert(context, res, de, index, 1); + } + } + else /* if (op.Size == 3) */ + { + int part = op.RegisterSize == RegisterSize.Simd64 ? 
0 : 1; + + Operand ne = context.VectorExtract(OperandType.I64, n, part); + Operand me = context.VectorExtract(OperandType.I64, m, part); + + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Raddhn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64RaddhnV); + } + else + { + EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true); + } + } + + public static void Rsubhn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64RsubhnV); + } + else + { + EmitHighNarrow(context, (op1, op2) => context.Subtract(op1, op2), round: true); + } + } + + public static void Saba_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SabaV); + } + else + { + EmitVectorTernaryOpSx(context, (op1, op2, op3) => + { + return context.Add(op1, EmitAbs(context, context.Subtract(op2, op3))); + }); + } + } + + public static void Sabal_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SabalV); + } + else + { + EmitVectorWidenRnRmTernaryOpSx(context, (op1, op2, op3) => + { + return context.Add(op1, EmitAbs(context, context.Subtract(op2, op3))); + }); + } + } + + public static void Sabd_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SabdV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + EmitSse41VectorSabdOp(context, op, n, m, isLong: false); + } + else + { + EmitVectorBinaryOpSx(context, (op1, op2) => + { + return EmitAbs(context, context.Subtract(op1, op2)); + }); + } + } + + public static void Sabdl_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SabdlV); + } + else if (Optimizations.UseSse41 && op.Size < 2) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = op.Size == 0 + ? 
Intrinsic.X86Pmovsxbw + : Intrinsic.X86Pmovsxwd; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + EmitSse41VectorSabdOp(context, op, n, m, isLong: true); + } + else + { + EmitVectorWidenRnRmBinaryOpSx(context, (op1, op2) => + { + return EmitAbs(context, context.Subtract(op1, op2)); + }); + } + } + + public static void Sadalp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpRd(context, Intrinsic.Arm64SadalpV); + } + else + { + EmitAddLongPairwise(context, signed: true, accumulate: true); + } + } + + public static void Saddl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SaddlV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovsxInstruction[op.Size]; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + Intrinsic addInst = X86PaddInstruction[op.Size + 1]; + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(addInst, n, m)); + } + else + { + EmitVectorWidenRnRmBinaryOpSx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Saddlp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64SaddlpV); + } + else + { + EmitAddLongPairwise(context, signed: true, accumulate: false); + } + } + + public static void Saddlv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64SaddlvV); + } + else + { + EmitVectorLongAcrossVectorOpSx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Saddw_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SaddwV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovsxInstruction[op.Size]; + + m = context.AddIntrinsic(movInst, m); + + Intrinsic addInst = X86PaddInstruction[op.Size + 1]; + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(addInst, n, m)); + } + else + { + EmitVectorWidenRmBinaryOpSx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Shadd_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64ShaddV); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pand, n, m); + Operand res2 = context.AddIntrinsic(Intrinsic.X86Pxor, n, m); + + Intrinsic shiftInst = op.Size == 1 ? 
Intrinsic.X86Psraw : Intrinsic.X86Psrad; + + res2 = context.AddIntrinsic(shiftInst, res2, Const(1)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, res, res2); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpSx(context, (op1, op2) => + { + return context.ShiftRightSI(context.Add(op1, op2), Const(1)); + }); + } + } + + public static void Shsub_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64ShsubV); + } + else if (Optimizations.UseSse2 && op.Size < 2) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand mask = X86GetAllElements(context, (int)(op.Size == 0 ? 0x80808080u : 0x80008000u)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + Operand nPlusMask = context.AddIntrinsic(addInst, n, mask); + Operand mPlusMask = context.AddIntrinsic(addInst, m, mask); + + Intrinsic avgInst = op.Size == 0 ? Intrinsic.X86Pavgb : Intrinsic.X86Pavgw; + + Operand res = context.AddIntrinsic(avgInst, nPlusMask, mPlusMask); + + Intrinsic subInst = X86PsubInstruction[op.Size]; + + res = context.AddIntrinsic(subInst, nPlusMask, res); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpSx(context, (op1, op2) => + { + return context.ShiftRightSI(context.Subtract(op1, op2), Const(1)); + }); + } + } + + public static void Smax_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SmaxV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic maxInst = X86PmaxsInstruction[op.Size]; + + Operand res = context.AddIntrinsic(maxInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpSx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: true)); + } + } + + public static void Smaxp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SmaxpV); + } + else if (Optimizations.UseSsse3) + { + EmitSsse3VectorPairwiseOp(context, X86PmaxsInstruction); + } + else + { + EmitVectorPairwiseOpSx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: true)); + } + } + + public static void Smaxv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64SmaxvV); + } + else + { + EmitVectorAcrossVectorOpSx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: true)); + } + } + + public static void Smin_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SminV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic minInst = X86PminsInstruction[op.Size]; + + Operand res = context.AddIntrinsic(minInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = 
context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpSx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: true)); + } + } + + public static void Sminp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SminpV); + } + else if (Optimizations.UseSsse3) + { + EmitSsse3VectorPairwiseOp(context, X86PminsInstruction); + } + else + { + EmitVectorPairwiseOpSx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: true)); + } + } + + public static void Sminv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64SminvV); + } + else + { + EmitVectorAcrossVectorOpSx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: true)); + } + } + + public static void Smlal_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SmlalV); + } + else if (Optimizations.UseSse41 && op.Size < 2) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovsxInstruction[op.Size]; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + Intrinsic mullInst = op.Size == 0 ? Intrinsic.X86Pmullw : Intrinsic.X86Pmulld; + + Operand res = context.AddIntrinsic(mullInst, n, m); + + Intrinsic addInst = X86PaddInstruction[op.Size + 1]; + + context.Copy(d, context.AddIntrinsic(addInst, d, res)); + } + else + { + EmitVectorWidenRnRmTernaryOpSx(context, (op1, op2, op3) => + { + return context.Add(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Smlal_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64SmlalVe); + } + else + { + EmitVectorWidenTernaryOpByElemSx(context, (op1, op2, op3) => + { + return context.Add(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Smlsl_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SmlslV); + } + else if (Optimizations.UseSse41 && op.Size < 2) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = op.Size == 0 ? Intrinsic.X86Pmovsxbw : Intrinsic.X86Pmovsxwd; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + Intrinsic mullInst = op.Size == 0 ? 
Intrinsic.X86Pmullw : Intrinsic.X86Pmulld; + + Operand res = context.AddIntrinsic(mullInst, n, m); + + Intrinsic subInst = X86PsubInstruction[op.Size + 1]; + + context.Copy(d, context.AddIntrinsic(subInst, d, res)); + } + else + { + EmitVectorWidenRnRmTernaryOpSx(context, (op1, op2, op3) => + { + return context.Subtract(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Smlsl_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64SmlslVe); + } + else + { + EmitVectorWidenTernaryOpByElemSx(context, (op1, op2, op3) => + { + return context.Subtract(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Smull_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SmullV); + } + else + { + EmitVectorWidenRnRmBinaryOpSx(context, (op1, op2) => context.Multiply(op1, op2)); + } + } + + public static void Smull_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpByElem(context, Intrinsic.Arm64SmullVe); + } + else + { + EmitVectorWidenBinaryOpByElemSx(context, (op1, op2) => context.Multiply(op1, op2)); + } + } + + public static void Sqabs_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingUnaryOp(context, Intrinsic.Arm64SqabsS); + } + else + { + EmitScalarSaturatingUnaryOpSx(context, (op1) => EmitAbs(context, op1)); + } + } + + public static void Sqabs_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingUnaryOp(context, Intrinsic.Arm64SqabsV); + } + else + { + EmitVectorSaturatingUnaryOpSx(context, (op1) => EmitAbs(context, op1)); + } + } + + public static void Sqadd_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64SqaddS); + } + else + { + EmitScalarSaturatingBinaryOpSx(context, flags: SaturatingFlags.Add); + } + } + + public static void Sqadd_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqaddV); + } + else + { + EmitVectorSaturatingBinaryOpSx(context, flags: SaturatingFlags.Add); + } + } + + public static void Sqdmulh_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64SqdmulhS); + } + else + { + EmitScalarSaturatingBinaryOpSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: false)); + } + } + + public static void Sqdmulh_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqdmulhV); + } + else + { + EmitVectorSaturatingBinaryOpSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: false)); + } + } + + public static void Sqdmulh_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpByElem(context, Intrinsic.Arm64SqdmulhVe); + } + else + { + EmitVectorSaturatingBinaryOpByElemSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: false)); + } + } + + public static void Sqneg_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + 
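+ // SQNEG: signed saturating negate (the most negative value saturates instead of wrapping).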
InstEmitSimdHelperArm64.EmitScalarSaturatingUnaryOp(context, Intrinsic.Arm64SqnegS); + } + else + { + EmitScalarSaturatingUnaryOpSx(context, (op1) => context.Negate(op1)); + } + } + + public static void Sqneg_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingUnaryOp(context, Intrinsic.Arm64SqnegV); + } + else + { + EmitVectorSaturatingUnaryOpSx(context, (op1) => context.Negate(op1)); + } + } + + public static void Sqrdmulh_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64SqrdmulhS); + } + else + { + EmitScalarSaturatingBinaryOpSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: true)); + } + } + + public static void Sqrdmulh_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqrdmulhV); + } + else + { + EmitVectorSaturatingBinaryOpSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: true)); + } + } + + public static void Sqrdmulh_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpByElem(context, Intrinsic.Arm64SqrdmulhVe); + } + else + { + EmitVectorSaturatingBinaryOpByElemSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: true)); + } + } + + public static void Sqsub_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64SqsubS); + } + else + { + EmitScalarSaturatingBinaryOpSx(context, flags: SaturatingFlags.Sub); + } + } + + public static void Sqsub_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqsubV); + } + else + { + EmitVectorSaturatingBinaryOpSx(context, flags: SaturatingFlags.Sub); + } + } + + public static void Sqxtn_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64SqxtnS); + } + else + { + EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarSxSx); + } + } + + public static void Sqxtn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64SqxtnV); + } + else + { + EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorSxSx); + } + } + + public static void Sqxtun_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64SqxtunS); + } + else + { + EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarSxZx); + } + } + + public static void Sqxtun_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64SqxtunV); + } + else + { + EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorSxZx); + } + } + + public static void Srhadd_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SrhaddV); + } + else if (Optimizations.UseSse2 && op.Size < 2) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand mask = X86GetAllElements(context, 
(int)(op.Size == 0 ? 0x80808080u : 0x80008000u)); + + Intrinsic subInst = X86PsubInstruction[op.Size]; + + Operand nMinusMask = context.AddIntrinsic(subInst, n, mask); + Operand mMinusMask = context.AddIntrinsic(subInst, m, mask); + + Intrinsic avgInst = op.Size == 0 ? Intrinsic.X86Pavgb : Intrinsic.X86Pavgw; + + Operand res = context.AddIntrinsic(avgInst, nMinusMask, mMinusMask); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, mask, res); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpSx(context, (op1, op2) => + { + Operand res = context.Add(op1, op2); + + res = context.Add(res, Const(1L)); + + return context.ShiftRightSI(res, Const(1)); + }); + } + } + + public static void Ssubl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SsublV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovsxInstruction[op.Size]; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + Intrinsic subInst = X86PsubInstruction[op.Size + 1]; + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(subInst, n, m)); + } + else + { + EmitVectorWidenRnRmBinaryOpSx(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + public static void Ssubw_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SsubwV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovsxInstruction[op.Size]; + + m = context.AddIntrinsic(movInst, m); + + Intrinsic subInst = X86PsubInstruction[op.Size + 1]; + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(subInst, n, m)); + } + else + { + EmitVectorWidenRmBinaryOpSx(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + public static void Sub_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOp(context, Intrinsic.Arm64SubS); + } + else + { + EmitScalarBinaryOpZx(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + public static void Sub_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SubV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic subInst = X86PsubInstruction[op.Size]; + + Operand res = context.AddIntrinsic(subInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + public static void Subhn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + 
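+ // SUBHN: subtract in the doubled element size, then narrow by keeping the high half of each difference.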
InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SubhnV); + } + else + { + EmitHighNarrow(context, (op1, op2) => context.Subtract(op1, op2), round: false); + } + } + + public static void Suqadd_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64SuqaddS); + } + else + { + EmitScalarSaturatingBinaryOpSx(context, flags: SaturatingFlags.Accumulate); + } + } + + public static void Suqadd_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64SuqaddV); + } + else + { + EmitVectorSaturatingBinaryOpSx(context, flags: SaturatingFlags.Accumulate); + } + } + + public static void Uaba_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64UabaV); + } + else + { + EmitVectorTernaryOpZx(context, (op1, op2, op3) => + { + return context.Add(op1, EmitAbs(context, context.Subtract(op2, op3))); + }); + } + } + + public static void Uabal_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64UabalV); + } + else + { + EmitVectorWidenRnRmTernaryOpZx(context, (op1, op2, op3) => + { + return context.Add(op1, EmitAbs(context, context.Subtract(op2, op3))); + }); + } + } + + public static void Uabd_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UabdV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + EmitSse41VectorUabdOp(context, op, n, m, isLong: false); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => + { + return EmitAbs(context, context.Subtract(op1, op2)); + }); + } + } + + public static void Uabdl_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UabdlV); + } + else if (Optimizations.UseSse41 && op.Size < 2) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = op.Size == 0 + ? 
Intrinsic.X86Pmovzxbw + : Intrinsic.X86Pmovzxwd; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + EmitSse41VectorUabdOp(context, op, n, m, isLong: true); + } + else + { + EmitVectorWidenRnRmBinaryOpZx(context, (op1, op2) => + { + return EmitAbs(context, context.Subtract(op1, op2)); + }); + } + } + + public static void Uadalp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpRd(context, Intrinsic.Arm64UadalpV); + } + else + { + EmitAddLongPairwise(context, signed: false, accumulate: true); + } + } + + public static void Uaddl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UaddlV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovzxInstruction[op.Size]; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + Intrinsic addInst = X86PaddInstruction[op.Size + 1]; + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(addInst, n, m)); + } + else + { + EmitVectorWidenRnRmBinaryOpZx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Uaddlp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64UaddlpV); + } + else + { + EmitAddLongPairwise(context, signed: false, accumulate: false); + } + } + + public static void Uaddlv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64UaddlvV); + } + else + { + EmitVectorLongAcrossVectorOpZx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Uaddw_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UaddwV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovzxInstruction[op.Size]; + + m = context.AddIntrinsic(movInst, m); + + Intrinsic addInst = X86PaddInstruction[op.Size + 1]; + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(addInst, n, m)); + } + else + { + EmitVectorWidenRmBinaryOpZx(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Uhadd_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UhaddV); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pand, n, m); + Operand res2 = context.AddIntrinsic(Intrinsic.X86Pxor, n, m); + + Intrinsic shiftInst = op.Size == 1 ? 
Intrinsic.X86Psrlw : Intrinsic.X86Psrld; + + res2 = context.AddIntrinsic(shiftInst, res2, Const(1)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, res, res2); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => + { + return context.ShiftRightUI(context.Add(op1, op2), Const(1)); + }); + } + } + + public static void Uhsub_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UhsubV); + } + else if (Optimizations.UseSse2 && op.Size < 2) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic avgInst = op.Size == 0 ? Intrinsic.X86Pavgb : Intrinsic.X86Pavgw; + + Operand res = context.AddIntrinsic(avgInst, n, m); + + Intrinsic subInst = X86PsubInstruction[op.Size]; + + res = context.AddIntrinsic(subInst, n, res); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => + { + return context.ShiftRightUI(context.Subtract(op1, op2), Const(1)); + }); + } + } + + public static void Umax_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UmaxV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic maxInst = X86PmaxuInstruction[op.Size]; + + Operand res = context.AddIntrinsic(maxInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: false)); + } + } + + public static void Umaxp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UmaxpV); + } + else if (Optimizations.UseSsse3) + { + EmitSsse3VectorPairwiseOp(context, X86PmaxuInstruction); + } + else + { + EmitVectorPairwiseOpZx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: false)); + } + } + + public static void Umaxv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64UmaxvV); + } + else + { + EmitVectorAcrossVectorOpZx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: false)); + } + } + + public static void Umin_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UminV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic minInst = X86PminuInstruction[op.Size]; + + Operand res = context.AddIntrinsic(minInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: false)); + } + } + + public static void Uminp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + 
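+ // UMINP: unsigned pairwise minimum over adjacent element pairs of the concatenated sources.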
InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UminpV); + } + else if (Optimizations.UseSsse3) + { + EmitSsse3VectorPairwiseOp(context, X86PminuInstruction); + } + else + { + EmitVectorPairwiseOpZx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: false)); + } + } + + public static void Uminv_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64UminvV); + } + else + { + EmitVectorAcrossVectorOpZx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: false)); + } + } + + public static void Umlal_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64UmlalV); + } + else if (Optimizations.UseSse41 && op.Size < 2) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovzxInstruction[op.Size]; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + Intrinsic mullInst = op.Size == 0 ? Intrinsic.X86Pmullw : Intrinsic.X86Pmulld; + + Operand res = context.AddIntrinsic(mullInst, n, m); + + Intrinsic addInst = X86PaddInstruction[op.Size + 1]; + + context.Copy(d, context.AddIntrinsic(addInst, d, res)); + } + else + { + EmitVectorWidenRnRmTernaryOpZx(context, (op1, op2, op3) => + { + return context.Add(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Umlal_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64UmlalVe); + } + else + { + EmitVectorWidenTernaryOpByElemZx(context, (op1, op2, op3) => + { + return context.Add(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Umlsl_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64UmlslV); + } + else if (Optimizations.UseSse41 && op.Size < 2) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = op.Size == 0 ? Intrinsic.X86Pmovzxbw : Intrinsic.X86Pmovzxwd; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + Intrinsic mullInst = op.Size == 0 ? 
Intrinsic.X86Pmullw : Intrinsic.X86Pmulld; + + Operand res = context.AddIntrinsic(mullInst, n, m); + + Intrinsic subInst = X86PsubInstruction[op.Size + 1]; + + context.Copy(d, context.AddIntrinsic(subInst, d, res)); + } + else + { + EmitVectorWidenRnRmTernaryOpZx(context, (op1, op2, op3) => + { + return context.Subtract(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Umlsl_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64UmlslVe); + } + else + { + EmitVectorWidenTernaryOpByElemZx(context, (op1, op2, op3) => + { + return context.Subtract(op1, context.Multiply(op2, op3)); + }); + } + } + + public static void Umull_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UmullV); + } + else + { + EmitVectorWidenRnRmBinaryOpZx(context, (op1, op2) => context.Multiply(op1, op2)); + } + } + + public static void Umull_Ve(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpByElem(context, Intrinsic.Arm64UmullVe); + } + else + { + EmitVectorWidenBinaryOpByElemZx(context, (op1, op2) => context.Multiply(op1, op2)); + } + } + + public static void Uqadd_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64UqaddS); + } + else + { + EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add); + } + } + + public static void Uqadd_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64UqaddV); + } + else + { + EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Add); + } + } + + public static void Uqsub_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64UqsubS); + } + else + { + EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Sub); + } + } + + public static void Uqsub_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64UqsubV); + } + else + { + EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Sub); + } + } + + public static void Uqxtn_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64UqxtnS); + } + else + { + EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarZxZx); + } + } + + public static void Uqxtn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64UqxtnV); + } + else + { + EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorZxZx); + } + } + + public static void Urhadd_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UrhaddV); + } + else if (Optimizations.UseSse2 && op.Size < 2) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic avgInst = op.Size == 0 ? 
Intrinsic.X86Pavgb : Intrinsic.X86Pavgw; + + Operand res = context.AddIntrinsic(avgInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => + { + Operand res = context.Add(op1, op2); + + res = context.Add(res, Const(1L)); + + return context.ShiftRightUI(res, Const(1)); + }); + } + } + + public static void Usqadd_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64UsqaddS); + } + else + { + EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Accumulate); + } + } + + public static void Usqadd_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64UsqaddV); + } + else + { + EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Accumulate); + } + } + + public static void Usubl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UsublV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovzxInstruction[op.Size]; + + n = context.AddIntrinsic(movInst, n); + m = context.AddIntrinsic(movInst, m); + + Intrinsic subInst = X86PsubInstruction[op.Size + 1]; + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(subInst, n, m)); + } + else + { + EmitVectorWidenRnRmBinaryOpZx(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + public static void Usubw_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UsubwV); + } + else if (Optimizations.UseSse41) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8)); + } + + Intrinsic movInst = X86PmovzxInstruction[op.Size]; + + m = context.AddIntrinsic(movInst, m); + + Intrinsic subInst = X86PsubInstruction[op.Size + 1]; + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(subInst, n, m)); + } + else + { + EmitVectorWidenRmBinaryOpZx(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + private static Operand EmitAbs(ArmEmitterContext context, Operand value) + { + Operand isPositive = context.ICompareGreaterOrEqual(value, Const(value.Type, 0)); + + return context.ConditionalSelect(isPositive, value, context.Negate(value)); + } + + private static void EmitAddLongPairwise(ArmEmitterContext context, bool signed, bool accumulate) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int pairs = op.GetPairsCount() >> op.Size; + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + + Operand ne0 = EmitVectorExtract(context, op.Rn, pairIndex, op.Size, signed); + Operand ne1 = EmitVectorExtract(context, op.Rn, pairIndex + 1, op.Size, signed); + + Operand e = context.Add(ne0, ne1); + + if (accumulate) + { + Operand de = EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed); + + e = 
context.Add(e, de); + } + + res = EmitVectorInsert(context, res, e, index, op.Size + 1); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static Operand EmitDoublingMultiplyHighHalf( + ArmEmitterContext context, + Operand n, + Operand m, + bool round) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + int eSize = 8 << op.Size; + + Operand res = context.Multiply(n, m); + + if (!round) + { + res = context.ShiftRightSI(res, Const(eSize - 1)); + } + else + { + long roundConst = 1L << (eSize - 1); + + res = context.ShiftLeft(res, Const(1)); + + res = context.Add(res, Const(roundConst)); + + res = context.ShiftRightSI(res, Const(eSize)); + + Operand isIntMin = context.ICompareEqual(res, Const((long)int.MinValue)); + + res = context.ConditionalSelect(isIntMin, context.Negate(res), res); + } + + return res; + } + + private static void EmitHighNarrow(ArmEmitterContext context, Func2I emit, bool round) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + int elems = 8 >> op.Size; + int eSize = 8 << op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + Operand d = GetVec(op.Rd); + + Operand res = part == 0 ? context.VectorZero() : context.Copy(d); + + long roundConst = 1L << (eSize - 1); + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size + 1); + Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size + 1); + + Operand de = emit(ne, me); + + if (round) + { + de = context.Add(de, Const(roundConst)); + } + + de = context.ShiftRightUI(de, Const(eSize)); + + res = EmitVectorInsert(context, res, de, part + index, op.Size); + } + + context.Copy(d, res); + } + + private static Operand EmitMax64Op(ArmEmitterContext context, Operand op1, Operand op2, bool signed) + { + Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64); + + Operand cmp = signed + ? context.ICompareGreaterOrEqual (op1, op2) + : context.ICompareGreaterOrEqualUI(op1, op2); + + return context.ConditionalSelect(cmp, op1, op2); + } + + private static Operand EmitMin64Op(ArmEmitterContext context, Operand op1, Operand op2, bool signed) + { + Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64); + + Operand cmp = signed + ? context.ICompareLessOrEqual (op1, op2) + : context.ICompareLessOrEqualUI(op1, op2); + + return context.ConditionalSelect(cmp, op1, op2); + } + + private static void EmitSse41ScalarRoundOpF(ArmEmitterContext context, FPRoundingMode roundMode) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand res; + + if (roundMode != FPRoundingMode.ToNearestAway) + { + Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundsd : Intrinsic.X86Roundss; + + res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode))); + } + else + { + res = EmitSse41RoundToNearestWithTiesToAwayOpF(context, n, scalar: true); + } + + if ((op.Size & 1) != 0) + { + res = context.VectorZeroUpper64(res); + } + else + { + res = context.VectorZeroUpper96(res); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitSse41VectorRoundOpF(ArmEmitterContext context, FPRoundingMode roundMode) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand res; + + if (roundMode != FPRoundingMode.ToNearestAway) + { + Intrinsic inst = (op.Size & 1) != 0 ? 
Intrinsic.X86Roundpd : Intrinsic.X86Roundps; + + res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode))); + } + else + { + res = EmitSse41RoundToNearestWithTiesToAwayOpF(context, n, scalar: false); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static Operand EmitSse41Round32Exp8OpF(ArmEmitterContext context, Operand value, bool scalar) + { + Operand roundMask; + Operand truncMask; + Operand expMask; + + if (scalar) + { + roundMask = X86GetScalar(context, 0x4000); + truncMask = X86GetScalar(context, unchecked((int)0xFFFF8000)); + expMask = X86GetScalar(context, 0x7F800000); + } + else + { + roundMask = X86GetAllElements(context, 0x4000); + truncMask = X86GetAllElements(context, unchecked((int)0xFFFF8000)); + expMask = X86GetAllElements(context, 0x7F800000); + } + + Operand oValue = value; + Operand masked = context.AddIntrinsic(Intrinsic.X86Pand, value, expMask); + Operand isNaNInf = context.AddIntrinsic(Intrinsic.X86Pcmpeqd, masked, expMask); + + value = context.AddIntrinsic(Intrinsic.X86Paddd, value, roundMask); + value = context.AddIntrinsic(Intrinsic.X86Pand, value, truncMask); + + return context.AddIntrinsic(Intrinsic.X86Blendvps, value, oValue, isNaNInf); + } + + private static Operand EmitSse41RecipStepSelectOpF( + ArmEmitterContext context, + Operand n, + Operand m, + Operand res, + Operand mask, + bool scalar, + int sizeF) + { + Intrinsic cmpOp; + Intrinsic shlOp; + Intrinsic blendOp; + Operand zero = context.VectorZero(); + Operand expMask; + + if (sizeF == 0) + { + cmpOp = Intrinsic.X86Pcmpeqd; + shlOp = Intrinsic.X86Pslld; + blendOp = Intrinsic.X86Blendvps; + expMask = scalar ? X86GetScalar(context, 0x7F800000 << 1) : X86GetAllElements(context, 0x7F800000 << 1); + } + else /* if (sizeF == 1) */ + { + cmpOp = Intrinsic.X86Pcmpeqq; + shlOp = Intrinsic.X86Psllq; + blendOp = Intrinsic.X86Blendvpd; + expMask = scalar ? X86GetScalar(context, 0x7FF0000000000000L << 1) : X86GetAllElements(context, 0x7FF0000000000000L << 1); + } + + n = context.AddIntrinsic(shlOp, n, Const(1)); + m = context.AddIntrinsic(shlOp, m, Const(1)); + + Operand nZero = context.AddIntrinsic(cmpOp, n, zero); + Operand mZero = context.AddIntrinsic(cmpOp, m, zero); + Operand nInf = context.AddIntrinsic(cmpOp, n, expMask); + Operand mInf = context.AddIntrinsic(cmpOp, m, expMask); + + Operand nmZero = context.AddIntrinsic(Intrinsic.X86Por, nZero, mZero); + Operand nmInf = context.AddIntrinsic(Intrinsic.X86Por, nInf, mInf); + Operand nmZeroInf = context.AddIntrinsic(Intrinsic.X86Pand, nmZero, nmInf); + + return context.AddIntrinsic(blendOp, res, mask, nmZeroInf); + } + + public static void EmitSse2VectorIsNaNOpF( + ArmEmitterContext context, + Operand opF, + out Operand qNaNMask, + out Operand sNaNMask, + bool? isQNaN = null) + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + if ((op.Size & 1) == 0) + { + const int QBit = 22; + + Operand qMask = X86GetAllElements(context, 1 << QBit); + + Operand mask1 = context.AddIntrinsic(Intrinsic.X86Cmpps, opF, opF, Const((int)CmpCondition.UnorderedQ)); + + Operand mask2 = context.AddIntrinsic(Intrinsic.X86Pand, opF, qMask); + mask2 = context.AddIntrinsic(Intrinsic.X86Cmpps, mask2, qMask, Const((int)CmpCondition.Equal)); + + qNaNMask = isQNaN == null || (bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andps, mask2, mask1) : default; + sNaNMask = isQNaN == null || !(bool)isQNaN ? 
context.AddIntrinsic(Intrinsic.X86Andnps, mask2, mask1) : default; + } + else /* if ((op.Size & 1) == 1) */ + { + const int QBit = 51; + + Operand qMask = X86GetAllElements(context, 1L << QBit); + + Operand mask1 = context.AddIntrinsic(Intrinsic.X86Cmppd, opF, opF, Const((int)CmpCondition.UnorderedQ)); + + Operand mask2 = context.AddIntrinsic(Intrinsic.X86Pand, opF, qMask); + mask2 = context.AddIntrinsic(Intrinsic.X86Cmppd, mask2, qMask, Const((int)CmpCondition.Equal)); + + qNaNMask = isQNaN == null || (bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andpd, mask2, mask1) : default; + sNaNMask = isQNaN == null || !(bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andnpd, mask2, mask1) : default; + } + } + + public static Operand EmitSse41ProcessNaNsOpF( + ArmEmitterContext context, + Func2I emit, + bool scalar, + Operand n = default, + Operand m = default) + { + Operand nCopy = n == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)) : n; + Operand mCopy = m == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)) : m; + + EmitSse2VectorIsNaNOpF(context, nCopy, out Operand nQNaNMask, out Operand nSNaNMask); + EmitSse2VectorIsNaNOpF(context, mCopy, out _, out Operand mSNaNMask, isQNaN: false); + + int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1; + + if (sizeF == 0) + { + const int QBit = 22; + + Operand qMask = scalar ? X86GetScalar(context, 1 << QBit) : X86GetAllElements(context, 1 << QBit); + + Operand resNaNMask = context.AddIntrinsic(Intrinsic.X86Pandn, mSNaNMask, nQNaNMask); + resNaNMask = context.AddIntrinsic(Intrinsic.X86Por, resNaNMask, nSNaNMask); + + Operand resNaN = context.AddIntrinsic(Intrinsic.X86Blendvps, mCopy, nCopy, resNaNMask); + resNaN = context.AddIntrinsic(Intrinsic.X86Por, resNaN, qMask); + + Operand resMask = context.AddIntrinsic(Intrinsic.X86Cmpps, nCopy, mCopy, Const((int)CmpCondition.OrderedQ)); + + Operand res = context.AddIntrinsic(Intrinsic.X86Blendvps, resNaN, emit(nCopy, mCopy), resMask); + + if (n != default || m != default) + { + return res; + } + + if (scalar) + { + res = context.VectorZeroUpper96(res); + } + else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); + + return default; + } + else /* if (sizeF == 1) */ + { + const int QBit = 51; + + Operand qMask = scalar ? X86GetScalar(context, 1L << QBit) : X86GetAllElements(context, 1L << QBit); + + Operand resNaNMask = context.AddIntrinsic(Intrinsic.X86Pandn, mSNaNMask, nQNaNMask); + resNaNMask = context.AddIntrinsic(Intrinsic.X86Por, resNaNMask, nSNaNMask); + + Operand resNaN = context.AddIntrinsic(Intrinsic.X86Blendvpd, mCopy, nCopy, resNaNMask); + resNaN = context.AddIntrinsic(Intrinsic.X86Por, resNaN, qMask); + + Operand resMask = context.AddIntrinsic(Intrinsic.X86Cmppd, nCopy, mCopy, Const((int)CmpCondition.OrderedQ)); + + Operand res = context.AddIntrinsic(Intrinsic.X86Blendvpd, resNaN, emit(nCopy, mCopy), resMask); + + if (n != default || m != default) + { + return res; + } + + if (scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); + + return default; + } + } + + private static Operand EmitSse2VectorMaxMinOpF(ArmEmitterContext context, Operand n, Operand m, bool isMax) + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + if ((op.Size & 1) == 0) + { + Operand mask = X86GetAllElements(context, -0f); + + Operand res = context.AddIntrinsic(isMax ? 
Intrinsic.X86Maxps : Intrinsic.X86Minps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Andnps, mask, res); + + Operand resSign = context.AddIntrinsic(isMax ? Intrinsic.X86Pand : Intrinsic.X86Por, n, m); + resSign = context.AddIntrinsic(Intrinsic.X86Andps, mask, resSign); + + return context.AddIntrinsic(Intrinsic.X86Por, res, resSign); + } + else /* if ((op.Size & 1) == 1) */ + { + Operand mask = X86GetAllElements(context, -0d); + + Operand res = context.AddIntrinsic(isMax ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Andnpd, mask, res); + + Operand resSign = context.AddIntrinsic(isMax ? Intrinsic.X86Pand : Intrinsic.X86Por, n, m); + resSign = context.AddIntrinsic(Intrinsic.X86Andpd, mask, resSign); + + return context.AddIntrinsic(Intrinsic.X86Por, res, resSign); + } + } + + private static Operand EmitSse41MaxMinNumOpF( + ArmEmitterContext context, + bool isMaxNum, + bool scalar, + Operand n = default, + Operand m = default) + { + Operand nCopy = n == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)) : n; + Operand mCopy = m == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)) : m; + + EmitSse2VectorIsNaNOpF(context, nCopy, out Operand nQNaNMask, out _, isQNaN: true); + EmitSse2VectorIsNaNOpF(context, mCopy, out Operand mQNaNMask, out _, isQNaN: true); + + int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1; + + if (sizeF == 0) + { + Operand negInfMask = scalar + ? X86GetScalar (context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity) + : X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity); + + Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnps, mQNaNMask, nQNaNMask); + Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnps, nQNaNMask, mQNaNMask); + + nCopy = context.AddIntrinsic(Intrinsic.X86Blendvps, nCopy, negInfMask, nMask); + mCopy = context.AddIntrinsic(Intrinsic.X86Blendvps, mCopy, negInfMask, mMask); + + Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum); + }, scalar: scalar, nCopy, mCopy); + + if (n != default || m != default) + { + return res; + } + + if (scalar) + { + res = context.VectorZeroUpper96(res); + } + else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); + + return default; + } + else /* if (sizeF == 1) */ + { + Operand negInfMask = scalar + ? X86GetScalar (context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity) + : X86GetAllElements(context, isMaxNum ? 
double.NegativeInfinity : double.PositiveInfinity); + + Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnpd, mQNaNMask, nQNaNMask); + Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnpd, nQNaNMask, mQNaNMask); + + nCopy = context.AddIntrinsic(Intrinsic.X86Blendvpd, nCopy, negInfMask, nMask); + mCopy = context.AddIntrinsic(Intrinsic.X86Blendvpd, mCopy, negInfMask, mMask); + + Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) => + { + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum); + }, scalar: scalar, nCopy, mCopy); + + if (n != default || m != default) + { + return res; + } + + if (scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); + + return default; + } + } + + private enum AddSub + { + None, + Add, + Subtract + } + + private static void EmitSse41VectorMul_AddSub(ArmEmitterContext context, AddSub addSub) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res; + + if (op.Size == 0) + { + Operand ns8 = context.AddIntrinsic(Intrinsic.X86Psrlw, n, Const(8)); + Operand ms8 = context.AddIntrinsic(Intrinsic.X86Psrlw, m, Const(8)); + + res = context.AddIntrinsic(Intrinsic.X86Pmullw, ns8, ms8); + + res = context.AddIntrinsic(Intrinsic.X86Psllw, res, Const(8)); + + Operand res2 = context.AddIntrinsic(Intrinsic.X86Pmullw, n, m); + + Operand mask = X86GetAllElements(context, 0x00FF00FF); + + res = context.AddIntrinsic(Intrinsic.X86Pblendvb, res, res2, mask); + } + else if (op.Size == 1) + { + res = context.AddIntrinsic(Intrinsic.X86Pmullw, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Pmulld, n, m); + } + + Operand d = GetVec(op.Rd); + + if (addSub == AddSub.Add) + { + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, d, res); + } + else if (addSub == AddSub.Subtract) + { + Intrinsic subInst = X86PsubInstruction[op.Size]; + + res = context.AddIntrinsic(subInst, d, res); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + + private static void EmitSse41VectorSabdOp( + ArmEmitterContext context, + OpCodeSimdReg op, + Operand n, + Operand m, + bool isLong) + { + int size = isLong ? op.Size + 1 : op.Size; + + Intrinsic cmpgtInst = X86PcmpgtInstruction[size]; + + Operand cmpMask = context.AddIntrinsic(cmpgtInst, n, m); + + Intrinsic subInst = X86PsubInstruction[size]; + + Operand res = context.AddIntrinsic(subInst, n, m); + + res = context.AddIntrinsic(Intrinsic.X86Pand, cmpMask, res); + + Operand res2 = context.AddIntrinsic(subInst, m, n); + + res2 = context.AddIntrinsic(Intrinsic.X86Pandn, cmpMask, res2); + + res = context.AddIntrinsic(Intrinsic.X86Por, res, res2); + + if (!isLong && op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitSse41VectorUabdOp( + ArmEmitterContext context, + OpCodeSimdReg op, + Operand n, + Operand m, + bool isLong) + { + int size = isLong ? 
op.Size + 1 : op.Size; + + Intrinsic maxInst = X86PmaxuInstruction[size]; + + Operand max = context.AddIntrinsic(maxInst, m, n); + + Intrinsic cmpeqInst = X86PcmpeqInstruction[size]; + + Operand cmpMask = context.AddIntrinsic(cmpeqInst, max, m); + + Operand onesMask = X86GetAllElements(context, -1L); + + cmpMask = context.AddIntrinsic(Intrinsic.X86Pandn, cmpMask, onesMask); + + Intrinsic subInst = X86PsubInstruction[size]; + + Operand res = context.AddIntrinsic(subInst, n, m); + Operand res2 = context.AddIntrinsic(subInst, m, n); + + res = context.AddIntrinsic(Intrinsic.X86Pand, cmpMask, res); + res2 = context.AddIntrinsic(Intrinsic.X86Pandn, cmpMask, res2); + + res = context.AddIntrinsic(Intrinsic.X86Por, res, res2); + + if (!isLong && op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift) + { + // The upper part of op is assumed to be zero. + Debug.Assert(shift >= 0 && shift < 64); + + if (shift == 0) + { + return op; + } + + Operand high = context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8)); + high = context.AddIntrinsic(Intrinsic.X86Psrlq, high, Const(64 - shift)); + + Operand low = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift)); + + return context.AddIntrinsic(Intrinsic.X86Por, high, low); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/src/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs new file mode 100644 index 00000000..a9994e41 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs @@ -0,0 +1,1703 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.Instructions.InstEmitFlowHelper; +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper32; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Vabd_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitVectorBinaryOpI32(context, (op1, op2) => EmitAbs(context, context.Subtract(op1, op2)), !op.U); + } + + public static void Vabdl_I(ArmEmitterContext context) + { + OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp; + + EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitAbs(context, context.Subtract(op1, op2)), !op.U); + } + + public static void Vabs_S(ArmEmitterContext context) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FabsS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarUnaryOpSimd32(context, (m) => + { + return EmitFloatAbs(context, m, (op.Size & 1) == 0, false); + }); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Abs), op1)); + } + } + + public static void Vabs_V(ArmEmitterContext context) + { + OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FabsV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + 
EmitVectorUnaryOpSimd32(context, (m) => + { + return EmitFloatAbs(context, m, (op.Size & 1) == 0, true); + }); + } + else + { + EmitVectorUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Abs), op1)); + } + } + else + { + EmitVectorUnaryOpSx32(context, (op1) => EmitAbs(context, op1)); + } + } + + private static Operand EmitAbs(ArmEmitterContext context, Operand value) + { + Operand isPositive = context.ICompareGreaterOrEqual(value, Const(value.Type, 0)); + + return context.ConditionalSelect(isPositive, value, context.Negate(value)); + } + + public static void Vadd_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FaddS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarBinaryOpF32(context, Intrinsic.X86Addss, Intrinsic.X86Addsd); + } + else if (Optimizations.FastFP) + { + EmitScalarBinaryOpF32(context, (op1, op2) => context.Add(op1, op2)); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2)); + } + } + + public static void Vadd_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FaddV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF32(context, Intrinsic.X86Addps, Intrinsic.X86Addpd); + } + else if (Optimizations.FastFP) + { + EmitVectorBinaryOpF32(context, (op1, op2) => context.Add(op1, op2)); + } + else + { + EmitVectorBinaryOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPAddFpscr), op1, op2)); + } + } + + public static void Vadd_I(ArmEmitterContext context) + { + if (Optimizations.UseSse2) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PaddInstruction[op.Size], op1, op2)); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2)); + } + } + + public static void Vaddl_I(ArmEmitterContext context) + { + OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp; + + EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U); + } + + public static void Vaddw_I(ArmEmitterContext context) + { + OpCode32SimdRegWide op = (OpCode32SimdRegWide)context.CurrOp; + + EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U); + } + + public static void Vcnt(ArmEmitterContext context) + { + OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp; + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount(); + + for (int index = 0; index < elems; index++) + { + Operand de; + Operand me = EmitVectorExtractZx32(context, op.Qm, op.Im + index, op.Size); + + if (Optimizations.UsePopCnt) + { + de = context.AddIntrinsicInt(Intrinsic.X86Popcnt, me); + } + else + { + de = EmitCountSetBits8(context, me); + } + + res = EmitVectorInsert(context, res, de, op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Vdup(ArmEmitterContext context) + { + OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp; + + Operand insert = GetIntA32(context, op.Rt); + + // Zero extend into an I64, then replicate. Saves the most time over elementwise inserts. 
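+ // For example, with Size == 0 a byte such as 0xAB times 0x0101010101010101 gives 0xABABABABABABABAB, so a single multiply broadcasts the element across the 64-bit lane.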
+ insert = op.Size switch + { + 2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)), + 1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)), + 0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)), + _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".") + }; + + InsertScalar(context, op.Vd, insert); + if (op.Q) + { + InsertScalar(context, op.Vd + 1, insert); + } + } + + public static void Vdup_1(ArmEmitterContext context) + { + OpCode32SimdDupElem op = (OpCode32SimdDupElem)context.CurrOp; + + Operand insert = EmitVectorExtractZx32(context, op.Vm >> 1, ((op.Vm & 1) << (3 - op.Size)) + op.Index, op.Size); + + // Zero extend into an I64, then replicate. Saves the most time over elementwise inserts. + insert = op.Size switch + { + 2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)), + 1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)), + 0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)), + _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".") + }; + + InsertScalar(context, op.Vd, insert); + if (op.Q) + { + InsertScalar(context, op.Vd | 1, insert); + } + } + + private static (long, long) MaskHelperByteSequence(int start, int length, int startByte) + { + int end = start + length; + int b = startByte; + long result = 0; + long result2 = 0; + for (int i = 0; i < 8; i++) + { + result |= (long)((i >= end || i < start) ? 0x80 : b++) << (i * 8); + } + for (int i = 8; i < 16; i++) + { + result2 |= (long)((i >= end || i < start) ? 0x80 : b++) << ((i - 8) * 8); + } + return (result2, result); + } + + public static void Vext(ArmEmitterContext context) + { + OpCode32SimdExt op = (OpCode32SimdExt)context.CurrOp; + int elems = op.GetBytesCount(); + int byteOff = op.Immediate; + + if (Optimizations.UseSsse3) + { + EmitVectorBinaryOpSimd32(context, (n, m) => + { + // Writing low to high of d: start <imm> into n, overlap into m. + // Then rotate n down by <imm>, m up by (elems)-imm. + // Then OR them together for the result. + + (long nMaskHigh, long nMaskLow) = MaskHelperByteSequence(0, elems - byteOff, byteOff); + (long mMaskHigh, long mMaskLow) = MaskHelperByteSequence(elems - byteOff, byteOff, 0); + Operand nMask, mMask; + if (!op.Q) + { + // Do the same operation to the bytes in the top doubleword too, as our target could be in either. + nMaskHigh = nMaskLow + 0x0808080808080808L; + mMaskHigh = mMaskLow + 0x0808080808080808L; + } + nMask = X86GetElements(context, nMaskHigh, nMaskLow); + mMask = X86GetElements(context, mMaskHigh, mMaskLow); + Operand nPart = context.AddIntrinsic(Intrinsic.X86Pshufb, n, nMask); + Operand mPart = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mMask); + + return context.AddIntrinsic(Intrinsic.X86Por, nPart, mPart); + }); + } + else + { + Operand res = GetVecA32(op.Qd); + + for (int index = 0; index < elems; index++) + { + Operand extract; + + if (byteOff >= elems) + { + extract = EmitVectorExtractZx32(context, op.Qm, op.Im + (byteOff - elems), op.Size); + } + else + { + extract = EmitVectorExtractZx32(context, op.Qn, op.In + byteOff, op.Size); + } + byteOff++; + + res = EmitVectorInsert(context, res, extract, op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + } + + public static void Vfma_S(ArmEmitterContext context) // Fused. 
+ { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FmaddS); + } + else if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmadd231ss, Intrinsic.X86Vfmadd231sd); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd); + } + else + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3); + }); + } + } + + public static void Vfma_V(ArmEmitterContext context) // Fused. + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorTernaryOpF32(context, Intrinsic.Arm64FmlaV); + } + else if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps); + } + else + { + EmitVectorTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3); + }); + } + } + + public static void Vfms_S(ArmEmitterContext context) // Fused. + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FmsubS); + } + else if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmadd231ss, Intrinsic.X86Vfnmadd231sd); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd); + } + else + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3); + }); + } + } + + public static void Vfms_V(ArmEmitterContext context) // Fused. + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorTernaryOpF32(context, Intrinsic.Arm64FmlsV); + } + else if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitVectorTernaryOpF32(context, Intrinsic.X86Vfnmadd231ps); + } + else + { + EmitVectorTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3); + }); + } + } + + public static void Vfnma_S(ArmEmitterContext context) // Fused. + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FnmaddS); + } + else if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmsub231ss, Intrinsic.X86Vfnmsub231sd); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true); + } + else + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3); + }); + } + } + + public static void Vfnms_S(ArmEmitterContext context) // Fused. 
+ { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FnmsubS); + } + else if (Optimizations.FastFP && Optimizations.UseFma) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmsub231ss, Intrinsic.X86Vfmsub231sd); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true); + } + else + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3); + }); + } + } + + public static void Vhadd(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (op.U) + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.ShiftRightUI(context.Add(op1, op2), Const(1))); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => context.ShiftRightSI(context.Add(op1, op2), Const(1))); + } + } + + public static void Vmov_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarUnaryOpF32(context, 0, 0); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => op1); + } + } + + public static void Vmovn(ArmEmitterContext context) + { + EmitVectorUnaryNarrowOp32(context, (op1) => op1); + } + + public static void Vneg_S(ArmEmitterContext context) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FnegS); + } + else if (Optimizations.UseSse2) + { + EmitScalarUnaryOpSimd32(context, (m) => + { + if ((op.Size & 1) == 0) + { + Operand mask = X86GetScalar(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m); + } + }); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => context.Negate(op1)); + } + } + + public static void Vnmul_S(ArmEmitterContext context) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FnmulS); + } + else if (Optimizations.UseSse2) + { + EmitScalarBinaryOpSimd32(context, (n, m) => + { + if ((op.Size & 1) == 0) + { + Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + Operand mask = X86GetScalar(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res); + } + else + { + Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + Operand mask = X86GetScalar(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res); + } + }); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => context.Negate(context.Multiply(op1, op2))); + } + } + + public static void Vnmla_S(ArmEmitterContext context) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FnmaddS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true); + } + else if (Optimizations.FastFP) + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + return context.Subtract(context.Negate(op1), 
context.Multiply(op2, op3)); + }); + } + else + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3); + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), context.Negate(op1), res); + }); + } + } + + public static void Vnmls_S(ArmEmitterContext context) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FnmsubS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true); + } + else if (Optimizations.FastFP) + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + return context.Add(context.Negate(op1), context.Multiply(op2, op3)); + }); + } + else + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3); + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), context.Negate(op1), res); + }); + } + } + + public static void Vneg_V(ArmEmitterContext context) + { + OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FnegV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + if ((op.Size & 1) == 0) + { + Operand mask = X86GetAllElements(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m); + } + else + { + Operand mask = X86GetAllElements(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m); + } + }); + } + else + { + EmitVectorUnaryOpF32(context, (op1) => context.Negate(op1)); + } + } + else + { + EmitVectorUnaryOpSx32(context, (op1) => context.Negate(op1)); + } + } + + public static void Vdiv_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FdivS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarBinaryOpF32(context, Intrinsic.X86Divss, Intrinsic.X86Divsd); + } + else if (Optimizations.FastFP) + { + EmitScalarBinaryOpF32(context, (op1, op2) => context.Divide(op1, op2)); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv), op1, op2); + }); + } + } + + public static void Vmaxnm_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FmaxnmS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF32(context, true, true); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2)); + } + } + + public static void Vmaxnm_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FmaxnmV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF32(context, true, false); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMaxNumFpscr), 
op1, op2)); + } + } + + public static void Vminnm_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FminnmS); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF32(context, false, true); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2)); + } + } + + public static void Vminnm_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FminnmV); + } + else if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF32(context, false, false); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinNumFpscr), op1, op2)); + } + } + + public static void Vmax_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FmaxV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF32(context, Intrinsic.X86Maxps, Intrinsic.X86Maxpd); + } + else + { + EmitVectorBinaryOpF32(context, (op1, op2) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMaxFpscr), op1, op2); + }); + } + } + + public static void Vmax_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (op.U) + { + if (Optimizations.UseSse2) + { + EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxuInstruction[op.Size], op1, op2)); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2)); + } + } + else + { + if (Optimizations.UseSse2) + { + EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxsInstruction[op.Size], op1, op2)); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2)); + } + } + } + + public static void Vmin_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FminV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF32(context, Intrinsic.X86Minps, Intrinsic.X86Minpd); + } + else + { + EmitVectorBinaryOpF32(context, (op1, op2) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinFpscr), op1, op2); + }); + } + } + + public static void Vmin_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (op.U) + { + if (Optimizations.UseSse2) + { + EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2)); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2)); + } + } + else + { + if (Optimizations.UseSse2) + { + EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminsInstruction[op.Size], op1, op2)); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2)); + } + } + } + + public static void Vmla_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && 
Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FmaddS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd); + } + else if (Optimizations.FastFP) + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + return context.Add(op1, context.Multiply(op2, op3)); + }); + } + else + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3); + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, res); + }); + } + } + + public static void Vmla_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorTernaryOpF32(context, Intrinsic.Arm64FmlaV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd); + } + else if (Optimizations.FastFP) + { + EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3))); + } + else + { + EmitVectorTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3); + }); + } + } + + public static void Vmla_I(ArmEmitterContext context) + { + EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3))); + } + + public static void Vmla_1(ArmEmitterContext context) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd); + } + else if (Optimizations.FastFP) + { + EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3))); + } + else + { + EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3)); + } + } + else + { + EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false); + } + } + + public static void Vmlal_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitVectorTernaryLongOpI32(context, (d, n, m) => context.Add(d, context.Multiply(n, m)), !op.U); + } + + public static void Vmls_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FmlsV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd); + } + else if (Optimizations.FastFP) + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + return context.Subtract(op1, context.Multiply(op2, op3)); + }); + } + else + { + EmitScalarTernaryOpF32(context, (op1, op2, op3) => + { + Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3); + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, res); + }); + } + } + + public static void Vmls_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorTernaryOpF32(context, Intrinsic.Arm64FmlsV); + } + 
else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd); + } + else if (Optimizations.FastFP) + { + EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3))); + } + else + { + EmitVectorTernaryOpF32(context, (op1, op2, op3) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3); + }); + } + } + + public static void Vmls_I(ArmEmitterContext context) + { + EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3))); + } + + public static void Vmls_1(ArmEmitterContext context) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd); + } + else if (Optimizations.FastFP) + { + EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3))); + } + else + { + EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3)); + } + } + else + { + EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false); + } + } + + public static void Vmlsl_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitVectorTernaryLongOpI32(context, (opD, op1, op2) => context.Subtract(opD, context.Multiply(op1, op2)), !op.U); + } + + public static void Vmul_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FmulS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd); + } + else if (Optimizations.FastFP) + { + EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2)); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2); + }); + } + } + + public static void Vmul_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FmulV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd); + } + else if (Optimizations.FastFP) + { + EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2)); + } + else + { + EmitVectorBinaryOpF32(context, (op1, op2) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulFpscr), op1, op2); + }); + } + } + + public static void Vmul_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (op.U) // This instruction is always signed, U indicates polynomial mode. 
+ { + EmitVectorBinaryOpZx32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size)); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2)); + } + } + + public static void Vmul_1(ArmEmitterContext context) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd); + } + else if (Optimizations.FastFP) + { + EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2)); + } + else + { + EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulFpscr), op1, op2)); + } + } + else + { + EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false); + } + } + + public static void Vmull_1(ArmEmitterContext context) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + EmitVectorByScalarLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U); + } + + public static void Vmull_I(ArmEmitterContext context) + { + OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp; + + if (op.Polynomial) + { + if (op.Size == 0) // P8 + { + EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false); + } + else /* if (op.Size == 2) // P64 */ + { + Operand ne = context.VectorExtract(OperandType.I64, GetVec(op.Qn), op.Vn & 1); + Operand me = context.VectorExtract(OperandType.I64, GetVec(op.Qm), op.Vm & 1); + + Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me); + + context.Copy(GetVecA32(op.Qd), res); + } + } + else + { + EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U); + } + } + + public static void Vpadd_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorPairwiseOpF32(context, Intrinsic.Arm64FaddpV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Addps); + } + else + { + EmitVectorPairwiseOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPAddFpscr), op1, op2)); + } + } + + public static void Vpadd_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (Optimizations.UseSsse3) + { + EmitSsse3VectorPairwiseOp32(context, X86PaddInstruction); + } + else + { + EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U); + } + } + + public static void Vpaddl(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + EmitVectorPairwiseLongOpI32(context, (op1, op2) => context.Add(op1, op2), (op.Opc & 1) == 0); + } + + public static void Vpmax_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorPairwiseOpF32(context, Intrinsic.Arm64FmaxpV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Maxps); + } + else + { + EmitVectorPairwiseOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat64.FPMaxFpscr), op1, op2)); + } + } + + public static void Vpmax_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (Optimizations.UseSsse3) + { + 
EmitSsse3VectorPairwiseOp32(context, op.U ? X86PmaxuInstruction : X86PmaxsInstruction); + } + else + { + EmitVectorPairwiseOpI32(context, (op1, op2) => + { + Operand greater = op.U ? context.ICompareGreaterUI(op1, op2) : context.ICompareGreater(op1, op2); + return context.ConditionalSelect(greater, op1, op2); + }, !op.U); + } + } + + public static void Vpmin_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorPairwiseOpF32(context, Intrinsic.Arm64FminpV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Minps); + } + else + { + EmitVectorPairwiseOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinFpscr), op1, op2)); + } + } + + public static void Vpmin_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (Optimizations.UseSsse3) + { + EmitSsse3VectorPairwiseOp32(context, op.U ? X86PminuInstruction : X86PminsInstruction); + } + else + { + EmitVectorPairwiseOpI32(context, (op1, op2) => + { + Operand greater = op.U ? context.ICompareLessUI(op1, op2) : context.ICompareLess(op1, op2); + return context.ConditionalSelect(greater, op1, op2); + }, !op.U); + } + } + + public static void Vqadd(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitSaturatingAddSubBinaryOp(context, add: true, !op.U); + } + + public static void Vqdmulh(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + int eSize = 8 << op.Size; + + EmitVectorBinaryOpI32(context, (op1, op2) => + { + if (op.Size == 2) + { + op1 = context.SignExtend32(OperandType.I64, op1); + op2 = context.SignExtend32(OperandType.I64, op2); + } + + Operand res = context.Multiply(op1, op2); + res = context.ShiftRightSI(res, Const(eSize - 1)); + res = EmitSatQ(context, res, eSize, signedSrc: true, signedDst: true); + + if (op.Size == 2) + { + res = context.ConvertI64ToI32(res); + } + + return res; + }, signed: true); + } + + public static void Vqmovn(ArmEmitterContext context) + { + OpCode32SimdMovn op = (OpCode32SimdMovn)context.CurrOp; + + bool signed = !op.Q; + + EmitVectorUnaryNarrowOp32(context, (op1) => EmitSatQ(context, op1, 8 << op.Size, signed, signed), signed); + } + + public static void Vqmovun(ArmEmitterContext context) + { + OpCode32SimdMovn op = (OpCode32SimdMovn)context.CurrOp; + + EmitVectorUnaryNarrowOp32(context, (op1) => EmitSatQ(context, op1, 8 << op.Size, signedSrc: true, signedDst: false), signed: true); + } + + public static void Vqsub(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitSaturatingAddSubBinaryOp(context, add: false, !op.U); + } + + public static void Vrev(ArmEmitterContext context) + { + OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp; + + if (Optimizations.UseSsse3) + { + EmitVectorUnaryOpSimd32(context, (op1) => + { + Operand mask; + switch (op.Size) + { + case 3: + // Rev64 + switch (op.Opc) + { + case 0: + mask = X86GetElements(context, 0x08090a0b0c0d0e0fL, 0x0001020304050607L); + return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask); + case 1: + mask = X86GetElements(context, 0x09080b0a0d0c0f0eL, 0x0100030205040706L); + return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask); + case 2: + return context.AddIntrinsic(Intrinsic.X86Shufps, op1, op1, Const(1 | (0 << 2) | (3 << 4) | (2 << 6))); + } + break; + case 2: + // Rev32 + switch (op.Opc) + { + case 0: 
+ mask = X86GetElements(context, 0x0c0d0e0f_08090a0bL, 0x04050607_00010203L); + return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask); + case 1: + mask = X86GetElements(context, 0x0d0c0f0e_09080b0aL, 0x05040706_01000302L); + return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask); + } + break; + case 1: + // Rev16 + mask = X86GetElements(context, 0x0e0f_0c0d_0a0b_0809L, 0x_0607_0405_0203_0001L); + return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask); + } + + throw new InvalidOperationException("Invalid VREV Opcode + Size combo."); // Should be unreachable. + }); + } + else + { + EmitVectorUnaryOpZx32(context, (op1) => + { + switch (op.Opc) + { + case 0: + switch (op.Size) // Swap bytes. + { + case 1: + return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1); + case 2: + case 3: + return context.ByteSwap(op1); + } + break; + case 1: + switch (op.Size) + { + case 2: + return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)), + context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16))); + case 3: + return context.BitwiseOr( + context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)), + context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))), + context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)), + context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16)))); + } + break; + case 2: + // Swap upper and lower halves. + return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)), + context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32))); + } + + throw new InvalidOperationException("Invalid VREV Opcode + Size combo."); // Should be unreachable. 
+ }); + } + } + + public static void Vrecpe(ArmEmitterContext context) + { + OpCode32SimdSqrte op = (OpCode32SimdSqrte)context.CurrOp; + + if (op.F) + { + int sizeF = op.Size & 1; + + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FrecpeV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0) + { + EmitVectorUnaryOpF32(context, Intrinsic.X86Rcpps, 0); + } + else + { + EmitVectorUnaryOpF32(context, (op1) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPRecipEstimateFpscr), op1); + }); + } + } + else + { + throw new NotImplementedException("Integer Vrecpe not currently implemented."); + } + } + + public static void Vrecps(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FrecpsV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + bool single = (op.Size & 1) == 0; + + // (2 - (n*m)) + EmitVectorBinaryOpSimd32(context, (n, m) => + { + if (single) + { + Operand maskTwo = X86GetAllElements(context, 2f); + + Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + + return context.AddIntrinsic(Intrinsic.X86Subps, maskTwo, res); + } + else + { + Operand maskTwo = X86GetAllElements(context, 2d); + + Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + + return context.AddIntrinsic(Intrinsic.X86Subpd, maskTwo, res); + } + }); + } + else + { + EmitVectorBinaryOpF32(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStep), op1, op2); + }); + } + } + + public static void Vrhadd(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitVectorBinaryOpI32(context, (op1, op2) => + { + if (op.Size == 2) + { + op1 = context.ZeroExtend32(OperandType.I64, op1); + op2 = context.ZeroExtend32(OperandType.I64, op2); + } + + Operand res = context.Add(context.Add(op1, op2), Const(op1.Type, 1L)); + res = context.ShiftRightUI(res, Const(1)); + + if (op.Size == 2) + { + res = context.ConvertI64ToI32(res); + } + + return res; + }, !op.U); + } + + public static void Vrsqrte(ArmEmitterContext context) + { + OpCode32SimdSqrte op = (OpCode32SimdSqrte)context.CurrOp; + + if (op.F) + { + int sizeF = op.Size & 1; + + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FrsqrteV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0) + { + EmitVectorUnaryOpF32(context, Intrinsic.X86Rsqrtps, 0); + } + else + { + EmitVectorUnaryOpF32(context, (op1) => + { + return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPRSqrtEstimateFpscr), op1); + }); + } + } + else + { + throw new NotImplementedException("Integer Vrsqrte not currently implemented."); + } + } + + public static void Vrsqrts(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FrsqrtsV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + bool single = (op.Size & 1) == 0; + + // (3 - (n*m)) / 2 + EmitVectorBinaryOpSimd32(context, (n, m) => + { + if (single) + { + Operand maskHalf = X86GetAllElements(context, 0.5f); + Operand maskThree = X86GetAllElements(context, 3f); + + 
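// The three intrinsics below evaluate (3 - n*m) * 0.5 per lane, the VRSQRTS step used for Newton-Raphson refinement of a reciprocal square root estimate. +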
Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + + res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res); + return context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res); + } + else + { + Operand maskHalf = X86GetAllElements(context, 0.5d); + Operand maskThree = X86GetAllElements(context, 3d); + + Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + + res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res); + return context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res); + } + }); + } + else + { + EmitVectorBinaryOpF32(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtStep), op1, op2); + }); + } + } + + public static void Vsel(ArmEmitterContext context) + { + OpCode32SimdSel op = (OpCode32SimdSel)context.CurrOp; + + Operand condition = default; + + switch (op.Cc) + { + case OpCode32SimdSelMode.Eq: + condition = GetCondTrue(context, Condition.Eq); + break; + case OpCode32SimdSelMode.Ge: + condition = GetCondTrue(context, Condition.Ge); + break; + case OpCode32SimdSelMode.Gt: + condition = GetCondTrue(context, Condition.Gt); + break; + case OpCode32SimdSelMode.Vs: + condition = GetCondTrue(context, Condition.Vs); + break; + } + + EmitScalarBinaryOpI32(context, (op1, op2) => + { + return context.ConditionalSelect(condition, op1, op2); + }); + } + + public static void Vsqrt_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FsqrtS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarUnaryOpF32(context, Intrinsic.X86Sqrtss, Intrinsic.X86Sqrtsd); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt), op1); + }); + } + } + + public static void Vsub_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FsubS); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarBinaryOpF32(context, Intrinsic.X86Subss, Intrinsic.X86Subsd); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + public static void Vsub_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FsubV); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorBinaryOpF32(context, Intrinsic.X86Subps, Intrinsic.X86Subpd); + } + else + { + EmitVectorBinaryOpF32(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + public static void Vsub_I(ArmEmitterContext context) + { + if (Optimizations.UseSse2) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PsubInstruction[op.Size], op1, op2)); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2)); + } + } + + public static void Vsubl_I(ArmEmitterContext context) + { + OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp; + + EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Subtract(op1, op2), !op.U); + } + + public static void Vsubw_I(ArmEmitterContext context) + { + OpCode32SimdRegWide op = (OpCode32SimdRegWide)context.CurrOp; + + EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Subtract(op1, op2), !op.U); + } + + private static void 
EmitSaturatingAddSubBinaryOp(ArmEmitterContext context, bool add, bool signed) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + EmitVectorBinaryOpI32(context, (ne, me) => + { + if (op.Size <= 2) + { + if (op.Size == 2) + { + ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne); + me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me); + } + + Operand res = add ? context.Add(ne, me) : context.Subtract(ne, me); + + res = EmitSatQ(context, res, 8 << op.Size, signedSrc: true, signed); + + if (op.Size == 2) + { + res = context.ConvertI64ToI32(res); + } + + return res; + } + else if (add) /* if (op.Size == 3) */ + { + return signed + ? EmitBinarySignedSatQAdd(context, ne, me) + : EmitBinaryUnsignedSatQAdd(context, ne, me); + } + else /* if (sub) */ + { + return signed + ? EmitBinarySignedSatQSub(context, ne, me) + : EmitBinaryUnsignedSatQSub(context, ne, me); + } + }, signed); + } + + private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar) + { + IOpCode32Simd op = (IOpCode32Simd)context.CurrOp; + + Func<Operand, Operand, Operand> genericEmit = (n, m) => + { + Operand nNum = context.Copy(n); + Operand mNum = context.Copy(m); + + InstEmit.EmitSse2VectorIsNaNOpF(context, nNum, out Operand nQNaNMask, out _, isQNaN: true); + InstEmit.EmitSse2VectorIsNaNOpF(context, mNum, out Operand mQNaNMask, out _, isQNaN: true); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand negInfMask = X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity); + + Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnps, mQNaNMask, nQNaNMask); + Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnps, nQNaNMask, mQNaNMask); + + nNum = context.AddIntrinsic(Intrinsic.X86Blendvps, nNum, negInfMask, nMask); + mNum = context.AddIntrinsic(Intrinsic.X86Blendvps, mNum, negInfMask, mMask); + + return context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxps : Intrinsic.X86Minps, nNum, mNum); + } + else /* if (sizeF == 1) */ + { + Operand negInfMask = X86GetAllElements(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity); + + Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnpd, mQNaNMask, nQNaNMask); + Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnpd, nQNaNMask, mQNaNMask); + + nNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, nNum, negInfMask, nMask); + mNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, mNum, negInfMask, mMask); + + return context.AddIntrinsic(isMaxNum ? 
Intrinsic.X86Maxpd : Intrinsic.X86Minpd, nNum, mNum); + } + }; + + if (scalar) + { + EmitScalarBinaryOpSimd32(context, genericEmit); + } + else + { + EmitVectorBinaryOpSimd32(context, genericEmit); + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdCmp.cs b/src/ARMeilleure/Instructions/InstEmitSimdCmp.cs new file mode 100644 index 00000000..c32b64ba --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdCmp.cs @@ -0,0 +1,799 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + using Func2I = Func<Operand, Operand, Operand>; + + static partial class InstEmit + { + public static void Cmeq_S(ArmEmitterContext context) + { + EmitCmpOp(context, (op1, op2) => context.ICompareEqual(op1, op2), scalar: true); + } + + public static void Cmeq_V(ArmEmitterContext context) + { + if (Optimizations.UseSse41) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m; + + if (op is OpCodeSimdReg binOp) + { + m = GetVec(binOp.Rm); + } + else + { + m = context.VectorZero(); + } + + Intrinsic cmpInst = X86PcmpeqInstruction[op.Size]; + + Operand res = context.AddIntrinsic(cmpInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitCmpOp(context, (op1, op2) => context.ICompareEqual(op1, op2), scalar: false); + } + } + + public static void Cmge_S(ArmEmitterContext context) + { + EmitCmpOp(context, (op1, op2) => context.ICompareGreaterOrEqual(op1, op2), scalar: true); + } + + public static void Cmge_V(ArmEmitterContext context) + { + if (Optimizations.UseSse42) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m; + + if (op is OpCodeSimdReg binOp) + { + m = GetVec(binOp.Rm); + } + else + { + m = context.VectorZero(); + } + + Intrinsic cmpInst = X86PcmpgtInstruction[op.Size]; + + Operand res = context.AddIntrinsic(cmpInst, m, n); + + Operand mask = X86GetAllElements(context, -1L); + + res = context.AddIntrinsic(Intrinsic.X86Pandn, res, mask); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitCmpOp(context, (op1, op2) => context.ICompareGreaterOrEqual(op1, op2), scalar: false); + } + } + + public static void Cmgt_S(ArmEmitterContext context) + { + EmitCmpOp(context, (op1, op2) => context.ICompareGreater(op1, op2), scalar: true); + } + + public static void Cmgt_V(ArmEmitterContext context) + { + if (Optimizations.UseSse42) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m; + + if (op is OpCodeSimdReg binOp) + { + m = GetVec(binOp.Rm); + } + else + { + m = context.VectorZero(); + } + + Intrinsic cmpInst = X86PcmpgtInstruction[op.Size]; + + Operand res = context.AddIntrinsic(cmpInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitCmpOp(context, (op1, op2) => context.ICompareGreater(op1, op2), scalar: false); + } + } + + public static void Cmhi_S(ArmEmitterContext context) + { + EmitCmpOp(context, (op1, op2) => 
context.ICompareGreaterUI(op1, op2), scalar: true); + } + + public static void Cmhi_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseSse41 && op.Size < 3) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic maxInst = X86PmaxuInstruction[op.Size]; + + Operand res = context.AddIntrinsic(maxInst, m, n); + + Intrinsic cmpInst = X86PcmpeqInstruction[op.Size]; + + res = context.AddIntrinsic(cmpInst, res, m); + + Operand mask = X86GetAllElements(context, -1L); + + res = context.AddIntrinsic(Intrinsic.X86Pandn, res, mask); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitCmpOp(context, (op1, op2) => context.ICompareGreaterUI(op1, op2), scalar: false); + } + } + + public static void Cmhs_S(ArmEmitterContext context) + { + EmitCmpOp(context, (op1, op2) => context.ICompareGreaterOrEqualUI(op1, op2), scalar: true); + } + + public static void Cmhs_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseSse41 && op.Size < 3) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic maxInst = X86PmaxuInstruction[op.Size]; + + Operand res = context.AddIntrinsic(maxInst, n, m); + + Intrinsic cmpInst = X86PcmpeqInstruction[op.Size]; + + res = context.AddIntrinsic(cmpInst, res, n); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitCmpOp(context, (op1, op2) => context.ICompareGreaterOrEqualUI(op1, op2), scalar: false); + } + } + + public static void Cmle_S(ArmEmitterContext context) + { + EmitCmpOp(context, (op1, op2) => context.ICompareLessOrEqual(op1, op2), scalar: true); + } + + public static void Cmle_V(ArmEmitterContext context) + { + if (Optimizations.UseSse42) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Intrinsic cmpInst = X86PcmpgtInstruction[op.Size]; + + Operand res = context.AddIntrinsic(cmpInst, n, context.VectorZero()); + + Operand mask = X86GetAllElements(context, -1L); + + res = context.AddIntrinsic(Intrinsic.X86Pandn, res, mask); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitCmpOp(context, (op1, op2) => context.ICompareLessOrEqual(op1, op2), scalar: false); + } + } + + public static void Cmlt_S(ArmEmitterContext context) + { + EmitCmpOp(context, (op1, op2) => context.ICompareLess(op1, op2), scalar: true); + } + + public static void Cmlt_V(ArmEmitterContext context) + { + if (Optimizations.UseSse42) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Intrinsic cmpInst = X86PcmpgtInstruction[op.Size]; + + Operand res = context.AddIntrinsic(cmpInst, context.VectorZero(), n); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitCmpOp(context, (op1, op2) => context.ICompareLess(op1, op2), scalar: false); + } + } + + public static void Cmtst_S(ArmEmitterContext context) + { + EmitCmtstOp(context, scalar: true); + } + + public static void Cmtst_V(ArmEmitterContext context) + { + EmitCmtstOp(context, scalar: false); + } + + public static void Facge_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAvx) + { + 
EmitSse2OrAvxCmpOpF(context, CmpCondition.GreaterThanOrEqual, scalar: true, absolute: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGE), scalar: true, absolute: true); + } + } + + public static void Facge_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.GreaterThanOrEqual, scalar: false, absolute: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGE), scalar: false, absolute: true); + } + } + + public static void Facgt_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.GreaterThan, scalar: true, absolute: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGT), scalar: true, absolute: true); + } + } + + public static void Facgt_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.GreaterThan, scalar: false, absolute: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGT), scalar: false, absolute: true); + } + } + + public static void Fccmp_S(ArmEmitterContext context) + { + EmitFccmpOrFccmpe(context, signalNaNs: false); + } + + public static void Fccmpe_S(ArmEmitterContext context) + { + EmitFccmpOrFccmpe(context, signalNaNs: true); + } + + public static void Fcmeq_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.Equal, scalar: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareEQ), scalar: true); + } + } + + public static void Fcmeq_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.Equal, scalar: false); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareEQ), scalar: false); + } + } + + public static void Fcmge_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.GreaterThanOrEqual, scalar: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGE), scalar: true); + } + } + + public static void Fcmge_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.GreaterThanOrEqual, scalar: false); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGE), scalar: false); + } + } + + public static void Fcmgt_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.GreaterThan, scalar: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGT), scalar: true); + } + } + + public static void Fcmgt_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.GreaterThan, scalar: false); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareGT), scalar: false); + } + } + + public static void Fcmle_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.LessThanOrEqual, scalar: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareLE), scalar: true); + } + } + + public static void Fcmle_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF(context, 
CmpCondition.LessThanOrEqual, scalar: false); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareLE), scalar: false); + } + } + + public static void Fcmlt_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.LessThan, scalar: true); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareLT), scalar: true); + } + } + + public static void Fcmlt_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF(context, CmpCondition.LessThan, scalar: false); + } + else + { + EmitCmpOpF(context, nameof(SoftFloat32.FPCompareLT), scalar: false); + } + } + + public static void Fcmp_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitFcmpOrFcmpe(context, signalNaNs: false); + } + else + { + EmitFcmpOrFcmpe(context, signalNaNs: false); + } + } + + public static void Fcmpe_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitFcmpOrFcmpe(context, signalNaNs: true); + } + else + { + EmitFcmpOrFcmpe(context, signalNaNs: true); + } + } + + private static void EmitFccmpOrFccmpe(ArmEmitterContext context, bool signalNaNs) + { + OpCodeSimdFcond op = (OpCodeSimdFcond)context.CurrOp; + + Operand lblTrue = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblTrue, InstEmitFlowHelper.GetCondTrue(context, op.Cond)); + + EmitSetNzcv(context, op.Nzcv); + + context.Branch(lblEnd); + + context.MarkLabel(lblTrue); + + EmitFcmpOrFcmpe(context, signalNaNs); + + context.MarkLabel(lblEnd); + } + + private static void EmitSetNzcv(ArmEmitterContext context, int nzcv) + { + Operand Extract(int value, int bit) + { + if (bit != 0) + { + value >>= bit; + } + + value &= 1; + + return Const(value); + } + + SetFlag(context, PState.VFlag, Extract(nzcv, 0)); + SetFlag(context, PState.CFlag, Extract(nzcv, 1)); + SetFlag(context, PState.ZFlag, Extract(nzcv, 2)); + SetFlag(context, PState.NFlag, Extract(nzcv, 3)); + } + + private static void EmitFcmpOrFcmpe(ArmEmitterContext context, bool signalNaNs) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + bool cmpWithZero = !(op is OpCodeSimdFcond) ? op.Bit3 : false; + + if (Optimizations.FastFP && (signalNaNs ? Optimizations.UseAvx : Optimizations.UseSse2)) + { + Operand n = GetVec(op.Rn); + Operand m = cmpWithZero ? context.VectorZero() : GetVec(op.Rm); + + CmpCondition cmpOrdered = signalNaNs ? CmpCondition.OrderedS : CmpCondition.OrderedQ; + + Operand lblNaN = Label(); + Operand lblEnd = Label(); + + if (op.Size == 0) + { + Operand ordMask = context.AddIntrinsic(Intrinsic.X86Cmpss, n, m, Const((int)cmpOrdered)); + + Operand isOrdered = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, ordMask); + + context.BranchIfFalse(lblNaN, isOrdered); + + Operand nCopy = context.Copy(n); + Operand mCopy = cmpWithZero ? 
context.VectorZero() : context.Copy(m); + + Operand cf = context.AddIntrinsicInt(Intrinsic.X86Comissge, nCopy, mCopy); + Operand zf = context.AddIntrinsicInt(Intrinsic.X86Comisseq, nCopy, mCopy); + Operand nf = context.AddIntrinsicInt(Intrinsic.X86Comisslt, nCopy, mCopy); + + SetFlag(context, PState.VFlag, Const(0)); + SetFlag(context, PState.CFlag, cf); + SetFlag(context, PState.ZFlag, zf); + SetFlag(context, PState.NFlag, nf); + } + else /* if (op.Size == 1) */ + { + Operand ordMask = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, m, Const((int)cmpOrdered)); + + Operand isOrdered = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, ordMask); + + context.BranchIfFalse(lblNaN, isOrdered); + + Operand nCopy = context.Copy(n); + Operand mCopy = cmpWithZero ? context.VectorZero() : context.Copy(m); + + Operand cf = context.AddIntrinsicInt(Intrinsic.X86Comisdge, nCopy, mCopy); + Operand zf = context.AddIntrinsicInt(Intrinsic.X86Comisdeq, nCopy, mCopy); + Operand nf = context.AddIntrinsicInt(Intrinsic.X86Comisdlt, nCopy, mCopy); + + SetFlag(context, PState.VFlag, Const(0)); + SetFlag(context, PState.CFlag, cf); + SetFlag(context, PState.ZFlag, zf); + SetFlag(context, PState.NFlag, nf); + } + + context.Branch(lblEnd); + + context.MarkLabel(lblNaN); + + SetFlag(context, PState.VFlag, Const(1)); + SetFlag(context, PState.CFlag, Const(1)); + SetFlag(context, PState.ZFlag, Const(0)); + SetFlag(context, PState.NFlag, Const(0)); + + context.MarkLabel(lblEnd); + } + else + { + OperandType type = op.Size != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand ne = context.VectorExtract(type, GetVec(op.Rn), 0); + Operand me; + + if (cmpWithZero) + { + me = op.Size == 0 ? ConstF(0f) : ConstF(0d); + } + else + { + me = context.VectorExtract(type, GetVec(op.Rm), 0); + } + + Operand nzcv = EmitSoftFloatCall(context, nameof(SoftFloat32.FPCompare), ne, me, Const(signalNaNs)); + + EmitSetNzcv(context, nzcv); + } + } + + private static void EmitSetNzcv(ArmEmitterContext context, Operand nzcv) + { + Operand Extract(Operand value, int bit) + { + if (bit != 0) + { + value = context.ShiftRightUI(value, Const(bit)); + } + + value = context.BitwiseAnd(value, Const(1)); + + return value; + } + + SetFlag(context, PState.VFlag, Extract(nzcv, 0)); + SetFlag(context, PState.CFlag, Extract(nzcv, 1)); + SetFlag(context, PState.ZFlag, Extract(nzcv, 2)); + SetFlag(context, PState.NFlag, Extract(nzcv, 3)); + } + + private static void EmitCmpOp(ArmEmitterContext context, Func2I emitCmp, bool scalar) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = !scalar ? op.GetBytesCount() >> op.Size : 1; + + ulong szMask = ulong.MaxValue >> (64 - (8 << op.Size)); + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size); + Operand me; + + if (op is OpCodeSimdReg binOp) + { + me = EmitVectorExtractSx(context, binOp.Rm, index, op.Size); + } + else + { + me = Const(0L); + } + + Operand isTrue = emitCmp(ne, me); + + Operand mask = context.ConditionalSelect(isTrue, Const(szMask), Const(0L)); + + res = EmitVectorInsert(context, res, mask, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitCmtstOp(ArmEmitterContext context, bool scalar) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = !scalar ? 
op.GetBytesCount() >> op.Size : 1; + + ulong szMask = ulong.MaxValue >> (64 - (8 << op.Size)); + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size); + + Operand test = context.BitwiseAnd(ne, me); + + Operand isTrue = context.ICompareNotEqual(test, Const(0L)); + + Operand mask = context.ConditionalSelect(isTrue, Const(szMask), Const(0L)); + + res = EmitVectorInsert(context, res, mask, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitCmpOpF(ArmEmitterContext context, string name, bool scalar, bool absolute = false) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int elems = !scalar ? op.GetBytesCount() >> sizeF + 2 : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, GetVec(op.Rn), index); + Operand me; + + if (op is OpCodeSimdReg binOp) + { + me = context.VectorExtract(type, GetVec(binOp.Rm), index); + } + else + { + me = sizeF == 0 ? ConstF(0f) : ConstF(0d); + } + + if (absolute) + { + ne = EmitUnaryMathCall(context, nameof(Math.Abs), ne); + me = EmitUnaryMathCall(context, nameof(Math.Abs), me); + } + + Operand e = EmitSoftFloatCall(context, name, ne, me); + + res = context.VectorInsert(res, e, index); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitSse2OrAvxCmpOpF(ArmEmitterContext context, CmpCondition cond, bool scalar, bool absolute = false) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = op is OpCodeSimdReg binOp ? GetVec(binOp.Rm) : context.VectorZero(); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + if (absolute) + { + Operand mask = scalar ? X86GetScalar(context, int.MaxValue) : X86GetAllElements(context, int.MaxValue); + + n = context.AddIntrinsic(Intrinsic.X86Andps, n, mask); + m = context.AddIntrinsic(Intrinsic.X86Andps, m, mask); + } + + Intrinsic inst = scalar ? Intrinsic.X86Cmpss : Intrinsic.X86Cmpps; + + Operand res = context.AddIntrinsic(inst, n, m, Const((int)cond)); + + if (scalar) + { + res = context.VectorZeroUpper96(res); + } + else if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + if (absolute) + { + Operand mask = scalar ? X86GetScalar(context, long.MaxValue) : X86GetAllElements(context, long.MaxValue); + + n = context.AddIntrinsic(Intrinsic.X86Andpd, n, mask); + m = context.AddIntrinsic(Intrinsic.X86Andpd, m, mask); + } + + Intrinsic inst = scalar ? 
Intrinsic.X86Cmpsd : Intrinsic.X86Cmppd; + + Operand res = context.AddIntrinsic(inst, n, m, Const((int)cond)); + + if (scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdCmp32.cs b/src/ARMeilleure/Instructions/InstEmitSimdCmp32.cs new file mode 100644 index 00000000..a990e057 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdCmp32.cs @@ -0,0 +1,437 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper32; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + using Func2I = Func<Operand, Operand, Operand>; + + static partial class InstEmit32 + { + public static void Vceq_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitCmpOpF32(context, CmpCondition.Equal, false); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF32(context, CmpCondition.Equal, false); + } + else + { + EmitCmpOpF32(context, nameof(SoftFloat32.FPCompareEQFpscr), false); + } + } + + public static void Vceq_I(ArmEmitterContext context) + { + EmitCmpOpI32(context, context.ICompareEqual, context.ICompareEqual, false, false); + } + + public static void Vceq_Z(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitCmpOpF32(context, CmpCondition.Equal, true); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF32(context, CmpCondition.Equal, true); + } + else + { + EmitCmpOpF32(context, nameof(SoftFloat32.FPCompareEQFpscr), true); + } + } + else + { + EmitCmpOpI32(context, context.ICompareEqual, context.ICompareEqual, true, false); + } + } + + public static void Vcge_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitCmpOpF32(context, CmpCondition.GreaterThanOrEqual, false); + } + else if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF32(context, CmpCondition.GreaterThanOrEqual, false); + } + else + { + EmitCmpOpF32(context, nameof(SoftFloat32.FPCompareGEFpscr), false); + } + } + + public static void Vcge_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitCmpOpI32(context, context.ICompareGreaterOrEqual, context.ICompareGreaterOrEqualUI, false, !op.U); + } + + public static void Vcge_Z(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitCmpOpF32(context, CmpCondition.GreaterThanOrEqual, true); + } + else if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF32(context, CmpCondition.GreaterThanOrEqual, true); + } + else + { + EmitCmpOpF32(context, nameof(SoftFloat32.FPCompareGEFpscr), true); + } + } + else + { + EmitCmpOpI32(context, context.ICompareGreaterOrEqual, context.ICompareGreaterOrEqualUI, true, true); + } + } + + public static void Vcgt_V(ArmEmitterContext context) + { + if (Optimizations.FastFP && 
Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitCmpOpF32(context, CmpCondition.GreaterThan, false); + } + else if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF32(context, CmpCondition.GreaterThan, false); + } + else + { + EmitCmpOpF32(context, nameof(SoftFloat32.FPCompareGTFpscr), false); + } + } + + public static void Vcgt_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitCmpOpI32(context, context.ICompareGreater, context.ICompareGreaterUI, false, !op.U); + } + + public static void Vcgt_Z(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitCmpOpF32(context, CmpCondition.GreaterThan, true); + } + else if (Optimizations.FastFP && Optimizations.UseAvx) + { + EmitSse2OrAvxCmpOpF32(context, CmpCondition.GreaterThan, true); + } + else + { + EmitCmpOpF32(context, nameof(SoftFloat32.FPCompareGTFpscr), true); + } + } + else + { + EmitCmpOpI32(context, context.ICompareGreater, context.ICompareGreaterUI, true, true); + } + } + + public static void Vcle_Z(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitCmpOpF32(context, CmpCondition.LessThanOrEqual, true); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF32(context, CmpCondition.LessThanOrEqual, true); + } + else + { + EmitCmpOpF32(context, nameof(SoftFloat32.FPCompareLEFpscr), true); + } + } + else + { + EmitCmpOpI32(context, context.ICompareLessOrEqual, context.ICompareLessOrEqualUI, true, true); + } + } + + public static void Vclt_Z(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + if (op.F) + { + if (Optimizations.FastFP && Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitCmpOpF32(context, CmpCondition.LessThan, true); + } + else if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2OrAvxCmpOpF32(context, CmpCondition.LessThan, true); + } + else + { + EmitCmpOpF32(context, nameof(SoftFloat32.FPCompareLTFpscr), true); + } + } + else + { + EmitCmpOpI32(context, context.ICompareLess, context.ICompareLessUI, true, true); + } + } + + private static void EmitCmpOpF32(ArmEmitterContext context, string name, bool zero) + { + if (zero) + { + EmitVectorUnaryOpF32(context, (m) => + { + Operand zeroOp = m.Type == OperandType.FP64 ? ConstF(0.0d) : ConstF(0.0f); + + return EmitSoftFloatCallDefaultFpscr(context, name, m, zeroOp); + }); + } + else + { + EmitVectorBinaryOpF32(context, (n, m) => + { + return EmitSoftFloatCallDefaultFpscr(context, name, n, m); + }); + } + } + + private static Operand ZerosOrOnes(ArmEmitterContext context, Operand fromBool, OperandType baseType) + { + var ones = (baseType == OperandType.I64) ? Const(-1L) : Const(-1); + + return context.ConditionalSelect(fromBool, ones, Const(baseType, 0L)); + } + + private static void EmitCmpOpI32( + ArmEmitterContext context, + Func2I signedOp, + Func2I unsignedOp, + bool zero, + bool signed) + { + if (zero) + { + if (signed) + { + EmitVectorUnaryOpSx32(context, (m) => + { + OperandType type = m.Type; + Operand zeroV = (type == OperandType.I64) ? 
Const(0L) : Const(0); + + return ZerosOrOnes(context, signedOp(m, zeroV), type); + }); + } + else + { + EmitVectorUnaryOpZx32(context, (m) => + { + OperandType type = m.Type; + Operand zeroV = (type == OperandType.I64) ? Const(0L) : Const(0); + + return ZerosOrOnes(context, unsignedOp(m, zeroV), type); + }); + } + } + else + { + if (signed) + { + EmitVectorBinaryOpSx32(context, (n, m) => ZerosOrOnes(context, signedOp(n, m), n.Type)); + } + else + { + EmitVectorBinaryOpZx32(context, (n, m) => ZerosOrOnes(context, unsignedOp(n, m), n.Type)); + } + } + } + + public static void Vcmp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVcmpOrVcmpe(context, false); + } + else + { + EmitVcmpOrVcmpe(context, false); + } + } + + public static void Vcmpe(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVcmpOrVcmpe(context, true); + } + else + { + EmitVcmpOrVcmpe(context, true); + } + } + + private static void EmitVcmpOrVcmpe(ArmEmitterContext context, bool signalNaNs) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + bool cmpWithZero = (op.Opc & 2) != 0; + int sizeF = op.Size & 1; + + if (Optimizations.FastFP && (signalNaNs ? Optimizations.UseAvx : Optimizations.UseSse2)) + { + CmpCondition cmpOrdered = signalNaNs ? CmpCondition.OrderedS : CmpCondition.OrderedQ; + + bool doubleSize = sizeF != 0; + int shift = doubleSize ? 1 : 2; + Operand m = GetVecA32(op.Vm >> shift); + Operand n = GetVecA32(op.Vd >> shift); + + n = EmitSwapScalar(context, n, op.Vd, doubleSize); + m = cmpWithZero ? context.VectorZero() : EmitSwapScalar(context, m, op.Vm, doubleSize); + + Operand lblNaN = Label(); + Operand lblEnd = Label(); + + if (!doubleSize) + { + Operand ordMask = context.AddIntrinsic(Intrinsic.X86Cmpss, n, m, Const((int)cmpOrdered)); + + Operand isOrdered = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, ordMask); + + context.BranchIfFalse(lblNaN, isOrdered); + + Operand cf = context.AddIntrinsicInt(Intrinsic.X86Comissge, n, m); + Operand zf = context.AddIntrinsicInt(Intrinsic.X86Comisseq, n, m); + Operand nf = context.AddIntrinsicInt(Intrinsic.X86Comisslt, n, m); + + SetFpFlag(context, FPState.VFlag, Const(0)); + SetFpFlag(context, FPState.CFlag, cf); + SetFpFlag(context, FPState.ZFlag, zf); + SetFpFlag(context, FPState.NFlag, nf); + } + else + { + Operand ordMask = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, m, Const((int)cmpOrdered)); + + Operand isOrdered = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, ordMask); + + context.BranchIfFalse(lblNaN, isOrdered); + + Operand cf = context.AddIntrinsicInt(Intrinsic.X86Comisdge, n, m); + Operand zf = context.AddIntrinsicInt(Intrinsic.X86Comisdeq, n, m); + Operand nf = context.AddIntrinsicInt(Intrinsic.X86Comisdlt, n, m); + + SetFpFlag(context, FPState.VFlag, Const(0)); + SetFpFlag(context, FPState.CFlag, cf); + SetFpFlag(context, FPState.ZFlag, zf); + SetFpFlag(context, FPState.NFlag, nf); + } + + context.Branch(lblEnd); + + context.MarkLabel(lblNaN); + + SetFpFlag(context, FPState.VFlag, Const(1)); + SetFpFlag(context, FPState.CFlag, Const(1)); + SetFpFlag(context, FPState.ZFlag, Const(0)); + SetFpFlag(context, FPState.NFlag, Const(0)); + + context.MarkLabel(lblEnd); + } + else + { + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand ne = ExtractScalar(context, type, op.Vd); + Operand me; + + if (cmpWithZero) + { + me = sizeF == 0 ? 
ConstF(0f) : ConstF(0d); + } + else + { + me = ExtractScalar(context, type, op.Vm); + } + + Operand nzcv = EmitSoftFloatCall(context, nameof(SoftFloat32.FPCompare), ne, me, Const(signalNaNs)); + + EmitSetFpscrNzcv(context, nzcv); + } + } + + private static void EmitSetFpscrNzcv(ArmEmitterContext context, Operand nzcv) + { + Operand Extract(Operand value, int bit) + { + if (bit != 0) + { + value = context.ShiftRightUI(value, Const(bit)); + } + + value = context.BitwiseAnd(value, Const(1)); + + return value; + } + + SetFpFlag(context, FPState.VFlag, Extract(nzcv, 0)); + SetFpFlag(context, FPState.CFlag, Extract(nzcv, 1)); + SetFpFlag(context, FPState.ZFlag, Extract(nzcv, 2)); + SetFpFlag(context, FPState.NFlag, Extract(nzcv, 3)); + } + + private static void EmitSse2OrAvxCmpOpF32(ArmEmitterContext context, CmpCondition cond, bool zero) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + int sizeF = op.Size & 1; + Intrinsic inst = (sizeF == 0) ? Intrinsic.X86Cmpps : Intrinsic.X86Cmppd; + + if (zero) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + return context.AddIntrinsic(inst, m, context.VectorZero(), Const((int)cond)); + }); + } + else + { + EmitVectorBinaryOpSimd32(context, (n, m) => + { + return context.AddIntrinsic(inst, n, m, Const((int)cond)); + }); + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdCrypto.cs b/src/ARMeilleure/Instructions/InstEmitSimdCrypto.cs new file mode 100644 index 00000000..db24e029 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdCrypto.cs @@ -0,0 +1,99 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Aesd_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Operand res; + + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, context.AddIntrinsic(Intrinsic.X86Xorpd, d, n), context.VectorZero()); + } + else + { + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Decrypt)), d, n); + } + + context.Copy(d, res); + } + + public static void Aese_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Operand res; + + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesenclast, context.AddIntrinsic(Intrinsic.X86Xorpd, d, n), context.VectorZero()); + } + else + { + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Encrypt)), d, n); + } + + context.Copy(d, res); + } + + public static void Aesimc_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand res; + + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesimc, n); + } + else + { + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.InverseMixColumns)), n); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void Aesmc_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand res; + + if (Optimizations.UseAesni) + { + Operand roundKey = context.VectorZero(); + + // Inverse Shift Rows, Inverse Sub Bytes, xor 0 so nothing happens + res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, n, roundKey); + + // Shift 
Rows, Sub Bytes, Mix Columns (!), xor 0 so nothing happens + res = context.AddIntrinsic(Intrinsic.X86Aesenc, res, roundKey); + } + else + { + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.MixColumns)), n); + } + + context.Copy(GetVec(op.Rd), res); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdCrypto32.cs b/src/ARMeilleure/Instructions/InstEmitSimdCrypto32.cs new file mode 100644 index 00000000..f713a388 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdCrypto32.cs @@ -0,0 +1,99 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; + +namespace ARMeilleure.Instructions +{ + partial class InstEmit32 + { + public static void Aesd_V(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand d = GetVecA32(op.Qd); + Operand n = GetVecA32(op.Qm); + + Operand res; + + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, context.AddIntrinsic(Intrinsic.X86Xorpd, d, n), context.VectorZero()); + } + else + { + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Decrypt)), d, n); + } + + context.Copy(d, res); + } + + public static void Aese_V(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand d = GetVecA32(op.Qd); + Operand n = GetVecA32(op.Qm); + + Operand res; + + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesenclast, context.AddIntrinsic(Intrinsic.X86Xorpd, d, n), context.VectorZero()); + } + else + { + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Encrypt)), d, n); + } + + context.Copy(d, res); + } + + public static void Aesimc_V(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand n = GetVecA32(op.Qm); + + Operand res; + + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesimc, n); + } + else + { + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.InverseMixColumns)), n); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Aesmc_V(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand n = GetVecA32(op.Qm); + + Operand res; + + if (Optimizations.UseAesni) + { + Operand roundKey = context.VectorZero(); + + // Inverse Shift Rows, Inverse Sub Bytes, xor 0 so nothing happens. + res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, n, roundKey); + + // Shift Rows, Sub Bytes, Mix Columns (!), xor 0 so nothing happens. 
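+            // Net effect (per the AES-NI semantics): AESENC's Shift Rows and Sub Bytes undo the inverse steps applied by AESDECLAST above, so with a zero round key only Mix Columns remains applied to the input.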
+ res = context.AddIntrinsic(Intrinsic.X86Aesenc, res, roundKey); + } + else + { + res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.MixColumns)), n); + } + + context.Copy(GetVecA32(op.Qd), res); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdCvt.cs b/src/ARMeilleure/Instructions/InstEmitSimdCvt.cs new file mode 100644 index 00000000..652ad397 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdCvt.cs @@ -0,0 +1,1891 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + using Func1I = Func<Operand, Operand>; + + static partial class InstEmit + { + public static void Fcvt_S(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + if (op.Size == 0 && op.Opc == 1) // Single -> Double. + { + if (Optimizations.UseSse2) + { + Operand n = GetVec(op.Rn); + + Operand res = context.AddIntrinsic(Intrinsic.X86Cvtss2sd, context.VectorZero(), n); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0); + + Operand res = context.ConvertToFP(OperandType.FP64, ne); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } + } + else if (op.Size == 1 && op.Opc == 0) // Double -> Single. + { + if (Optimizations.UseSse2) + { + Operand n = GetVec(op.Rn); + + Operand res = context.AddIntrinsic(Intrinsic.X86Cvtsd2ss, context.VectorZero(), n); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = context.VectorExtract(OperandType.FP64, GetVec(op.Rn), 0); + + Operand res = context.ConvertToFP(OperandType.FP32, ne); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } + } + else if (op.Size == 0 && op.Opc == 3) // Single -> Half. + { + if (Optimizations.UseF16c) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + Operand n = GetVec(op.Rn); + + Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112() + res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14)); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0); + + context.StoreToContext(); + Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne); + context.LoadFromContext(); + + res = context.ZeroExtend16(OperandType.I64, res); + + context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1)); + } + } + else if (op.Size == 3 && op.Opc == 0) // Half -> Single. 
+ { + if (Optimizations.UseF16c) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn)); + res = context.VectorZeroUpper96(res); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1); + + context.StoreToContext(); + Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne); + context.LoadFromContext(); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } + } + else if (op.Size == 1 && op.Opc == 3) // Double -> Half. + { + if (Optimizations.UseF16c) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + Operand n = GetVec(op.Rn); + + Operand res = context.AddIntrinsic(Intrinsic.X86Cvtsd2ss, context.VectorZero(), n); + res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, res, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = context.VectorExtract(OperandType.FP64, GetVec(op.Rn), 0); + + context.StoreToContext(); + Operand res = context.Call(typeof(SoftFloat64_16).GetMethod(nameof(SoftFloat64_16.FPConvert)), ne); + context.LoadFromContext(); + + res = context.ZeroExtend16(OperandType.I64, res); + + context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1)); + } + } + else if (op.Size == 3 && op.Opc == 1) // Half -> Double. + { + if (Optimizations.UseF16c) + { + Operand n = GetVec(op.Rn); + + Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn)); + res = context.AddIntrinsic(Intrinsic.X86Cvtss2sd, context.VectorZero(), res); + res = context.VectorZeroUpper64(res); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1); + + context.StoreToContext(); + Operand res = context.Call(typeof(SoftFloat16_64).GetMethod(nameof(SoftFloat16_64.FPConvert)), ne); + context.LoadFromContext(); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } + } + else // Invalid encoding. 
+ { + Debug.Assert(false, $"type == {op.Size} && opc == {op.Opc}"); + } + } + + public static void Fcvtas_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtasGp); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvts_Gp(context, FPRoundingMode.ToNearestAway, isFixed: false); + } + else + { + EmitFcvt_s_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1)); + } + } + + public static void Fcvtas_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FcvtasS); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.ToNearestAway, scalar: true); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: true); + } + } + + public static void Fcvtas_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FcvtasS); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.ToNearestAway, scalar: false); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: true, scalar: false); + } + } + + public static void Fcvtau_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtauGp); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvtu_Gp(context, FPRoundingMode.ToNearestAway, isFixed: false); + } + else + { + EmitFcvt_u_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1)); + } + } + + public static void Fcvtau_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FcvtauS); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.ToNearestAway, scalar: true); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: true); + } + } + + public static void Fcvtau_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FcvtauV); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.ToNearestAway, scalar: false); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1), signed: false, scalar: false); + } + } + + public static void Fcvtl_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FcvtlV); + } + else if (Optimizations.UseSse2 && sizeF == 1) + { + Operand n = GetVec(op.Rn); + + Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n; + res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res); + + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseF16c && sizeF == 0) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + Operand n = GetVec(op.Rn); + + Operand res = op.RegisterSize == RegisterSize.Simd128 ? 
context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n; + res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand res = context.VectorZero(); + + int elems = 4 >> sizeF; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + for (int index = 0; index < elems; index++) + { + if (sizeF == 0) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, part + index, 1); + + context.StoreToContext(); + Operand e = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne); + context.LoadFromContext(); + + res = context.VectorInsert(res, e, index); + } + else /* if (sizeF == 1) */ + { + Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), part + index); + + Operand e = context.ConvertToFP(OperandType.FP64, ne); + + res = context.VectorInsert(res, e, index); + } + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Fcvtms_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtmsGp); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvts_Gp(context, FPRoundingMode.TowardsMinusInfinity, isFixed: false); + } + else + { + EmitFcvt_s_Gp(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Floor), op1)); + } + } + + public static void Fcvtms_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FcvtmsV); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.TowardsMinusInfinity, scalar: false); + } + else + { + EmitFcvt(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Floor), op1), signed: true, scalar: false); + } + } + + public static void Fcvtmu_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtmuGp); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvtu_Gp(context, FPRoundingMode.TowardsMinusInfinity, isFixed: false); + } + else + { + EmitFcvt_u_Gp(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Floor), op1)); + } + } + + public static void Fcvtn_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int sizeF = op.Size & 1; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOpFRd(context, Intrinsic.Arm64FcvtnV); + } + else if (Optimizations.UseSse2 && sizeF == 1) + { + Operand d = GetVec(op.Rd); + + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps; + + Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn)); + nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt); + + Operand res = context.VectorZeroUpper64(d); + res = context.AddIntrinsic(movInst, res, nInt); + + context.Copy(d, res); + } + else if (Optimizations.UseF16c && sizeF == 0) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? 
Intrinsic.X86Movlhps : Intrinsic.X86Movhlps; + + Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt); + + Operand res = context.VectorZeroUpper64(d); + res = context.AddIntrinsic(movInst, res, nInt); + + context.Copy(d, res); + } + else + { + OperandType type = sizeF == 0 ? OperandType.FP32 : OperandType.FP64; + + int elems = 4 >> sizeF; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + Operand d = GetVec(op.Rd); + + Operand res = part == 0 ? context.VectorZero() : context.Copy(d); + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, GetVec(op.Rn), index); + + if (sizeF == 0) + { + context.StoreToContext(); + Operand e = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne); + context.LoadFromContext(); + + res = EmitVectorInsert(context, res, e, part + index, 1); + } + else /* if (sizeF == 1) */ + { + Operand e = context.ConvertToFP(OperandType.FP32, ne); + + res = context.VectorInsert(res, e, part + index); + } + } + + context.Copy(d, res); + } + } + + public static void Fcvtns_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtnsGp); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvts_Gp(context, FPRoundingMode.ToNearest, isFixed: false); + } + else + { + EmitFcvt_s_Gp(context, (op1) => EmitRoundMathCall(context, MidpointRounding.ToEven, op1)); + } + } + + public static void Fcvtns_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FcvtnsS); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.ToNearest, scalar: true); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.ToEven, op1), signed: true, scalar: true); + } + } + + public static void Fcvtns_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FcvtnsV); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.ToNearest, scalar: false); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.ToEven, op1), signed: true, scalar: false); + } + } + + public static void Fcvtnu_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FcvtnuS); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.ToNearest, scalar: true); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.ToEven, op1), signed: false, scalar: true); + } + } + + public static void Fcvtnu_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FcvtnuV); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.ToNearest, scalar: false); + } + else + { + EmitFcvt(context, (op1) => EmitRoundMathCall(context, MidpointRounding.ToEven, op1), signed: false, scalar: false); + } + } + + public static void Fcvtps_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtpsGp); + } + else if (Optimizations.UseSse41) + 
{ + EmitSse41Fcvts_Gp(context, FPRoundingMode.TowardsPlusInfinity, isFixed: false); + } + else + { + EmitFcvt_s_Gp(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Ceiling), op1)); + } + } + + public static void Fcvtpu_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtpuGp); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvtu_Gp(context, FPRoundingMode.TowardsPlusInfinity, isFixed: false); + } + else + { + EmitFcvt_u_Gp(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Ceiling), op1)); + } + } + + public static void Fcvtzs_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtzsGp); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvts_Gp(context, FPRoundingMode.TowardsZero, isFixed: false); + } + else + { + EmitFcvt_s_Gp(context, (op1) => op1); + } + } + + public static void Fcvtzs_Gp_Fixed(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + InstEmitSimdHelperArm64.EmitScalarConvertBinaryOpFToGp(context, Intrinsic.Arm64FcvtzsGpFixed, op.FBits); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvts_Gp(context, FPRoundingMode.TowardsZero, isFixed: true); + } + else + { + EmitFcvtzs_Gp_Fixed(context); + } + } + + public static void Fcvtzs_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FcvtzsS); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.TowardsZero, scalar: true); + } + else + { + EmitFcvtz(context, signed: true, scalar: true); + } + } + + public static void Fcvtzs_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FcvtzsV); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.TowardsZero, scalar: false); + } + else + { + EmitFcvtz(context, signed: true, scalar: false); + } + } + + public static void Fcvtzs_V_Fixed(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorConvertBinaryOpF(context, Intrinsic.Arm64FcvtzsVFixed, GetFBits(context)); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtsOpF(context, FPRoundingMode.TowardsZero, scalar: false); + } + else + { + EmitFcvtz(context, signed: true, scalar: false); + } + } + + public static void Fcvtzu_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFToGp(context, Intrinsic.Arm64FcvtzuGp); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvtu_Gp(context, FPRoundingMode.TowardsZero, isFixed: false); + } + else + { + EmitFcvt_u_Gp(context, (op1) => op1); + } + } + + public static void Fcvtzu_Gp_Fixed(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + InstEmitSimdHelperArm64.EmitScalarConvertBinaryOpFToGp(context, Intrinsic.Arm64FcvtzuGpFixed, op.FBits); + } + else if (Optimizations.UseSse41) + { + EmitSse41Fcvtu_Gp(context, FPRoundingMode.TowardsZero, isFixed: true); + } + else + { + EmitFcvtzu_Gp_Fixed(context); + } + } + + public static void Fcvtzu_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FcvtzuS); + } + else if 
(Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.TowardsZero, scalar: true); + } + else + { + EmitFcvtz(context, signed: false, scalar: true); + } + } + + public static void Fcvtzu_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FcvtzuV); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.TowardsZero, scalar: false); + } + else + { + EmitFcvtz(context, signed: false, scalar: false); + } + } + + public static void Fcvtzu_V_Fixed(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorConvertBinaryOpF(context, Intrinsic.Arm64FcvtzuVFixed, GetFBits(context)); + } + else if (Optimizations.UseSse41) + { + EmitSse41FcvtuOpF(context, FPRoundingMode.TowardsZero, scalar: false); + } + else + { + EmitFcvtz(context, signed: false, scalar: false); + } + } + + public static void Scvtf_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFFromGp(context, Intrinsic.Arm64ScvtfGp); + } + else + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + Operand res = GetIntOrZR(context, op.Rn); + + if (op.RegisterSize == RegisterSize.Int32) + { + res = context.SignExtend32(OperandType.I64, res); + } + + res = EmitFPConvert(context, res, op.Size, signed: true); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } + } + + public static void Scvtf_Gp_Fixed(ArmEmitterContext context) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarConvertBinaryOpFFromGp(context, Intrinsic.Arm64ScvtfGpFixed, op.FBits); + } + else + { + Operand res = GetIntOrZR(context, op.Rn); + + if (op.RegisterSize == RegisterSize.Int32) + { + res = context.SignExtend32(OperandType.I64, res); + } + + res = EmitFPConvert(context, res, op.Size, signed: true); + + res = EmitI2fFBitsMul(context, res, op.FBits); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } + } + + public static void Scvtf_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64ScvtfS); + } + else if (Optimizations.UseSse2) + { + EmitSse2ScvtfOp(context, scalar: true); + } + else + { + EmitCvtf(context, signed: true, scalar: true); + } + } + + public static void Scvtf_S_Fixed(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarConvertBinaryOpF(context, Intrinsic.Arm64ScvtfSFixed, GetFBits(context)); + } + else if (Optimizations.UseSse2) + { + EmitSse2ScvtfOp(context, scalar: true); + } + else + { + EmitCvtf(context, signed: true, scalar: true); + } + } + + public static void Scvtf_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64ScvtfV); + } + else if (Optimizations.UseSse2) + { + EmitSse2ScvtfOp(context, scalar: false); + } + else + { + EmitCvtf(context, signed: true, scalar: false); + } + } + + public static void Scvtf_V_Fixed(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorConvertBinaryOpF(context, Intrinsic.Arm64ScvtfVFixed, GetFBits(context)); + } + else if (Optimizations.UseSse2) + { + EmitSse2ScvtfOp(context, scalar: false); + } + else + { + EmitCvtf(context, signed: true, scalar: false); + } + 
} + + public static void Ucvtf_Gp(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpFFromGp(context, Intrinsic.Arm64UcvtfGp); + } + else + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + Operand res = GetIntOrZR(context, op.Rn); + + res = EmitFPConvert(context, res, op.Size, signed: false); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } + } + + public static void Ucvtf_Gp_Fixed(ArmEmitterContext context) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarConvertBinaryOpFFromGp(context, Intrinsic.Arm64UcvtfGpFixed, op.FBits); + } + else + { + Operand res = GetIntOrZR(context, op.Rn); + + res = EmitFPConvert(context, res, op.Size, signed: false); + + res = EmitI2fFBitsMul(context, res, op.FBits); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + } + } + + public static void Ucvtf_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64UcvtfS); + } + else if (Optimizations.UseSse2) + { + EmitSse2UcvtfOp(context, scalar: true); + } + else + { + EmitCvtf(context, signed: false, scalar: true); + } + } + + public static void Ucvtf_S_Fixed(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarConvertBinaryOpF(context, Intrinsic.Arm64UcvtfSFixed, GetFBits(context)); + } + else if (Optimizations.UseSse2) + { + EmitSse2UcvtfOp(context, scalar: true); + } + else + { + EmitCvtf(context, signed: false, scalar: true); + } + } + + public static void Ucvtf_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64UcvtfV); + } + else if (Optimizations.UseSse2) + { + EmitSse2UcvtfOp(context, scalar: false); + } + else + { + EmitCvtf(context, signed: false, scalar: false); + } + } + + public static void Ucvtf_V_Fixed(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorConvertBinaryOpF(context, Intrinsic.Arm64UcvtfVFixed, GetFBits(context)); + } + else if (Optimizations.UseSse2) + { + EmitSse2UcvtfOp(context, scalar: false); + } + else + { + EmitCvtf(context, signed: false, scalar: false); + } + } + + private static void EmitFcvt(ArmEmitterContext context, Func1I emit, bool signed, bool scalar) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + Operand n = GetVec(op.Rn); + + int sizeF = op.Size & 1; + int sizeI = sizeF + 2; + + OperandType type = sizeF == 0 ? OperandType.FP32 : OperandType.FP64; + + int elems = !scalar ? op.GetBytesCount() >> sizeI : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, n, index); + + Operand e = emit(ne); + + if (sizeF == 0) + { + MethodInfo info = signed + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToS32)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToU32)); + + e = context.Call(info, e); + + e = context.ZeroExtend32(OperandType.I64, e); + } + else /* if (sizeF == 1) */ + { + MethodInfo info = signed + ? 
typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToS64)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToU64)); + + e = context.Call(info, e); + } + + res = EmitVectorInsert(context, res, e, index, sizeI); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitFcvtz(ArmEmitterContext context, bool signed, bool scalar) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + Operand n = GetVec(op.Rn); + + int sizeF = op.Size & 1; + int sizeI = sizeF + 2; + + OperandType type = sizeF == 0 ? OperandType.FP32 : OperandType.FP64; + + int fBits = GetFBits(context); + + int elems = !scalar ? op.GetBytesCount() >> sizeI : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, n, index); + + Operand e = EmitF2iFBitsMul(context, ne, fBits); + + if (sizeF == 0) + { + MethodInfo info = signed + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToS32)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToU32)); + + e = context.Call(info, e); + + e = context.ZeroExtend32(OperandType.I64, e); + } + else /* if (sizeF == 1) */ + { + MethodInfo info = signed + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToS64)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToU64)); + + e = context.Call(info, e); + } + + res = EmitVectorInsert(context, res, e, index, sizeI); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitFcvt_s_Gp(ArmEmitterContext context, Func1I emit) + { + EmitFcvt___Gp(context, emit, signed: true); + } + + private static void EmitFcvt_u_Gp(ArmEmitterContext context, Func1I emit) + { + EmitFcvt___Gp(context, emit, signed: false); + } + + private static void EmitFcvt___Gp(ArmEmitterContext context, Func1I emit, bool signed) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + OperandType type = op.Size == 0 ? OperandType.FP32 : OperandType.FP64; + + Operand ne = context.VectorExtract(type, GetVec(op.Rn), 0); + + Operand res = signed + ? EmitScalarFcvts(context, emit(ne), 0) + : EmitScalarFcvtu(context, emit(ne), 0); + + SetIntOrZR(context, op.Rd, res); + } + + private static void EmitFcvtzs_Gp_Fixed(ArmEmitterContext context) + { + EmitFcvtz__Gp_Fixed(context, signed: true); + } + + private static void EmitFcvtzu_Gp_Fixed(ArmEmitterContext context) + { + EmitFcvtz__Gp_Fixed(context, signed: false); + } + + private static void EmitFcvtz__Gp_Fixed(ArmEmitterContext context, bool signed) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + OperandType type = op.Size == 0 ? OperandType.FP32 : OperandType.FP64; + + Operand ne = context.VectorExtract(type, GetVec(op.Rn), 0); + + Operand res = signed + ? EmitScalarFcvts(context, ne, op.FBits) + : EmitScalarFcvtu(context, ne, op.FBits); + + SetIntOrZR(context, op.Rd, res); + } + + private static void EmitCvtf(ArmEmitterContext context, bool signed, bool scalar) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int sizeF = op.Size & 1; + int sizeI = sizeF + 2; + + int fBits = GetFBits(context); + + int elems = !scalar ? 
op.GetBytesCount() >> sizeI : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorLongExtract(context, op.Rn, index, sizeI); + + Operand e = EmitFPConvert(context, ne, sizeF, signed); + + e = EmitI2fFBitsMul(context, e, fBits); + + res = context.VectorInsert(res, e, index); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static int GetFBits(ArmEmitterContext context) + { + if (context.CurrOp is OpCodeSimdShImm op) + { + return GetImmShr(op); + } + + return 0; + } + + private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, int size, bool signed) + { + Debug.Assert(value.Type == OperandType.I32 || value.Type == OperandType.I64); + Debug.Assert((uint)size < 2); + + OperandType type = size == 0 ? OperandType.FP32 : OperandType.FP64; + + if (signed) + { + return context.ConvertToFP(type, value); + } + else + { + return context.ConvertToFPUI(type, value); + } + } + + private static Operand EmitScalarFcvts(ArmEmitterContext context, Operand value, int fBits) + { + Debug.Assert(value.Type == OperandType.FP32 || value.Type == OperandType.FP64); + + value = EmitF2iFBitsMul(context, value, fBits); + + MethodInfo info; + + if (context.CurrOp.RegisterSize == RegisterSize.Int32) + { + info = value.Type == OperandType.FP32 + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToS32)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToS32)); + } + else + { + info = value.Type == OperandType.FP32 + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToS64)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToS64)); + } + + return context.Call(info, value); + } + + private static Operand EmitScalarFcvtu(ArmEmitterContext context, Operand value, int fBits) + { + Debug.Assert(value.Type == OperandType.FP32 || value.Type == OperandType.FP64); + + value = EmitF2iFBitsMul(context, value, fBits); + + MethodInfo info; + + if (context.CurrOp.RegisterSize == RegisterSize.Int32) + { + info = value.Type == OperandType.FP32 + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToU32)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToU32)); + } + else + { + info = value.Type == OperandType.FP32 + ? 
typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToU64)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToU64)); + } + + return context.Call(info, value); + } + + private static Operand EmitF2iFBitsMul(ArmEmitterContext context, Operand value, int fBits) + { + Debug.Assert(value.Type == OperandType.FP32 || value.Type == OperandType.FP64); + + if (fBits == 0) + { + return value; + } + + if (value.Type == OperandType.FP32) + { + return context.Multiply(value, ConstF(MathF.Pow(2f, fBits))); + } + else /* if (value.Type == OperandType.FP64) */ + { + return context.Multiply(value, ConstF(Math.Pow(2d, fBits))); + } + } + + private static Operand EmitI2fFBitsMul(ArmEmitterContext context, Operand value, int fBits) + { + Debug.Assert(value.Type == OperandType.FP32 || value.Type == OperandType.FP64); + + if (fBits == 0) + { + return value; + } + + if (value.Type == OperandType.FP32) + { + return context.Multiply(value, ConstF(1f / MathF.Pow(2f, fBits))); + } + else /* if (value.Type == OperandType.FP64) */ + { + return context.Multiply(value, ConstF(1d / Math.Pow(2d, fBits))); + } + } + + public static Operand EmitSse2CvtDoubleToInt64OpF(ArmEmitterContext context, Operand opF, bool scalar) + { + Debug.Assert(opF.Type == OperandType.V128); + + Operand longL = context.AddIntrinsicLong (Intrinsic.X86Cvtsd2si, opF); // opFL + Operand res = context.VectorCreateScalar(longL); + + if (!scalar) + { + Operand opFH = context.AddIntrinsic (Intrinsic.X86Movhlps, res, opF); // res doesn't matter. + Operand longH = context.AddIntrinsicLong (Intrinsic.X86Cvtsd2si, opFH); + Operand resH = context.VectorCreateScalar(longH); + res = context.AddIntrinsic (Intrinsic.X86Movlhps, res, resH); + } + + return res; + } + + private static Operand EmitSse2CvtInt64ToDoubleOp(ArmEmitterContext context, Operand op, bool scalar) + { + Debug.Assert(op.Type == OperandType.V128); + + Operand longL = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, op); // opL + Operand res = context.AddIntrinsic (Intrinsic.X86Cvtsi2sd, context.VectorZero(), longL); + + if (!scalar) + { + Operand opH = context.AddIntrinsic (Intrinsic.X86Movhlps, res, op); // res doesn't matter. + Operand longH = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, opH); + Operand resH = context.AddIntrinsic (Intrinsic.X86Cvtsi2sd, res, longH); // res doesn't matter. + res = context.AddIntrinsic (Intrinsic.X86Movlhps, res, resH); + } + + return res; + } + + private static void EmitSse2ScvtfOp(ArmEmitterContext context, bool scalar) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + // sizeF == ((OpCodeSimdShImm)op).Size - 2 + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand res = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, n); + + if (op is OpCodeSimdShImm fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int32BitsToSingle(fpScaled) == 1f / MathF.Pow(2f, fBits) + int fpScaled = 0x3F800000 - fBits * 0x800000; + + Operand fpScaledMask = scalar + ? 
X86GetScalar (context, fpScaled) + : X86GetAllElements(context, fpScaled); + + res = context.AddIntrinsic(Intrinsic.X86Mulps, res, fpScaledMask); + } + + if (scalar) + { + res = context.VectorZeroUpper96(res); + } + else if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + Operand res = EmitSse2CvtInt64ToDoubleOp(context, n, scalar); + + if (op is OpCodeSimdShImm fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int64BitsToDouble(fpScaled) == 1d / Math.Pow(2d, fBits) + long fpScaled = 0x3FF0000000000000L - fBits * 0x10000000000000L; + + Operand fpScaledMask = scalar + ? X86GetScalar (context, fpScaled) + : X86GetAllElements(context, fpScaled); + + res = context.AddIntrinsic(Intrinsic.X86Mulpd, res, fpScaledMask); + } + + if (scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + private static void EmitSse2UcvtfOp(ArmEmitterContext context, bool scalar) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + // sizeF == ((OpCodeSimdShImm)op).Size - 2 + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand mask = scalar // 65536.000f (1 << 16) + ? X86GetScalar (context, 0x47800000) + : X86GetAllElements(context, 0x47800000); + + Operand res = context.AddIntrinsic(Intrinsic.X86Psrld, n, Const(16)); + res = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, res); + res = context.AddIntrinsic(Intrinsic.X86Mulps, res, mask); + + Operand res2 = context.AddIntrinsic(Intrinsic.X86Pslld, n, Const(16)); + res2 = context.AddIntrinsic(Intrinsic.X86Psrld, res2, Const(16)); + res2 = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, res2); + + res = context.AddIntrinsic(Intrinsic.X86Addps, res, res2); + + if (op is OpCodeSimdShImm fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int32BitsToSingle(fpScaled) == 1f / MathF.Pow(2f, fBits) + int fpScaled = 0x3F800000 - fBits * 0x800000; + + Operand fpScaledMask = scalar + ? X86GetScalar (context, fpScaled) + : X86GetAllElements(context, fpScaled); + + res = context.AddIntrinsic(Intrinsic.X86Mulps, res, fpScaledMask); + } + + if (scalar) + { + res = context.VectorZeroUpper96(res); + } + else if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else /* if (sizeF == 1) */ + { + Operand mask = scalar // 4294967296.0000000d (1L << 32) + ? X86GetScalar (context, 0x41F0000000000000L) + : X86GetAllElements(context, 0x41F0000000000000L); + + Operand res = context.AddIntrinsic (Intrinsic.X86Psrlq, n, Const(32)); + res = EmitSse2CvtInt64ToDoubleOp(context, res, scalar); + res = context.AddIntrinsic (Intrinsic.X86Mulpd, res, mask); + + Operand res2 = context.AddIntrinsic (Intrinsic.X86Psllq, n, Const(32)); + res2 = context.AddIntrinsic (Intrinsic.X86Psrlq, res2, Const(32)); + res2 = EmitSse2CvtInt64ToDoubleOp(context, res2, scalar); + + res = context.AddIntrinsic(Intrinsic.X86Addpd, res, res2); + + if (op is OpCodeSimdShImm fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int64BitsToDouble(fpScaled) == 1d / Math.Pow(2d, fBits) + long fpScaled = 0x3FF0000000000000L - fBits * 0x10000000000000L; + + Operand fpScaledMask = scalar + ? 
X86GetScalar (context, fpScaled) + : X86GetAllElements(context, fpScaled); + + res = context.AddIntrinsic(Intrinsic.X86Mulpd, res, fpScaledMask); + } + + if (scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + private static void EmitSse41FcvtsOpF(ArmEmitterContext context, FPRoundingMode roundMode, bool scalar) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + // sizeF == ((OpCodeSimdShImm)op).Size - 2 + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (op is OpCodeSimdShImm fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int32BitsToSingle(fpScaled) == MathF.Pow(2f, fBits) + int fpScaled = 0x3F800000 + fBits * 0x800000; + + Operand fpScaledMask = scalar + ? X86GetScalar (context, fpScaled) + : X86GetAllElements(context, fpScaled); + + nRes = context.AddIntrinsic(Intrinsic.X86Mulps, nRes, fpScaledMask); + } + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar); + } + + Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes); + + Operand fpMaxValMask = scalar // 2.14748365E9f (2147483648) + ? X86GetScalar (context, 0x4F000000) + : X86GetAllElements(context, 0x4F000000); + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nInt, nRes); + + if (scalar) + { + dRes = context.VectorZeroUpper96(dRes); + } + else if (op.RegisterSize == RegisterSize.Simd64) + { + dRes = context.VectorZeroUpper64(dRes); + } + + context.Copy(GetVec(op.Rd), dRes); + } + else /* if (sizeF == 1) */ + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (op is OpCodeSimdShImm fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int64BitsToDouble(fpScaled) == Math.Pow(2d, fBits) + long fpScaled = 0x3FF0000000000000L + fBits * 0x10000000000000L; + + Operand fpScaledMask = scalar + ? X86GetScalar (context, fpScaled) + : X86GetAllElements(context, fpScaled); + + nRes = context.AddIntrinsic(Intrinsic.X86Mulpd, nRes, fpScaledMask); + } + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar); + } + + Operand nLong = EmitSse2CvtDoubleToInt64OpF(context, nRes, scalar); + + Operand fpMaxValMask = scalar // 9.2233720368547760E18d (9223372036854775808) + ? 
X86GetScalar (context, 0x43E0000000000000L) + : X86GetAllElements(context, 0x43E0000000000000L); + + nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nLong, nRes); + + if (scalar) + { + dRes = context.VectorZeroUpper64(dRes); + } + + context.Copy(GetVec(op.Rd), dRes); + } + } + + private static void EmitSse41FcvtuOpF(ArmEmitterContext context, FPRoundingMode roundMode, bool scalar) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + // sizeF == ((OpCodeSimdShImm)op).Size - 2 + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (op is OpCodeSimdShImm fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int32BitsToSingle(fpScaled) == MathF.Pow(2f, fBits) + int fpScaled = 0x3F800000 + fBits * 0x800000; + + Operand fpScaledMask = scalar + ? X86GetScalar (context, fpScaled) + : X86GetAllElements(context, fpScaled); + + nRes = context.AddIntrinsic(Intrinsic.X86Mulps, nRes, fpScaledMask); + } + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar); + } + + Operand zero = context.VectorZero(); + + Operand nCmp = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + Operand fpMaxValMask = scalar // 2.14748365E9f (2147483648) + ? X86GetScalar (context, 0x4F000000) + : X86GetAllElements(context, 0x4F000000); + + Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes); + + nRes = context.AddIntrinsic(Intrinsic.X86Subps, nRes, fpMaxValMask); + + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + Operand nInt2 = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes); + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nInt2, nRes); + dRes = context.AddIntrinsic(Intrinsic.X86Paddd, dRes, nInt); + + if (scalar) + { + dRes = context.VectorZeroUpper96(dRes); + } + else if (op.RegisterSize == RegisterSize.Simd64) + { + dRes = context.VectorZeroUpper64(dRes); + } + + context.Copy(GetVec(op.Rd), dRes); + } + else /* if (sizeF == 1) */ + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (op is OpCodeSimdShImm fixedOp) + { + int fBits = GetImmShr(fixedOp); + + // BitConverter.Int64BitsToDouble(fpScaled) == Math.Pow(2d, fBits) + long fpScaled = 0x3FF0000000000000L + fBits * 0x10000000000000L; + + Operand fpScaledMask = scalar + ? 
X86GetScalar (context, fpScaled) + : X86GetAllElements(context, fpScaled); + + nRes = context.AddIntrinsic(Intrinsic.X86Mulpd, nRes, fpScaledMask); + } + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar); + } + + Operand zero = context.VectorZero(); + + Operand nCmp = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + Operand fpMaxValMask = scalar // 9.2233720368547760E18d (9223372036854775808) + ? X86GetScalar (context, 0x43E0000000000000L) + : X86GetAllElements(context, 0x43E0000000000000L); + + Operand nLong = EmitSse2CvtDoubleToInt64OpF(context, nRes, scalar); + + nRes = context.AddIntrinsic(Intrinsic.X86Subpd, nRes, fpMaxValMask); + + nCmp = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + Operand nLong2 = EmitSse2CvtDoubleToInt64OpF(context, nRes, scalar); + + nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nLong2, nRes); + dRes = context.AddIntrinsic(Intrinsic.X86Paddq, dRes, nLong); + + if (scalar) + { + dRes = context.VectorZeroUpper64(dRes); + } + + context.Copy(GetVec(op.Rd), dRes); + } + } + + private static void EmitSse41Fcvts_Gp(ArmEmitterContext context, FPRoundingMode roundMode, bool isFixed) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + Operand n = GetVec(op.Rn); + + if (op.Size == 0) + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (isFixed) + { + // BitConverter.Int32BitsToSingle(fpScaled) == MathF.Pow(2f, op.FBits) + int fpScaled = 0x3F800000 + op.FBits * 0x800000; + + Operand fpScaledMask = X86GetScalar(context, fpScaled); + + nRes = context.AddIntrinsic(Intrinsic.X86Mulss, nRes, fpScaledMask); + } + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } + + Operand nIntOrLong = op.RegisterSize == RegisterSize.Int32 + ? context.AddIntrinsicInt (Intrinsic.X86Cvtss2si, nRes) + : context.AddIntrinsicLong(Intrinsic.X86Cvtss2si, nRes); + + int fpMaxVal = op.RegisterSize == RegisterSize.Int32 + ? 
0x4F000000 // 2.14748365E9f (2147483648) + : 0x5F000000; // 9.223372E18f (9223372036854775808) + + Operand fpMaxValMask = X86GetScalar(context, fpMaxVal); + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand nInt = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, nRes); + + if (op.RegisterSize == RegisterSize.Int64) + { + nInt = context.SignExtend32(OperandType.I64, nInt); + } + + Operand dRes = context.BitwiseExclusiveOr(nIntOrLong, nInt); + + SetIntOrZR(context, op.Rd, dRes); + } + else /* if (op.Size == 1) */ + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (isFixed) + { + // BitConverter.Int64BitsToDouble(fpScaled) == Math.Pow(2d, op.FBits) + long fpScaled = 0x3FF0000000000000L + op.FBits * 0x10000000000000L; + + Operand fpScaledMask = X86GetScalar(context, fpScaled); + + nRes = context.AddIntrinsic(Intrinsic.X86Mulsd, nRes, fpScaledMask); + } + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } + + Operand nIntOrLong = op.RegisterSize == RegisterSize.Int32 + ? context.AddIntrinsicInt (Intrinsic.X86Cvtsd2si, nRes) + : context.AddIntrinsicLong(Intrinsic.X86Cvtsd2si, nRes); + + long fpMaxVal = op.RegisterSize == RegisterSize.Int32 + ? 0x41E0000000000000L // 2147483648.0000000d (2147483648) + : 0x43E0000000000000L; // 9.2233720368547760E18d (9223372036854775808) + + Operand fpMaxValMask = X86GetScalar(context, fpMaxVal); + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand nLong = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, nRes); + + if (op.RegisterSize == RegisterSize.Int32) + { + nLong = context.ConvertI64ToI32(nLong); + } + + Operand dRes = context.BitwiseExclusiveOr(nIntOrLong, nLong); + + SetIntOrZR(context, op.Rd, dRes); + } + } + + private static void EmitSse41Fcvtu_Gp(ArmEmitterContext context, FPRoundingMode roundMode, bool isFixed) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + Operand n = GetVec(op.Rn); + + if (op.Size == 0) + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (isFixed) + { + // BitConverter.Int32BitsToSingle(fpScaled) == MathF.Pow(2f, op.FBits) + int fpScaled = 0x3F800000 + op.FBits * 0x800000; + + Operand fpScaledMask = X86GetScalar(context, fpScaled); + + nRes = context.AddIntrinsic(Intrinsic.X86Mulss, nRes, fpScaledMask); + } + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } + + Operand zero = context.VectorZero(); + + Operand nCmp = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + int fpMaxVal = op.RegisterSize == RegisterSize.Int32 + ? 0x4F000000 // 2.14748365E9f (2147483648) + : 0x5F000000; // 9.223372E18f (9223372036854775808) + + Operand fpMaxValMask = X86GetScalar(context, fpMaxVal); + + Operand nIntOrLong = op.RegisterSize == RegisterSize.Int32 + ? 
context.AddIntrinsicInt (Intrinsic.X86Cvtss2si, nRes) + : context.AddIntrinsicLong(Intrinsic.X86Cvtss2si, nRes); + + nRes = context.AddIntrinsic(Intrinsic.X86Subss, nRes, fpMaxValMask); + + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + Operand nIntOrLong2 = op.RegisterSize == RegisterSize.Int32 + ? context.AddIntrinsicInt (Intrinsic.X86Cvtss2si, nRes) + : context.AddIntrinsicLong(Intrinsic.X86Cvtss2si, nRes); + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand nInt = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, nRes); + + if (op.RegisterSize == RegisterSize.Int64) + { + nInt = context.SignExtend32(OperandType.I64, nInt); + } + + Operand dRes = context.BitwiseExclusiveOr(nIntOrLong2, nInt); + dRes = context.Add(dRes, nIntOrLong); + + SetIntOrZR(context, op.Rd, dRes); + } + else /* if (op.Size == 1) */ + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (isFixed) + { + // BitConverter.Int64BitsToDouble(fpScaled) == Math.Pow(2d, op.FBits) + long fpScaled = 0x3FF0000000000000L + op.FBits * 0x10000000000000L; + + Operand fpScaledMask = X86GetScalar(context, fpScaled); + + nRes = context.AddIntrinsic(Intrinsic.X86Mulsd, nRes, fpScaledMask); + } + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } + + Operand zero = context.VectorZero(); + + Operand nCmp = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + long fpMaxVal = op.RegisterSize == RegisterSize.Int32 + ? 0x41E0000000000000L // 2147483648.0000000d (2147483648) + : 0x43E0000000000000L; // 9.2233720368547760E18d (9223372036854775808) + + Operand fpMaxValMask = X86GetScalar(context, fpMaxVal); + + Operand nIntOrLong = op.RegisterSize == RegisterSize.Int32 + ? context.AddIntrinsicInt (Intrinsic.X86Cvtsd2si, nRes) + : context.AddIntrinsicLong(Intrinsic.X86Cvtsd2si, nRes); + + nRes = context.AddIntrinsic(Intrinsic.X86Subsd, nRes, fpMaxValMask); + + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + Operand nIntOrLong2 = op.RegisterSize == RegisterSize.Int32 + ? context.AddIntrinsicInt (Intrinsic.X86Cvtsd2si, nRes) + : context.AddIntrinsicLong(Intrinsic.X86Cvtsd2si, nRes); + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand nLong = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, nRes); + + if (op.RegisterSize == RegisterSize.Int32) + { + nLong = context.ConvertI64ToI32(nLong); + } + + Operand dRes = context.BitwiseExclusiveOr(nIntOrLong2, nLong); + dRes = context.Add(dRes, nIntOrLong); + + SetIntOrZR(context, op.Rd, dRes); + } + } + + private static Operand EmitVectorLongExtract(ArmEmitterContext context, int reg, int index, int size) + { + OperandType type = size == 3 ? 
OperandType.I64 : OperandType.I32; + + return context.VectorExtract(type, GetVec(reg), index); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/src/ARMeilleure/Instructions/InstEmitSimdCvt32.cs new file mode 100644 index 00000000..33ae83df --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdCvt32.cs @@ -0,0 +1,800 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper32; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + private static int FlipVdBits(int vd, bool lowBit) + { + if (lowBit) + { + // Move the low bit to the top. + return ((vd & 0x1) << 4) | (vd >> 1); + } + else + { + // Move the high bit to the bottom. + return ((vd & 0xf) << 1) | (vd >> 4); + } + } + + private static Operand EmitSaturateFloatToInt(ArmEmitterContext context, Operand op1, bool unsigned) + { + MethodInfo info; + + if (op1.Type == OperandType.FP64) + { + info = unsigned + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToU32)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF64ToS32)); + } + else + { + info = unsigned + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToU32)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToS32)); + } + + return context.Call(info, op1); + } + + public static void Vcvt_V(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + bool unsigned = (op.Opc & 1) != 0; + bool toInteger = (op.Opc & 2) != 0; + OperandType floatSize = (op.Size == 2) ? OperandType.FP32 : OperandType.FP64; + + if (toInteger) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, unsigned ? Intrinsic.Arm64FcvtzuV : Intrinsic.Arm64FcvtzsV); + } + else if (Optimizations.UseSse41) + { + EmitSse41ConvertVector32(context, FPRoundingMode.TowardsZero, !unsigned); + } + else + { + EmitVectorUnaryOpF32(context, (op1) => + { + return EmitSaturateFloatToInt(context, op1, unsigned); + }); + } + } + else + { + if (Optimizations.UseSse2) + { + EmitVectorUnaryOpSimd32(context, (n) => + { + if (unsigned) + { + Operand mask = X86GetAllElements(context, 0x47800000); + + Operand res = context.AddIntrinsic(Intrinsic.X86Psrld, n, Const(16)); + res = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, res); + res = context.AddIntrinsic(Intrinsic.X86Mulps, res, mask); + + Operand res2 = context.AddIntrinsic(Intrinsic.X86Pslld, n, Const(16)); + res2 = context.AddIntrinsic(Intrinsic.X86Psrld, res2, Const(16)); + res2 = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, res2); + + return context.AddIntrinsic(Intrinsic.X86Addps, res, res2); + } + else + { + return context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, n); + } + }); + } + else + { + if (unsigned) + { + EmitVectorUnaryOpZx32(context, (op1) => EmitFPConvert(context, op1, floatSize, false)); + } + else + { + EmitVectorUnaryOpSx32(context, (op1) => EmitFPConvert(context, op1, floatSize, true)); + } + } + } + } + + public static void Vcvt_FD(ArmEmitterContext context) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + int vm = op.Vm; + int vd; + if (op.Size == 3) + { + vd = FlipVdBits(op.Vd, false); + // Double to single. 
+ Operand fp = ExtractScalar(context, OperandType.FP64, vm); + + Operand res = context.ConvertToFP(OperandType.FP32, fp); + + InsertScalar(context, vd, res); + } + else + { + vd = FlipVdBits(op.Vd, true); + // Single to double. + Operand fp = ExtractScalar(context, OperandType.FP32, vm); + + Operand res = context.ConvertToFP(OperandType.FP64, fp); + + InsertScalar(context, vd, res); + } + } + + // VCVT (floating-point to integer, floating-point) | VCVT (integer to floating-point, floating-point). + public static void Vcvt_FI(ArmEmitterContext context) + { + OpCode32SimdCvtFI op = (OpCode32SimdCvtFI)context.CurrOp; + + bool toInteger = (op.Opc2 & 0b100) != 0; + + OperandType floatSize = op.RegisterSize == RegisterSize.Int64 ? OperandType.FP64 : OperandType.FP32; + + if (toInteger) + { + bool unsigned = (op.Opc2 & 1) == 0; + bool roundWithFpscr = op.Opc != 1; + + if (!roundWithFpscr && Optimizations.UseAdvSimd) + { + bool doubleSize = floatSize == OperandType.FP64; + + if (doubleSize) + { + Operand m = GetVecA32(op.Vm >> 1); + + Operand toConvert = InstEmitSimdHelper32Arm64.EmitExtractScalar(context, m, op.Vm, doubleSize); + + Intrinsic inst = (unsigned ? Intrinsic.Arm64FcvtzuGp : Intrinsic.Arm64FcvtzsGp) | Intrinsic.Arm64VDouble; + + Operand asInteger = context.AddIntrinsicInt(inst, toConvert); + + InsertScalar(context, op.Vd, asInteger); + } + else + { + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, unsigned ? Intrinsic.Arm64FcvtzuS : Intrinsic.Arm64FcvtzsS); + } + } + else if (!roundWithFpscr && Optimizations.UseSse41) + { + EmitSse41ConvertInt32(context, FPRoundingMode.TowardsZero, !unsigned); + } + else + { + Operand toConvert = ExtractScalar(context, floatSize, op.Vm); + + // TODO: Fast Path. + if (roundWithFpscr) + { + toConvert = EmitRoundByRMode(context, toConvert); + } + + // Round towards zero. + Operand asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned); + + InsertScalar(context, op.Vd, asInteger); + } + } + else + { + bool unsigned = op.Opc == 0; + + Operand toConvert = ExtractScalar(context, OperandType.I32, op.Vm); + + Operand asFloat = EmitFPConvert(context, toConvert, floatSize, !unsigned); + + InsertScalar(context, op.Vd, asFloat); + } + } + + private static Operand EmitRoundMathCall(ArmEmitterContext context, MidpointRounding roundMode, Operand n) + { + IOpCode32Simd op = (IOpCode32Simd)context.CurrOp; + + string name = nameof(Math.Round); + + MethodInfo info = (op.Size & 1) == 0 + ? typeof(MathF).GetMethod(name, new Type[] { typeof(float), typeof(MidpointRounding) }) + : typeof(Math). GetMethod(name, new Type[] { typeof(double), typeof(MidpointRounding) }); + + return context.Call(info, n, Const((int)roundMode)); + } + + private static FPRoundingMode RMToRoundMode(int rm) + { + FPRoundingMode roundMode; + switch (rm) + { + case 0b00: + roundMode = FPRoundingMode.ToNearestAway; + break; + case 0b01: + roundMode = FPRoundingMode.ToNearest; + break; + case 0b10: + roundMode = FPRoundingMode.TowardsPlusInfinity; + break; + case 0b11: + roundMode = FPRoundingMode.TowardsMinusInfinity; + break; + default: + throw new ArgumentOutOfRangeException(nameof(rm)); + } + return roundMode; + } + + // VCVTA/M/N/P (floating-point). + public static void Vcvt_RM(ArmEmitterContext context) + { + OpCode32SimdCvtFI op = (OpCode32SimdCvtFI)context.CurrOp; // toInteger == true (opCode<18> == 1 => Opc2<2> == 1). + + OperandType floatSize = op.RegisterSize == RegisterSize.Int64 ? 
OperandType.FP64 : OperandType.FP32; + + bool unsigned = op.Opc == 0; + int rm = op.Opc2 & 3; + + Intrinsic inst; + + if (Optimizations.UseAdvSimd) + { + if (unsigned) + { + inst = rm switch { + 0b00 => Intrinsic.Arm64FcvtauS, + 0b01 => Intrinsic.Arm64FcvtnuS, + 0b10 => Intrinsic.Arm64FcvtpuS, + 0b11 => Intrinsic.Arm64FcvtmuS, + _ => throw new ArgumentOutOfRangeException(nameof(rm)) + }; + } + else + { + inst = rm switch { + 0b00 => Intrinsic.Arm64FcvtasS, + 0b01 => Intrinsic.Arm64FcvtnsS, + 0b10 => Intrinsic.Arm64FcvtpsS, + 0b11 => Intrinsic.Arm64FcvtmsS, + _ => throw new ArgumentOutOfRangeException(nameof(rm)) + }; + } + + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, inst); + } + else if (Optimizations.UseSse41) + { + EmitSse41ConvertInt32(context, RMToRoundMode(rm), !unsigned); + } + else + { + Operand toConvert = ExtractScalar(context, floatSize, op.Vm); + + switch (rm) + { + case 0b00: // Away + toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert); + break; + case 0b01: // Nearest + toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert); + break; + case 0b10: // Towards positive infinity + toConvert = EmitUnaryMathCall(context, nameof(Math.Ceiling), toConvert); + break; + case 0b11: // Towards negative infinity + toConvert = EmitUnaryMathCall(context, nameof(Math.Floor), toConvert); + break; + } + + Operand asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned); + + InsertScalar(context, op.Vd, asInteger); + } + } + + public static void Vcvt_TB(ArmEmitterContext context) + { + OpCode32SimdCvtTB op = (OpCode32SimdCvtTB)context.CurrOp; + + if (Optimizations.UseF16c) + { + Debug.Assert(!Optimizations.ForceLegacySse); + + if (op.Op) + { + Operand res = ExtractScalar(context, op.Size == 1 ? OperandType.FP64 : OperandType.FP32, op.Vm); + if (op.Size == 1) + { + res = context.AddIntrinsic(Intrinsic.X86Cvtsd2ss, context.VectorZero(), res); + } + res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, res, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + res = context.VectorExtract16(res, 0); + InsertScalar16(context, op.Vd, op.T, res); + } + else + { + Operand res = context.VectorCreateScalar(ExtractScalar16(context, op.Vm, op.T)); + res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res); + if (op.Size == 1) + { + res = context.AddIntrinsic(Intrinsic.X86Cvtss2sd, context.VectorZero(), res); + } + res = context.VectorExtract(op.Size == 1 ? OperandType.I64 : OperandType.I32, res, 0); + InsertScalar(context, op.Vd, res); + } + } + else + { + if (op.Op) + { + // Convert to half. + + Operand src = ExtractScalar(context, op.Size == 1 ? OperandType.FP64 : OperandType.FP32, op.Vm); + + MethodInfo method = op.Size == 1 + ? typeof(SoftFloat64_16).GetMethod(nameof(SoftFloat64_16.FPConvert)) + : typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)); + + context.ExitArmFpMode(); + context.StoreToContext(); + Operand res = context.Call(method, src); + context.LoadFromContext(); + context.EnterArmFpMode(); + + InsertScalar16(context, op.Vd, op.T, res); + } + else + { + // Convert from half. + + Operand src = ExtractScalar16(context, op.Vm, op.T); + + MethodInfo method = op.Size == 1 + ? 
typeof(SoftFloat16_64).GetMethod(nameof(SoftFloat16_64.FPConvert)) + : typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)); + + context.ExitArmFpMode(); + context.StoreToContext(); + Operand res = context.Call(method, src); + context.LoadFromContext(); + context.EnterArmFpMode(); + + InsertScalar(context, op.Vd, res); + } + } + } + + // VRINTA/M/N/P (floating-point). + public static void Vrint_RM(ArmEmitterContext context) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + OperandType floatSize = op.RegisterSize == RegisterSize.Int64 ? OperandType.FP64 : OperandType.FP32; + + int rm = op.Opc2 & 3; + + if (Optimizations.UseAdvSimd) + { + Intrinsic inst = rm switch { + 0b00 => Intrinsic.Arm64FrintaS, + 0b01 => Intrinsic.Arm64FrintnS, + 0b10 => Intrinsic.Arm64FrintpS, + 0b11 => Intrinsic.Arm64FrintmS, + _ => throw new ArgumentOutOfRangeException(nameof(rm)) + }; + + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, inst); + } + else if (Optimizations.UseSse41) + { + EmitScalarUnaryOpSimd32(context, (m) => + { + FPRoundingMode roundMode = RMToRoundMode(rm); + + if (roundMode != FPRoundingMode.ToNearestAway) + { + Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd; + return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(roundMode))); + } + else + { + return EmitSse41RoundToNearestWithTiesToAwayOpF(context, m, scalar: true); + } + }); + } + else + { + Operand toConvert = ExtractScalar(context, floatSize, op.Vm); + + switch (rm) + { + case 0b00: // Away + toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert); + break; + case 0b01: // Nearest + toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert); + break; + case 0b10: // Towards positive infinity + toConvert = EmitUnaryMathCall(context, nameof(Math.Ceiling), toConvert); + break; + case 0b11: // Towards negative infinity + toConvert = EmitUnaryMathCall(context, nameof(Math.Floor), toConvert); + break; + } + + InsertScalar(context, op.Vd, toConvert); + } + } + + // VRINTA (vector). + public static void Vrinta_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FrintaS); + } + else + { + EmitVectorUnaryOpF32(context, (m) => EmitRoundMathCall(context, MidpointRounding.AwayFromZero, m)); + } + } + + // VRINTM (vector). + public static void Vrintm_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FrintmS); + } + else if (Optimizations.UseSse2) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + return context.AddIntrinsic(Intrinsic.X86Roundps, m, Const(X86GetRoundControl(FPRoundingMode.TowardsMinusInfinity))); + }); + } + else + { + EmitVectorUnaryOpF32(context, (m) => EmitUnaryMathCall(context, nameof(Math.Floor), m)); + } + } + + // VRINTN (vector). + public static void Vrintn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FrintnS); + } + else if (Optimizations.UseSse2) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + return context.AddIntrinsic(Intrinsic.X86Roundps, m, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); + }); + } + else + { + EmitVectorUnaryOpF32(context, (m) => EmitRoundMathCall(context, MidpointRounding.ToEven, m)); + } + } + + // VRINTP (vector). 
+ public static void Vrintp_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FrintpS); + } + else if (Optimizations.UseSse2) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + return context.AddIntrinsic(Intrinsic.X86Roundps, m, Const(X86GetRoundControl(FPRoundingMode.TowardsPlusInfinity))); + }); + } + else + { + EmitVectorUnaryOpF32(context, (m) => EmitUnaryMathCall(context, nameof(Math.Ceiling), m)); + } + } + + // VRINTZ (floating-point). + public static void Vrint_Z(ArmEmitterContext context) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FrintzS); + } + else if (Optimizations.UseSse2) + { + EmitScalarUnaryOpSimd32(context, (m) => + { + Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd; + return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(FPRoundingMode.TowardsZero))); + }); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Truncate), op1)); + } + } + + // VRINTX (floating-point). + public static void Vrintx_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FrintxS); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => + { + return EmitRoundByRMode(context, op1); + }); + } + } + + private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, OperandType type, bool signed) + { + Debug.Assert(value.Type == OperandType.I32 || value.Type == OperandType.I64); + + if (signed) + { + return context.ConvertToFP(type, value); + } + else + { + return context.ConvertToFPUI(type, value); + } + } + + private static void EmitSse41ConvertInt32(ArmEmitterContext context, FPRoundingMode roundMode, bool signed) + { + // A port of the similar round function in InstEmitSimdCvt. + OpCode32SimdCvtFI op = (OpCode32SimdCvtFI)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 
1 : 2; + Operand n = GetVecA32(op.Vm >> shift); + n = EmitSwapScalar(context, n, op.Vm, doubleSize); + + if (!doubleSize) + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } + + Operand zero = context.VectorZero(); + + Operand nCmp; + Operand nIntOrLong2 = default; + + if (!signed) + { + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + } + + int fpMaxVal = 0x4F000000; // 2.14748365E9f (2147483648) + + Operand fpMaxValMask = X86GetScalar(context, fpMaxVal); + + Operand nIntOrLong = context.AddIntrinsicInt(Intrinsic.X86Cvtss2si, nRes); + + if (!signed) + { + nRes = context.AddIntrinsic(Intrinsic.X86Subss, nRes, fpMaxValMask); + + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + nIntOrLong2 = context.AddIntrinsicInt(Intrinsic.X86Cvtss2si, nRes); + } + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand nInt = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, nRes); + + Operand dRes; + if (signed) + { + dRes = context.BitwiseExclusiveOr(nIntOrLong, nInt); + } + else + { + dRes = context.BitwiseExclusiveOr(nIntOrLong2, nInt); + dRes = context.Add(dRes, nIntOrLong); + } + + InsertScalar(context, op.Vd, dRes); + } + else + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + if (roundMode != FPRoundingMode.ToNearestAway) + { + nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); + } + else + { + nRes = EmitSse41RoundToNearestWithTiesToAwayOpF(context, nRes, scalar: true); + } + + Operand zero = context.VectorZero(); + + Operand nCmp; + Operand nIntOrLong2 = default; + + if (!signed) + { + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + } + + long fpMaxVal = 0x41E0000000000000L; // 2147483648.0000000d (2147483648) + + Operand fpMaxValMask = X86GetScalar(context, fpMaxVal); + + Operand nIntOrLong = context.AddIntrinsicInt(Intrinsic.X86Cvtsd2si, nRes); + + if (!signed) + { + nRes = context.AddIntrinsic(Intrinsic.X86Subsd, nRes, fpMaxValMask); + + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + nIntOrLong2 = context.AddIntrinsicInt(Intrinsic.X86Cvtsd2si, nRes); + } + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + Operand nLong = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, nRes); + nLong = context.ConvertI64ToI32(nLong); + + Operand dRes; + if (signed) + { + dRes = context.BitwiseExclusiveOr(nIntOrLong, nLong); + } + else + { + dRes = context.BitwiseExclusiveOr(nIntOrLong2, nLong); + dRes = context.Add(dRes, nIntOrLong); + } + + InsertScalar(context, op.Vd, dRes); + } + } + + 
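+ // The unsigned paths above (EmitSse41FcvtuOpF, EmitSse41Fcvtu_Gp and EmitSse41ConvertInt32)
+ // all rely on the same two-pass trick, because CVTSS2SI/CVTSD2SI only produce signed results.
+ // A sketch for a 32-bit result, once NaNs and negatives have been masked to zero:
+ //   x <  2^31: result = cvt(x)                   (the second pass clamps to zero and adds 0)
+ //   x >= 2^31: result = cvt(x) + cvt(x - 2^31)   (cvt(x) saturates to 0x80000000, so the sum
+ //                                                 reconstructs the unsigned value)
+ // For inputs of 2^32 or more the final NotLessThan compare yields an all-ones mask; xoring it
+ // into the second conversion gives 0x7FFFFFFF, and adding the saturated 0x80000000 produces
+ // 0xFFFFFFFF, i.e. unsigned saturation. The 64-bit variants apply the same idea with 2^63.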
private static void EmitSse41ConvertVector32(ArmEmitterContext context, FPRoundingMode roundMode, bool signed) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + EmitVectorUnaryOpSimd32(context, (n) => + { + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode))); + + Operand zero = context.VectorZero(); + Operand nCmp; + if (!signed) + { + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + } + + Operand fpMaxValMask = X86GetAllElements(context, 0x4F000000); // 2.14748365E9f (2147483648) + + Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes); + Operand nInt2 = default; + + if (!signed) + { + nRes = context.AddIntrinsic(Intrinsic.X86Subps, nRes, fpMaxValMask); + + nCmp = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + nInt2 = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes); + } + + nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + if (signed) + { + return context.AddIntrinsic(Intrinsic.X86Pxor, nInt, nRes); + } + else + { + Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nInt2, nRes); + return context.AddIntrinsic(Intrinsic.X86Paddd, dRes, nInt); + } + } + else /* if (sizeF == 1) */ + { + Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, n, n, Const((int)CmpCondition.OrderedQ)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); + + nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode))); + + Operand zero = context.VectorZero(); + Operand nCmp; + if (!signed) + { + nCmp = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + } + + Operand fpMaxValMask = X86GetAllElements(context, 0x43E0000000000000L); // 9.2233720368547760E18d (9223372036854775808) + + Operand nLong = InstEmit.EmitSse2CvtDoubleToInt64OpF(context, nRes, false); + Operand nLong2 = default; + + if (!signed) + { + nRes = context.AddIntrinsic(Intrinsic.X86Subpd, nRes, fpMaxValMask); + + nCmp = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); + nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); + + nLong2 = InstEmit.EmitSse2CvtDoubleToInt64OpF(context, nRes, false); + } + + nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); + + if (signed) + { + return context.AddIntrinsic(Intrinsic.X86Pxor, nLong, nRes); + } + else + { + Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nLong2, nRes); + return context.AddIntrinsic(Intrinsic.X86Paddq, dRes, nLong); + } + } + }); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdHash.cs b/src/ARMeilleure/Instructions/InstEmitSimdHash.cs new file mode 100644 index 00000000..4fb048ee --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdHash.cs @@ -0,0 +1,147 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; + +namespace 
ARMeilleure.Instructions +{ + static partial class InstEmit + { +#region "Sha1" + public static void Sha1c_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + + Operand ne = context.VectorExtract(OperandType.I32, GetVec(op.Rn), 0); + + Operand m = GetVec(op.Rm); + + Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.HashChoose)), d, ne, m); + + context.Copy(GetVec(op.Rd), res); + } + + public static void Sha1h_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand ne = context.VectorExtract(OperandType.I32, GetVec(op.Rn), 0); + + Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.FixedRotate)), ne); + + context.Copy(GetVec(op.Rd), context.VectorCreateScalar(res)); + } + + public static void Sha1m_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + + Operand ne = context.VectorExtract(OperandType.I32, GetVec(op.Rn), 0); + + Operand m = GetVec(op.Rm); + + Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.HashMajority)), d, ne, m); + + context.Copy(GetVec(op.Rd), res); + } + + public static void Sha1p_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + + Operand ne = context.VectorExtract(OperandType.I32, GetVec(op.Rn), 0); + + Operand m = GetVec(op.Rm); + + Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.HashParity)), d, ne, m); + + context.Copy(GetVec(op.Rd), res); + } + + public static void Sha1su0_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Sha1SchedulePart1)), d, n, m); + + context.Copy(GetVec(op.Rd), res); + } + + public static void Sha1su1_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Sha1SchedulePart2)), d, n); + + context.Copy(GetVec(op.Rd), res); + } +#endregion + +#region "Sha256" + public static void Sha256h_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = InstEmitSimdHashHelper.EmitSha256h(context, d, n, m, part2: false); + + context.Copy(GetVec(op.Rd), res); + } + + public static void Sha256h2_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = InstEmitSimdHashHelper.EmitSha256h(context, n, d, m, part2: true); + + context.Copy(GetVec(op.Rd), res); + } + + public static void Sha256su0_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Operand res = InstEmitSimdHashHelper.EmitSha256su0(context, d, n); + + context.Copy(GetVec(op.Rd), res); + } + + public static void Sha256su1_V(ArmEmitterContext context) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = 
InstEmitSimdHashHelper.EmitSha256su1(context, d, n, m); + + context.Copy(GetVec(op.Rd), res); + } +#endregion + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdHash32.cs b/src/ARMeilleure/Instructions/InstEmitSimdHash32.cs new file mode 100644 index 00000000..51334608 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdHash32.cs @@ -0,0 +1,64 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { +#region "Sha256" + public static void Sha256h_V(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand d = GetVecA32(op.Qd); + Operand n = GetVecA32(op.Qn); + Operand m = GetVecA32(op.Qm); + + Operand res = InstEmitSimdHashHelper.EmitSha256h(context, d, n, m, part2: false); + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Sha256h2_V(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand d = GetVecA32(op.Qd); + Operand n = GetVecA32(op.Qn); + Operand m = GetVecA32(op.Qm); + + Operand res = InstEmitSimdHashHelper.EmitSha256h(context, n, d, m, part2: true); + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Sha256su0_V(ArmEmitterContext context) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand d = GetVecA32(op.Qd); + Operand m = GetVecA32(op.Qm); + + Operand res = InstEmitSimdHashHelper.EmitSha256su0(context, d, m); + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Sha256su1_V(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand d = GetVecA32(op.Qd); + Operand n = GetVecA32(op.Qn); + Operand m = GetVecA32(op.Qm); + + Operand res = InstEmitSimdHashHelper.EmitSha256su1(context, d, n, m); + + context.Copy(GetVecA32(op.Qd), res); + } +#endregion + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdHashHelper.cs b/src/ARMeilleure/Instructions/InstEmitSimdHashHelper.cs new file mode 100644 index 00000000..23e4948d --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdHashHelper.cs @@ -0,0 +1,56 @@ +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static class InstEmitSimdHashHelper + { + public static Operand EmitSha256h(ArmEmitterContext context, Operand x, Operand y, Operand w, bool part2) + { + if (Optimizations.UseSha) + { + Operand src1 = context.AddIntrinsic(Intrinsic.X86Shufps, y, x, Const(0xbb)); + Operand src2 = context.AddIntrinsic(Intrinsic.X86Shufps, y, x, Const(0x11)); + Operand w2 = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, w, w); + + Operand round2 = context.AddIntrinsic(Intrinsic.X86Sha256Rnds2, src1, src2, w); + Operand round4 = context.AddIntrinsic(Intrinsic.X86Sha256Rnds2, src2, round2, w2); + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, round4, round2, Const(part2 ? 0x11 : 0xbb)); + + return res; + } + + String method = part2 ? 
nameof(SoftFallback.HashUpper) : nameof(SoftFallback.HashLower); + return context.Call(typeof(SoftFallback).GetMethod(method), x, y, w); + } + + public static Operand EmitSha256su0(ArmEmitterContext context, Operand x, Operand y) + { + if (Optimizations.UseSha) + { + return context.AddIntrinsic(Intrinsic.X86Sha256Msg1, x, y); + } + + return context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Sha256SchedulePart1)), x, y); + } + + public static Operand EmitSha256su1(ArmEmitterContext context, Operand x, Operand y, Operand z) + { + if (Optimizations.UseSha && Optimizations.UseSsse3) + { + Operand extr = context.AddIntrinsic(Intrinsic.X86Palignr, z, y, Const(4)); + Operand tmp = context.AddIntrinsic(Intrinsic.X86Paddd, extr, x); + + Operand res = context.AddIntrinsic(Intrinsic.X86Sha256Msg2, tmp, z); + + return res; + } + + return context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Sha256SchedulePart2)), x, y, z); + } + } +}
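For reference, a rough managed sketch of the message-schedule step that EmitSha256su0 accelerates (via X86Sha256Msg1 on the fast path, or SoftFallback.Sha256SchedulePart1 otherwise), assuming the standard FIPS 180-4 sigma0; the element ordering and the names below are illustrative only, not taken from the emitter:

using System;
using System.Numerics;

static class Sha256ScheduleSketch
{
    // sigma0(x) = ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3), per FIPS 180-4.
    private static uint Sigma0(uint x) =>
        BitOperations.RotateRight(x, 7) ^ BitOperations.RotateRight(x, 18) ^ (x >> 3);

    // d holds schedule words w[i..i+3] and n holds w[i+4..i+7]; each d[e] is advanced by
    // sigma0 of the word that follows it, which is the update the single intrinsic performs.
    public static void SchedulePart1(Span<uint> d, ReadOnlySpan<uint> n)
    {
        for (int e = 0; e < 4; e++)
        {
            uint next = e < 3 ? d[e + 1] : n[0];
            d[e] += Sigma0(next);
        }
    }
}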
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/src/ARMeilleure/Instructions/InstEmitSimdHelper.cs new file mode 100644 index 00000000..c44c9b4d --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdHelper.cs @@ -0,0 +1,2088 @@ +using ARMeilleure.CodeGen.X86; +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + using Func1I = Func<Operand, Operand>; + using Func2I = Func<Operand, Operand, Operand>; + using Func3I = Func<Operand, Operand, Operand, Operand>; + + static class InstEmitSimdHelper + { +#region "Masks" + public static readonly long[] EvenMasks = new long[] + { + 14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0, // B + 13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0, // H + 11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0 // S + }; + + public static readonly long[] OddMasks = new long[] + { + 15L << 56 | 13L << 48 | 11L << 40 | 09L << 32 | 07L << 24 | 05L << 16 | 03L << 8 | 01L << 0, // B + 15L << 56 | 14L << 48 | 11L << 40 | 10L << 32 | 07L << 24 | 06L << 16 | 03L << 8 | 02L << 0, // H + 15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0 // S + }; + + public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0; + + public static ulong X86GetGf2p8LogicalShiftLeft(int shift) + { + ulong identity = (0b00000001UL << 56) | (0b00000010UL << 48) | (0b00000100UL << 40) | (0b00001000UL << 32) | + (0b00010000UL << 24) | (0b00100000UL << 16) | (0b01000000UL << 8) | (0b10000000UL << 0); + + return shift >= 0 ? 
identity >> (shift * 8) : identity << (-shift * 8); + } +#endregion + +#region "X86 SSE Intrinsics" + public static readonly Intrinsic[] X86PaddInstruction = new Intrinsic[] + { + Intrinsic.X86Paddb, + Intrinsic.X86Paddw, + Intrinsic.X86Paddd, + Intrinsic.X86Paddq + }; + + public static readonly Intrinsic[] X86PcmpeqInstruction = new Intrinsic[] + { + Intrinsic.X86Pcmpeqb, + Intrinsic.X86Pcmpeqw, + Intrinsic.X86Pcmpeqd, + Intrinsic.X86Pcmpeqq + }; + + public static readonly Intrinsic[] X86PcmpgtInstruction = new Intrinsic[] + { + Intrinsic.X86Pcmpgtb, + Intrinsic.X86Pcmpgtw, + Intrinsic.X86Pcmpgtd, + Intrinsic.X86Pcmpgtq + }; + + public static readonly Intrinsic[] X86PmaxsInstruction = new Intrinsic[] + { + Intrinsic.X86Pmaxsb, + Intrinsic.X86Pmaxsw, + Intrinsic.X86Pmaxsd + }; + + public static readonly Intrinsic[] X86PmaxuInstruction = new Intrinsic[] + { + Intrinsic.X86Pmaxub, + Intrinsic.X86Pmaxuw, + Intrinsic.X86Pmaxud + }; + + public static readonly Intrinsic[] X86PminsInstruction = new Intrinsic[] + { + Intrinsic.X86Pminsb, + Intrinsic.X86Pminsw, + Intrinsic.X86Pminsd + }; + + public static readonly Intrinsic[] X86PminuInstruction = new Intrinsic[] + { + Intrinsic.X86Pminub, + Intrinsic.X86Pminuw, + Intrinsic.X86Pminud + }; + + public static readonly Intrinsic[] X86PmovsxInstruction = new Intrinsic[] + { + Intrinsic.X86Pmovsxbw, + Intrinsic.X86Pmovsxwd, + Intrinsic.X86Pmovsxdq + }; + + public static readonly Intrinsic[] X86PmovzxInstruction = new Intrinsic[] + { + Intrinsic.X86Pmovzxbw, + Intrinsic.X86Pmovzxwd, + Intrinsic.X86Pmovzxdq + }; + + public static readonly Intrinsic[] X86PsllInstruction = new Intrinsic[] + { + 0, + Intrinsic.X86Psllw, + Intrinsic.X86Pslld, + Intrinsic.X86Psllq + }; + + public static readonly Intrinsic[] X86PsraInstruction = new Intrinsic[] + { + 0, + Intrinsic.X86Psraw, + Intrinsic.X86Psrad + }; + + public static readonly Intrinsic[] X86PsrlInstruction = new Intrinsic[] + { + 0, + Intrinsic.X86Psrlw, + Intrinsic.X86Psrld, + Intrinsic.X86Psrlq + }; + + public static readonly Intrinsic[] X86PsubInstruction = new Intrinsic[] + { + Intrinsic.X86Psubb, + Intrinsic.X86Psubw, + Intrinsic.X86Psubd, + Intrinsic.X86Psubq + }; + + public static readonly Intrinsic[] X86PunpckhInstruction = new Intrinsic[] + { + Intrinsic.X86Punpckhbw, + Intrinsic.X86Punpckhwd, + Intrinsic.X86Punpckhdq, + Intrinsic.X86Punpckhqdq + }; + + public static readonly Intrinsic[] X86PunpcklInstruction = new Intrinsic[] + { + Intrinsic.X86Punpcklbw, + Intrinsic.X86Punpcklwd, + Intrinsic.X86Punpckldq, + Intrinsic.X86Punpcklqdq + }; +#endregion + + public static void EnterArmFpMode(EmitterContext context, Func<FPState, Operand> getFpFlag) + { + if (Optimizations.UseSse2) + { + Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr); + + Operand fzTrue = getFpFlag(FPState.FzFlag); + Operand r0True = getFpFlag(FPState.RMode0Flag); + Operand r1True = getFpFlag(FPState.RMode1Flag); + + mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo))); + + mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(fzTrue, Const((int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Um | Mxcsr.Dm)), Const(0))); + + // X86 round modes in order: nearest, negative, positive, zero + // ARM round modes in order: nearest, positive, negative, zero + // Read the bits backwards to correct this. 
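// Concretely (assuming Mxcsr.Rhi and Mxcsr.Rlo name the high and low MXCSR.RC bits):
//
//   ARM FPCR.RMode          x86 MXCSR.RC
//   00  to nearest     ->   00  to nearest
//   01  towards +inf   ->   10  towards +inf
//   10  towards -inf   ->   01  towards -inf
//   11  towards zero   ->   11  towards zero
//
// so RMode bit 0 drives the high RC bit and RMode bit 1 drives the low RC bit below.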
+ + mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r0True, Const((int)Mxcsr.Rhi), Const(0))); + mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r1True, Const((int)Mxcsr.Rlo), Const(0))); + + context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr); + } + else if (Optimizations.UseAdvSimd) + { + Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr); + + Operand fzTrue = getFpFlag(FPState.FzFlag); + Operand r0True = getFpFlag(FPState.RMode0Flag); + Operand r1True = getFpFlag(FPState.RMode1Flag); + + fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1))); + + fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(fzTrue, Const((int)FPCR.Fz), Const(0))); + fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r0True, Const((int)FPCR.RMode0), Const(0))); + fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r1True, Const((int)FPCR.RMode1), Const(0))); + + context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr); + + // TODO: Restore FPSR + } + } + + public static void ExitArmFpMode(EmitterContext context, Action<FPState, Operand> setFpFlag) + { + if (Optimizations.UseSse2) + { + Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr); + + // Unset round mode (to nearest) and ftz. + mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo))); + + context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr); + + // Status flags would be stored here if they were used. + } + else if (Optimizations.UseAdvSimd) + { + Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr); + + // Unset round mode (to nearest) and fz. + fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1))); + + context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr); + + // TODO: Store FPSR + } + } + + public static int GetImmShl(OpCodeSimdShImm op) + { + return op.Imm - (8 << op.Size); + } + + public static int GetImmShr(OpCodeSimdShImm op) + { + return (8 << (op.Size + 1)) - op.Imm; + } + + public static Operand X86GetScalar(ArmEmitterContext context, float value) + { + return X86GetScalar(context, BitConverter.SingleToInt32Bits(value)); + } + + public static Operand X86GetScalar(ArmEmitterContext context, double value) + { + return X86GetScalar(context, BitConverter.DoubleToInt64Bits(value)); + } + + public static Operand X86GetScalar(ArmEmitterContext context, int value) + { + return context.VectorCreateScalar(Const(value)); + } + + public static Operand X86GetScalar(ArmEmitterContext context, long value) + { + return context.VectorCreateScalar(Const(value)); + } + + public static Operand X86GetAllElements(ArmEmitterContext context, float value) + { + return X86GetAllElements(context, BitConverter.SingleToInt32Bits(value)); + } + + public static Operand X86GetAllElements(ArmEmitterContext context, double value) + { + return X86GetAllElements(context, BitConverter.DoubleToInt64Bits(value)); + } + + public static Operand X86GetAllElements(ArmEmitterContext context, short value) + { + ulong value1 = (ushort)value; + ulong value2 = value1 << 16 | value1; + ulong value4 = value2 << 32 | value2; + + return X86GetAllElements(context, (long)value4); + } + + public static Operand X86GetAllElements(ArmEmitterContext context, int value) + { + Operand vector = context.VectorCreateScalar(Const(value)); + + vector = context.AddIntrinsic(Intrinsic.X86Shufps, vector, vector, Const(0)); + + return vector; + } + + public static Operand X86GetAllElements(ArmEmitterContext context, long value) + { + Operand 
vector = context.VectorCreateScalar(Const(value)); + + vector = context.AddIntrinsic(Intrinsic.X86Movlhps, vector, vector); + + return vector; + } + + public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0) + { + return X86GetElements(context, (ulong)e1, (ulong)e0); + } + + public static Operand X86GetElements(ArmEmitterContext context, ulong e1, ulong e0) + { + Operand vector0 = context.VectorCreateScalar(Const(e0)); + Operand vector1 = context.VectorCreateScalar(Const(e1)); + + return context.AddIntrinsic(Intrinsic.X86Punpcklqdq, vector0, vector1); + } + + public static int X86GetRoundControl(FPRoundingMode roundMode) + { + switch (roundMode) + { + case FPRoundingMode.ToNearest: return 8 | 0; // even + case FPRoundingMode.TowardsPlusInfinity: return 8 | 2; + case FPRoundingMode.TowardsMinusInfinity: return 8 | 1; + case FPRoundingMode.TowardsZero: return 8 | 3; + } + + throw new ArgumentException($"Invalid rounding mode \"{roundMode}\"."); + } + + public static Operand EmitSse41RoundToNearestWithTiesToAwayOpF(ArmEmitterContext context, Operand n, bool scalar) + { + Debug.Assert(n.Type == OperandType.V128); + + Operand nCopy = context.Copy(n); + + Operand rC = Const(X86GetRoundControl(FPRoundingMode.TowardsZero)); + + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + if ((op.Size & 1) == 0) + { + Operand signMask = scalar ? X86GetScalar(context, int.MinValue) : X86GetAllElements(context, int.MinValue); + signMask = context.AddIntrinsic(Intrinsic.X86Pand, signMask, nCopy); + + // 0x3EFFFFFF == BitConverter.SingleToInt32Bits(0.5f) - 1 + Operand valueMask = scalar ? X86GetScalar(context, 0x3EFFFFFF) : X86GetAllElements(context, 0x3EFFFFFF); + valueMask = context.AddIntrinsic(Intrinsic.X86Por, valueMask, signMask); + + nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Addss : Intrinsic.X86Addps, nCopy, valueMask); + + nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Roundss : Intrinsic.X86Roundps, nCopy, rC); + } + else + { + Operand signMask = scalar ? X86GetScalar(context, long.MinValue) : X86GetAllElements(context, long.MinValue); + signMask = context.AddIntrinsic(Intrinsic.X86Pand, signMask, nCopy); + + // 0x3FDFFFFFFFFFFFFFL == BitConverter.DoubleToInt64Bits(0.5d) - 1L + Operand valueMask = scalar ? X86GetScalar(context, 0x3FDFFFFFFFFFFFFFL) : X86GetAllElements(context, 0x3FDFFFFFFFFFFFFFL); + valueMask = context.AddIntrinsic(Intrinsic.X86Por, valueMask, signMask); + + nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Addsd : Intrinsic.X86Addpd, nCopy, valueMask); + + nCopy = context.AddIntrinsic(scalar ? Intrinsic.X86Roundsd : Intrinsic.X86Roundpd, nCopy, rC); + } + + return nCopy; + } + + public static Operand EmitCountSetBits8(ArmEmitterContext context, Operand op) // "size" is 8 (SIMD&FP Inst.). + { + Debug.Assert(op.Type == OperandType.I32 || op.Type == OperandType.I64); + + Operand op0 = context.Subtract(op, context.BitwiseAnd(context.ShiftRightUI(op, Const(1)), Const(op.Type, 0x55L))); + + Operand c1 = Const(op.Type, 0x33L); + Operand op1 = context.Add(context.BitwiseAnd(context.ShiftRightUI(op0, Const(2)), c1), context.BitwiseAnd(op0, c1)); + + return context.BitwiseAnd(context.Add(op1, context.ShiftRightUI(op1, Const(4))), Const(op.Type, 0x0fL)); + } + + public static void EmitScalarUnaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Intrinsic inst = (op.Size & 1) != 0 ? 
inst64 : inst32; + + Operand res = context.AddIntrinsic(inst, n); + + if ((op.Size & 1) != 0) + { + res = context.VectorZeroUpper64(res); + } + else + { + res = context.VectorZeroUpper96(res); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitScalarBinaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + Operand res = context.AddIntrinsic(inst, n, m); + + if ((op.Size & 1) != 0) + { + res = context.VectorZeroUpper64(res); + } + else + { + res = context.VectorZeroUpper96(res); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorUnaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + Operand res = context.AddIntrinsic(inst, n); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorBinaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + Operand res = context.AddIntrinsic(inst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static Operand EmitUnaryMathCall(ArmEmitterContext context, string name, Operand n) + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + MethodInfo info = (op.Size & 1) == 0 + ? typeof(MathF).GetMethod(name, new Type[] { typeof(float) }) + : typeof(Math). GetMethod(name, new Type[] { typeof(double) }); + + return context.Call(info, n); + } + + public static Operand EmitRoundMathCall(ArmEmitterContext context, MidpointRounding roundMode, Operand n) + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + string name = nameof(Math.Round); + + MethodInfo info = (op.Size & 1) == 0 + ? typeof(MathF).GetMethod(name, new Type[] { typeof(float), typeof(MidpointRounding) }) + : typeof(Math). 
GetMethod(name, new Type[] { typeof(double), typeof(MidpointRounding) }); + + return context.Call(info, n, Const((int)roundMode)); + } + + public static Operand EmitGetRoundingMode(ArmEmitterContext context) + { + Operand rMode = context.ShiftLeft(GetFpFlag(FPState.RMode1Flag), Const(1)); + rMode = context.BitwiseOr(rMode, GetFpFlag(FPState.RMode0Flag)); + + return rMode; + } + + public static Operand EmitRoundByRMode(ArmEmitterContext context, Operand op) + { + Debug.Assert(op.Type == OperandType.FP32 || op.Type == OperandType.FP64); + + Operand lbl1 = Label(); + Operand lbl2 = Label(); + Operand lbl3 = Label(); + Operand lblEnd = Label(); + + Operand rN = Const((int)FPRoundingMode.ToNearest); + Operand rP = Const((int)FPRoundingMode.TowardsPlusInfinity); + Operand rM = Const((int)FPRoundingMode.TowardsMinusInfinity); + + Operand res = context.AllocateLocal(op.Type); + + Operand rMode = EmitGetRoundingMode(context); + + context.BranchIf(lbl1, rMode, rN, Comparison.NotEqual); + context.Copy(res, EmitRoundMathCall(context, MidpointRounding.ToEven, op)); + context.Branch(lblEnd); + + context.MarkLabel(lbl1); + context.BranchIf(lbl2, rMode, rP, Comparison.NotEqual); + context.Copy(res, EmitUnaryMathCall(context, nameof(Math.Ceiling), op)); + context.Branch(lblEnd); + + context.MarkLabel(lbl2); + context.BranchIf(lbl3, rMode, rM, Comparison.NotEqual); + context.Copy(res, EmitUnaryMathCall(context, nameof(Math.Floor), op)); + context.Branch(lblEnd); + + context.MarkLabel(lbl3); + context.Copy(res, EmitUnaryMathCall(context, nameof(Math.Truncate), op)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + public static Operand EmitSoftFloatCall(ArmEmitterContext context, string name, params Operand[] callArgs) + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + MethodInfo info = (op.Size & 1) == 0 + ? typeof(SoftFloat32).GetMethod(name) + : typeof(SoftFloat64).GetMethod(name); + + context.ExitArmFpMode(); + context.StoreToContext(); + Operand res = context.Call(info, callArgs); + context.LoadFromContext(); + context.EnterArmFpMode(); + + return res; + } + + public static void EmitScalarBinaryOpByElemF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand n = context.VectorExtract(type, GetVec(op.Rn), 0); + Operand m = context.VectorExtract(type, GetVec(op.Rm), op.Index); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(n, m), 0)); + } + + public static void EmitScalarTernaryOpByElemF(ArmEmitterContext context, Func3I emit) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? 
OperandType.FP64 : OperandType.FP32; + + Operand d = context.VectorExtract(type, GetVec(op.Rd), 0); + Operand n = context.VectorExtract(type, GetVec(op.Rn), 0); + Operand m = context.VectorExtract(type, GetVec(op.Rm), op.Index); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(d, n, m), 0)); + } + + public static void EmitScalarUnaryOpSx(ArmEmitterContext context, Func1I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = EmitVectorExtractSx(context, op.Rn, 0, op.Size); + + Operand d = EmitVectorInsert(context, context.VectorZero(), emit(n), 0, op.Size); + + context.Copy(GetVec(op.Rd), d); + } + + public static void EmitScalarBinaryOpSx(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = EmitVectorExtractSx(context, op.Rn, 0, op.Size); + Operand m = EmitVectorExtractSx(context, op.Rm, 0, op.Size); + + Operand d = EmitVectorInsert(context, context.VectorZero(), emit(n, m), 0, op.Size); + + context.Copy(GetVec(op.Rd), d); + } + + public static void EmitScalarUnaryOpZx(ArmEmitterContext context, Func1I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = EmitVectorExtractZx(context, op.Rn, 0, op.Size); + + Operand d = EmitVectorInsert(context, context.VectorZero(), emit(n), 0, op.Size); + + context.Copy(GetVec(op.Rd), d); + } + + public static void EmitScalarBinaryOpZx(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = EmitVectorExtractZx(context, op.Rn, 0, op.Size); + Operand m = EmitVectorExtractZx(context, op.Rm, 0, op.Size); + + Operand d = EmitVectorInsert(context, context.VectorZero(), emit(n, m), 0, op.Size); + + context.Copy(GetVec(op.Rd), d); + } + + public static void EmitScalarTernaryOpZx(ArmEmitterContext context, Func3I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = EmitVectorExtractZx(context, op.Rd, 0, op.Size); + Operand n = EmitVectorExtractZx(context, op.Rn, 0, op.Size); + Operand m = EmitVectorExtractZx(context, op.Rm, 0, op.Size); + + d = EmitVectorInsert(context, context.VectorZero(), emit(d, n, m), 0, op.Size); + + context.Copy(GetVec(op.Rd), d); + } + + public static void EmitScalarUnaryOpF(ArmEmitterContext context, Func1I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand n = context.VectorExtract(type, GetVec(op.Rn), 0); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(n), 0)); + } + + public static void EmitScalarBinaryOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand n = context.VectorExtract(type, GetVec(op.Rn), 0); + Operand m = context.VectorExtract(type, GetVec(op.Rm), 0); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(n, m), 0)); + } + + public static void EmitScalarTernaryRaOpF(ArmEmitterContext context, Func3I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? 
OperandType.FP64 : OperandType.FP32; + + Operand a = context.VectorExtract(type, GetVec(op.Ra), 0); + Operand n = context.VectorExtract(type, GetVec(op.Rn), 0); + Operand m = context.VectorExtract(type, GetVec(op.Rm), 0); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(a, n, m), 0)); + } + + public static void EmitVectorUnaryOpF(ArmEmitterContext context, Func1I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, GetVec(op.Rn), index); + + res = context.VectorInsert(res, emit(ne), index); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorBinaryOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, GetVec(op.Rn), index); + Operand me = context.VectorExtract(type, GetVec(op.Rm), index); + + res = context.VectorInsert(res, emit(ne, me), index); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorTernaryOpF(ArmEmitterContext context, Func3I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + for (int index = 0; index < elems; index++) + { + Operand de = context.VectorExtract(type, GetVec(op.Rd), index); + Operand ne = context.VectorExtract(type, GetVec(op.Rn), index); + Operand me = context.VectorExtract(type, GetVec(op.Rm), index); + + res = context.VectorInsert(res, emit(de, ne, me), index); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorBinaryOpByElemF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand res = context.VectorZero(); + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, GetVec(op.Rn), index); + Operand me = context.VectorExtract(type, GetVec(op.Rm), op.Index); + + res = context.VectorInsert(res, emit(ne, me), index); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorTernaryOpByElemF(ArmEmitterContext context, Func3I emit) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand res = context.VectorZero(); + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? 
OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + for (int index = 0; index < elems; index++) + { + Operand de = context.VectorExtract(type, GetVec(op.Rd), index); + Operand ne = context.VectorExtract(type, GetVec(op.Rn), index); + Operand me = context.VectorExtract(type, GetVec(op.Rm), op.Index); + + res = context.VectorInsert(res, emit(de, ne, me), index); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorUnaryOpSx(ArmEmitterContext context, Func1I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size); + + res = EmitVectorInsert(context, res, emit(ne), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorBinaryOpSx(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size); + Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size); + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorTernaryOpSx(ArmEmitterContext context, Func3I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtractSx(context, op.Rd, index, op.Size); + Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size); + Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size); + + res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorUnaryOpZx(ArmEmitterContext context, Func1I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + res = EmitVectorInsert(context, res, emit(ne), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorBinaryOpZx(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size); + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorTernaryOpZx(ArmEmitterContext context, Func3I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size); + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size); + + res = EmitVectorInsert(context, res, 
emit(de, ne, me), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorBinaryOpByElemSx(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp; + + Operand res = context.VectorZero(); + + Operand me = EmitVectorExtractSx(context, op.Rm, op.Index, op.Size); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size); + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorBinaryOpByElemZx(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp; + + Operand res = context.VectorZero(); + + Operand me = EmitVectorExtractZx(context, op.Rm, op.Index, op.Size); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorTernaryOpByElemZx(ArmEmitterContext context, Func3I emit) + { + OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp; + + Operand res = context.VectorZero(); + + Operand me = EmitVectorExtractZx(context, op.Rm, op.Index, op.Size); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size); + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorImmUnaryOp(ArmEmitterContext context, Func1I emit) + { + OpCodeSimdImm op = (OpCodeSimdImm)context.CurrOp; + + Operand imm = Const(op.Immediate); + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + res = EmitVectorInsert(context, res, emit(imm), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorImmBinaryOp(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdImm op = (OpCodeSimdImm)context.CurrOp; + + Operand imm = Const(op.Immediate); + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size); + + res = EmitVectorInsert(context, res, emit(de, imm), index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorWidenRmBinaryOpSx(ArmEmitterContext context, Func2I emit) + { + EmitVectorWidenRmBinaryOp(context, emit, signed: true); + } + + public static void EmitVectorWidenRmBinaryOpZx(ArmEmitterContext context, Func2I emit) + { + EmitVectorWidenRmBinaryOp(context, emit, signed: false); + } + + private static void EmitVectorWidenRmBinaryOp(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? 
elems : 0; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size + 1, signed); + Operand me = EmitVectorExtract(context, op.Rm, part + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorWidenRnRmBinaryOpSx(ArmEmitterContext context, Func2I emit) + { + EmitVectorWidenRnRmBinaryOp(context, emit, signed: true); + } + + public static void EmitVectorWidenRnRmBinaryOpZx(ArmEmitterContext context, Func2I emit) + { + EmitVectorWidenRnRmBinaryOp(context, emit, signed: false); + } + + private static void EmitVectorWidenRnRmBinaryOp(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size, signed); + Operand me = EmitVectorExtract(context, op.Rm, part + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorWidenRnRmTernaryOpSx(ArmEmitterContext context, Func3I emit) + { + EmitVectorWidenRnRmTernaryOp(context, emit, signed: true); + } + + public static void EmitVectorWidenRnRmTernaryOpZx(ArmEmitterContext context, Func3I emit) + { + EmitVectorWidenRnRmTernaryOp(context, emit, signed: false); + } + + private static void EmitVectorWidenRnRmTernaryOp(ArmEmitterContext context, Func3I emit, bool signed) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed); + Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size, signed); + Operand me = EmitVectorExtract(context, op.Rm, part + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size + 1); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorWidenBinaryOpByElemSx(ArmEmitterContext context, Func2I emit) + { + EmitVectorWidenBinaryOpByElem(context, emit, signed: true); + } + + public static void EmitVectorWidenBinaryOpByElemZx(ArmEmitterContext context, Func2I emit) + { + EmitVectorWidenBinaryOpByElem(context, emit, signed: false); + } + + private static void EmitVectorWidenBinaryOpByElem(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp; + + Operand res = context.VectorZero(); + + Operand me = EmitVectorExtract(context, op.Rm, op.Index, op.Size, signed); + + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? 
elems : 0; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorWidenTernaryOpByElemSx(ArmEmitterContext context, Func3I emit) + { + EmitVectorWidenTernaryOpByElem(context, emit, signed: true); + } + + public static void EmitVectorWidenTernaryOpByElemZx(ArmEmitterContext context, Func3I emit) + { + EmitVectorWidenTernaryOpByElem(context, emit, signed: false); + } + + private static void EmitVectorWidenTernaryOpByElem(ArmEmitterContext context, Func3I emit, bool signed) + { + OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp; + + Operand res = context.VectorZero(); + + Operand me = EmitVectorExtract(context, op.Rm, op.Index, op.Size, signed); + + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed); + Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size + 1); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitVectorPairwiseOpSx(ArmEmitterContext context, Func2I emit) + { + EmitVectorPairwiseOp(context, emit, signed: true); + } + + public static void EmitVectorPairwiseOpZx(ArmEmitterContext context, Func2I emit) + { + EmitVectorPairwiseOp(context, emit, signed: false); + } + + private static void EmitVectorPairwiseOp(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int pairs = op.GetPairsCount() >> op.Size; + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + + Operand n0 = EmitVectorExtract(context, op.Rn, pairIndex, op.Size, signed); + Operand n1 = EmitVectorExtract(context, op.Rn, pairIndex + 1, op.Size, signed); + + Operand m0 = EmitVectorExtract(context, op.Rm, pairIndex, op.Size, signed); + Operand m1 = EmitVectorExtract(context, op.Rm, pairIndex + 1, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(n0, n1), index, op.Size); + res = EmitVectorInsert(context, res, emit(m0, m1), pairs + index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitSsse3VectorPairwiseOp(ArmEmitterContext context, Intrinsic[] inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd64) + { + Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]); + Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks [op.Size]); + + Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n + + Operand left = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n + Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask); // 0:odd from m:n + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst[op.Size], left, right)); + } + else if (op.Size < 3) + { + Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]); + + Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n + Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even 
from m + + Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM); + Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM); + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst[op.Size], left, right)); + } + else + { + Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); + Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m); + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst[3], left, right)); + } + } + + public static void EmitVectorAcrossVectorOpSx(ArmEmitterContext context, Func2I emit) + { + EmitVectorAcrossVectorOp(context, emit, signed: true, isLong: false); + } + + public static void EmitVectorAcrossVectorOpZx(ArmEmitterContext context, Func2I emit) + { + EmitVectorAcrossVectorOp(context, emit, signed: false, isLong: false); + } + + public static void EmitVectorLongAcrossVectorOpSx(ArmEmitterContext context, Func2I emit) + { + EmitVectorAcrossVectorOp(context, emit, signed: true, isLong: true); + } + + public static void EmitVectorLongAcrossVectorOpZx(ArmEmitterContext context, Func2I emit) + { + EmitVectorAcrossVectorOp(context, emit, signed: false, isLong: true); + } + + private static void EmitVectorAcrossVectorOp( + ArmEmitterContext context, + Func2I emit, + bool signed, + bool isLong) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int elems = op.GetBytesCount() >> op.Size; + + Operand res = EmitVectorExtract(context, op.Rn, 0, op.Size, signed); + + for (int index = 1; index < elems; index++) + { + Operand n = EmitVectorExtract(context, op.Rn, index, op.Size, signed); + + res = emit(res, n); + } + + int size = isLong ? op.Size + 1 : op.Size; + + Operand d = EmitVectorInsert(context, context.VectorZero(), res, 0, size); + + context.Copy(GetVec(op.Rd), d); + } + + public static void EmitVectorAcrossVectorOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Debug.Assert((op.Size & 1) == 0 && op.RegisterSize == RegisterSize.Simd128); + + Operand res = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0); + + for (int index = 1; index < 4; index++) + { + Operand n = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), index); + + res = emit(res, n); + } + + Operand d = context.VectorInsert(context.VectorZero(), res, 0); + + context.Copy(GetVec(op.Rd), d); + } + + public static void EmitSse2VectorAcrossVectorOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Debug.Assert((op.Size & 1) == 0 && op.RegisterSize == RegisterSize.Simd128); + + const int sm0 = 0 << 6 | 0 << 4 | 0 << 2 | 0 << 0; + const int sm1 = 1 << 6 | 1 << 4 | 1 << 2 | 1 << 0; + const int sm2 = 2 << 6 | 2 << 4 | 2 << 2 | 2 << 0; + const int sm3 = 3 << 6 | 3 << 4 | 3 << 2 | 3 << 0; + + Operand nCopy = context.Copy(GetVec(op.Rn)); + + Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm0)); + Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm1)); + Operand part2 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm2)); + Operand part3 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm3)); + + Operand res = emit(emit(part0, part1), emit(part2, part3)); + + context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); + } + + public static void EmitScalarPairwiseOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? 
OperandType.FP64 : OperandType.FP32; + + Operand ne0 = context.VectorExtract(type, GetVec(op.Rn), 0); + Operand ne1 = context.VectorExtract(type, GetVec(op.Rn), 1); + + Operand res = context.VectorInsert(context.VectorZero(), emit(ne0, ne1), 0); + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitSse2ScalarPairwiseOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand op0, op1; + + if ((op.Size & 1) == 0) + { + const int sm0 = 2 << 6 | 2 << 4 | 2 << 2 | 0 << 0; + const int sm1 = 2 << 6 | 2 << 4 | 2 << 2 | 1 << 0; + + Operand zeroN = context.VectorZeroUpper64(n); + + op0 = context.AddIntrinsic(Intrinsic.X86Pshufd, zeroN, Const(sm0)); + op1 = context.AddIntrinsic(Intrinsic.X86Pshufd, zeroN, Const(sm1)); + } + else /* if ((op.Size & 1) == 1) */ + { + Operand zero = context.VectorZero(); + + op0 = context.AddIntrinsic(Intrinsic.X86Movlhps, n, zero); + op1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, n); + } + + context.Copy(GetVec(op.Rd), emit(op0, op1)); + } + + public static void EmitVectorPairwiseOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int pairs = op.GetPairsCount() >> sizeF + 2; + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + + Operand n0 = context.VectorExtract(type, GetVec(op.Rn), pairIndex); + Operand n1 = context.VectorExtract(type, GetVec(op.Rn), pairIndex + 1); + + Operand m0 = context.VectorExtract(type, GetVec(op.Rm), pairIndex); + Operand m1 = context.VectorExtract(type, GetVec(op.Rm), pairIndex + 1); + + res = context.VectorInsert(res, emit(n0, n1), index); + res = context.VectorInsert(res, emit(m0, m1), pairs + index); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitSse2VectorPairwiseOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand nCopy = context.Copy(GetVec(op.Rn)); + Operand mCopy = context.Copy(GetVec(op.Rm)); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + if (op.RegisterSize == RegisterSize.Simd64) + { + Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, nCopy, mCopy); + + Operand zero = context.VectorZero(); + + Operand part0 = context.AddIntrinsic(Intrinsic.X86Movlhps, unpck, zero); + Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, unpck); + + context.Copy(GetVec(op.Rd), emit(part0, part1)); + } + else /* if (op.RegisterSize == RegisterSize.Simd128) */ + { + const int sm0 = 2 << 6 | 0 << 4 | 2 << 2 | 0 << 0; + const int sm1 = 3 << 6 | 1 << 4 | 3 << 2 | 1 << 0; + + Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, mCopy, Const(sm0)); + Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, mCopy, Const(sm1)); + + context.Copy(GetVec(op.Rd), emit(part0, part1)); + } + } + else /* if (sizeF == 1) */ + { + Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, nCopy, mCopy); + Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, nCopy, mCopy); + + context.Copy(GetVec(op.Rd), emit(part0, part1)); + } + } + + public enum CmpCondition + { + // Legacy Sse. + Equal = 0, // Ordered, non-signaling. + LessThan = 1, // Ordered, signaling. + LessThanOrEqual = 2, // Ordered, signaling. + UnorderedQ = 3, // Non-signaling. + NotLessThan = 5, // Unordered, signaling. 
+ NotLessThanOrEqual = 6, // Unordered, signaling. + OrderedQ = 7, // Non-signaling. + + // Vex. + GreaterThanOrEqual = 13, // Ordered, signaling. + GreaterThan = 14, // Ordered, signaling. + OrderedS = 23 // Signaling. + } + + [Flags] + public enum SaturatingFlags + { + None = 0, + + ByElem = 1 << 0, + Scalar = 1 << 1, + Signed = 1 << 2, + + Add = 1 << 3, + Sub = 1 << 4, + + Accumulate = 1 << 5 + } + + public static void EmitScalarSaturatingUnaryOpSx(ArmEmitterContext context, Func1I emit) + { + EmitSaturatingUnaryOpSx(context, emit, SaturatingFlags.Scalar | SaturatingFlags.Signed); + } + + public static void EmitVectorSaturatingUnaryOpSx(ArmEmitterContext context, Func1I emit) + { + EmitSaturatingUnaryOpSx(context, emit, SaturatingFlags.Signed); + } + + public static void EmitSaturatingUnaryOpSx(ArmEmitterContext context, Func1I emit, SaturatingFlags flags) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + bool scalar = (flags & SaturatingFlags.Scalar) != 0; + + int elems = !scalar ? op.GetBytesCount() >> op.Size : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size); + Operand de; + + if (op.Size <= 2) + { + de = EmitSignedSrcSatQ(context, emit(ne), op.Size, signedDst: true); + } + else /* if (op.Size == 3) */ + { + de = EmitUnarySignedSatQAbsOrNeg(context, emit(ne)); + } + + res = EmitVectorInsert(context, res, de, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitScalarSaturatingBinaryOpSx(ArmEmitterContext context, Func2I emit = null, SaturatingFlags flags = SaturatingFlags.None) + { + EmitSaturatingBinaryOp(context, emit, SaturatingFlags.Scalar | SaturatingFlags.Signed | flags); + } + + public static void EmitScalarSaturatingBinaryOpZx(ArmEmitterContext context, SaturatingFlags flags) + { + EmitSaturatingBinaryOp(context, null, SaturatingFlags.Scalar | flags); + } + + public static void EmitVectorSaturatingBinaryOpSx(ArmEmitterContext context, Func2I emit = null, SaturatingFlags flags = SaturatingFlags.None) + { + EmitSaturatingBinaryOp(context, emit, SaturatingFlags.Signed | flags); + } + + public static void EmitVectorSaturatingBinaryOpZx(ArmEmitterContext context, SaturatingFlags flags) + { + EmitSaturatingBinaryOp(context, null, flags); + } + + public static void EmitVectorSaturatingBinaryOpByElemSx(ArmEmitterContext context, Func2I emit) + { + EmitSaturatingBinaryOp(context, emit, SaturatingFlags.ByElem | SaturatingFlags.Signed); + } + + public static void EmitSaturatingBinaryOp(ArmEmitterContext context, Func2I emit, SaturatingFlags flags) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + bool byElem = (flags & SaturatingFlags.ByElem) != 0; + bool scalar = (flags & SaturatingFlags.Scalar) != 0; + bool signed = (flags & SaturatingFlags.Signed) != 0; + + bool add = (flags & SaturatingFlags.Add) != 0; + bool sub = (flags & SaturatingFlags.Sub) != 0; + + bool accumulate = (flags & SaturatingFlags.Accumulate) != 0; + + int elems = !scalar ? op.GetBytesCount() >> op.Size : 1; + + if (add || sub) + { + for (int index = 0; index < elems; index++) + { + Operand de; + Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size, signed); + Operand me = EmitVectorExtract(context, ((OpCodeSimdReg)op).Rm, index, op.Size, signed); + + if (op.Size <= 2) + { + Operand temp = add ? 
context.Add(ne, me) : context.Subtract(ne, me); + + de = EmitSignedSrcSatQ(context, temp, op.Size, signedDst: signed); + } + else /* if (op.Size == 3) */ + { + if (add) + { + de = signed ? EmitBinarySignedSatQAdd(context, ne, me) : EmitBinaryUnsignedSatQAdd(context, ne, me); + } + else /* if (sub) */ + { + de = signed ? EmitBinarySignedSatQSub(context, ne, me) : EmitBinaryUnsignedSatQSub(context, ne, me); + } + } + + res = EmitVectorInsert(context, res, de, index, op.Size); + } + } + else if (accumulate) + { + for (int index = 0; index < elems; index++) + { + Operand de; + Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size, !signed); + Operand me = EmitVectorExtract(context, op.Rd, index, op.Size, signed); + + if (op.Size <= 2) + { + Operand temp = context.Add(ne, me); + + de = EmitSignedSrcSatQ(context, temp, op.Size, signedDst: signed); + } + else /* if (op.Size == 3) */ + { + de = signed ? EmitBinarySignedSatQAcc(context, ne, me) : EmitBinaryUnsignedSatQAcc(context, ne, me); + } + + res = EmitVectorInsert(context, res, de, index, op.Size); + } + } + else + { + Operand me = default; + + if (byElem) + { + OpCodeSimdRegElem opRegElem = (OpCodeSimdRegElem)op; + + me = EmitVectorExtract(context, opRegElem.Rm, opRegElem.Index, op.Size, signed); + } + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size, signed); + + if (!byElem) + { + me = EmitVectorExtract(context, ((OpCodeSimdReg)op).Rm, index, op.Size, signed); + } + + Operand de = EmitSignedSrcSatQ(context, emit(ne, me), op.Size, signedDst: signed); + + res = EmitVectorInsert(context, res, de, index, op.Size); + } + } + + context.Copy(GetVec(op.Rd), res); + } + + [Flags] + public enum SaturatingNarrowFlags + { + Scalar = 1 << 0, + SignedSrc = 1 << 1, + SignedDst = 1 << 2, + + ScalarSxSx = Scalar | SignedSrc | SignedDst, + ScalarSxZx = Scalar | SignedSrc, + ScalarZxZx = Scalar, + + VectorSxSx = SignedSrc | SignedDst, + VectorSxZx = SignedSrc, + VectorZxZx = 0 + } + + public static void EmitSaturatingNarrowOp(ArmEmitterContext context, SaturatingNarrowFlags flags) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + bool scalar = (flags & SaturatingNarrowFlags.Scalar) != 0; + bool signedSrc = (flags & SaturatingNarrowFlags.SignedSrc) != 0; + bool signedDst = (flags & SaturatingNarrowFlags.SignedDst) != 0; + + int elems = !scalar ? 8 >> op.Size : 1; + + int part = !scalar && (op.RegisterSize == RegisterSize.Simd128) ? elems : 0; + + Operand d = GetVec(op.Rd); + + Operand res = part == 0 ? context.VectorZero() : context.Copy(d); + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size + 1, signedSrc); + + Operand temp = signedSrc + ? 
EmitSignedSrcSatQ(context, ne, op.Size, signedDst) + : EmitUnsignedSrcSatQ(context, ne, op.Size, signedDst); + + res = EmitVectorInsert(context, res, temp, part + index, op.Size); + } + + context.Copy(d, res); + } + + // long SignedSignSatQ(long op, int size); + public static Operand EmitSignedSignSatQ(ArmEmitterContext context, Operand op, int size) + { + int eSize = 8 << size; + + Debug.Assert(op.Type == OperandType.I64); + Debug.Assert(eSize == 8 || eSize == 16 || eSize == 32 || eSize == 64); + + Operand lbl1 = Label(); + Operand lblEnd = Label(); + + Operand zeroL = Const(0L); + Operand maxT = Const((1L << (eSize - 1)) - 1L); + Operand minT = Const(-(1L << (eSize - 1))); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), zeroL); + + context.BranchIf(lbl1, op, zeroL, Comparison.LessOrEqual); + context.Copy(res, maxT); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lbl1); + context.BranchIf(lblEnd, op, zeroL, Comparison.GreaterOrEqual); + context.Copy(res, minT); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // private static ulong UnsignedSignSatQ(ulong op, int size); + public static Operand EmitUnsignedSignSatQ(ArmEmitterContext context, Operand op, int size) + { + int eSize = 8 << size; + + Debug.Assert(op.Type == OperandType.I64); + Debug.Assert(eSize == 8 || eSize == 16 || eSize == 32 || eSize == 64); + + Operand lblEnd = Label(); + + Operand zeroUL = Const(0UL); + Operand maxT = Const(ulong.MaxValue >> (64 - eSize)); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), zeroUL); + + context.BranchIf(lblEnd, op, zeroUL, Comparison.LessOrEqualUI); + context.Copy(res, maxT); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // TSrc (16bit, 32bit, 64bit; signed) > TDst (8bit, 16bit, 32bit; signed, unsigned). + // long SignedSrcSignedDstSatQ(long op, int size); ulong SignedSrcUnsignedDstSatQ(long op, int size); + public static Operand EmitSignedSrcSatQ(ArmEmitterContext context, Operand op, int sizeDst, bool signedDst) + { + int eSizeDst = 8 << sizeDst; + + Debug.Assert(op.Type == OperandType.I64); + Debug.Assert(eSizeDst == 8 || eSizeDst == 16 || eSizeDst == 32); + + Operand lbl1 = Label(); + Operand lblEnd = Label(); + + Operand maxT = signedDst ? Const((1L << (eSizeDst - 1)) - 1L) : Const((1UL << eSizeDst) - 1UL); + Operand minT = signedDst ? Const(-(1L << (eSizeDst - 1))) : Const(0UL); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), op); + + context.BranchIf(lbl1, op, maxT, Comparison.LessOrEqual); + context.Copy(res, maxT); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lbl1); + context.BranchIf(lblEnd, op, minT, Comparison.GreaterOrEqual); + context.Copy(res, minT); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // TSrc (16bit, 32bit, 64bit; unsigned) > TDst (8bit, 16bit, 32bit; signed, unsigned). 
+ // long UnsignedSrcSignedDstSatQ(ulong op, int size); ulong UnsignedSrcUnsignedDstSatQ(ulong op, int size); + public static Operand EmitUnsignedSrcSatQ(ArmEmitterContext context, Operand op, int sizeDst, bool signedDst) + { + int eSizeDst = 8 << sizeDst; + + Debug.Assert(op.Type == OperandType.I64); + Debug.Assert(eSizeDst == 8 || eSizeDst == 16 || eSizeDst == 32); + + Operand lblEnd = Label(); + + Operand maxT = signedDst ? Const((1L << (eSizeDst - 1)) - 1L) : Const((1UL << eSizeDst) - 1UL); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), op); + + context.BranchIf(lblEnd, op, maxT, Comparison.LessOrEqualUI); + context.Copy(res, maxT); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // long UnarySignedSatQAbsOrNeg(long op); + private static Operand EmitUnarySignedSatQAbsOrNeg(ArmEmitterContext context, Operand op) + { + Debug.Assert(op.Type == OperandType.I64); + + Operand lblEnd = Label(); + + Operand minL = Const(long.MinValue); + Operand maxL = Const(long.MaxValue); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), op); + + context.BranchIf(lblEnd, op, minL, Comparison.NotEqual); + context.Copy(res, maxL); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // long BinarySignedSatQAdd(long op1, long op2); + public static Operand EmitBinarySignedSatQAdd(ArmEmitterContext context, Operand op1, Operand op2) + { + Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64); + + Operand lblEnd = Label(); + + Operand minL = Const(long.MinValue); + Operand maxL = Const(long.MaxValue); + Operand zeroL = Const(0L); + + Operand add = context.Add(op1, op2); + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), add); + + Operand left = context.BitwiseNot(context.BitwiseExclusiveOr(op1, op2)); + Operand right = context.BitwiseExclusiveOr(op1, add); + context.BranchIf(lblEnd, context.BitwiseAnd(left, right), zeroL, Comparison.GreaterOrEqual); + + Operand isPositive = context.ICompareGreaterOrEqual(op1, zeroL); + context.Copy(res, context.ConditionalSelect(isPositive, maxL, minL)); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // ulong BinaryUnsignedSatQAdd(ulong op1, ulong op2); + public static Operand EmitBinaryUnsignedSatQAdd(ArmEmitterContext context, Operand op1, Operand op2) + { + Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64); + + Operand lblEnd = Label(); + + Operand maxUL = Const(ulong.MaxValue); + + Operand add = context.Add(op1, op2); + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), add); + + context.BranchIf(lblEnd, add, op1, Comparison.GreaterOrEqualUI); + context.Copy(res, maxUL); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // long BinarySignedSatQSub(long op1, long op2); + public static Operand EmitBinarySignedSatQSub(ArmEmitterContext context, Operand op1, Operand op2) + { + Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64); + + Operand lblEnd = Label(); + + Operand minL = Const(long.MinValue); + Operand maxL = Const(long.MaxValue); + Operand zeroL = Const(0L); + + Operand sub = context.Subtract(op1, op2); + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), sub); + + Operand left = 
context.BitwiseExclusiveOr(op1, op2); + Operand right = context.BitwiseExclusiveOr(op1, sub); + context.BranchIf(lblEnd, context.BitwiseAnd(left, right), zeroL, Comparison.GreaterOrEqual); + + Operand isPositive = context.ICompareGreaterOrEqual(op1, zeroL); + context.Copy(res, context.ConditionalSelect(isPositive, maxL, minL)); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // ulong BinaryUnsignedSatQSub(ulong op1, ulong op2); + public static Operand EmitBinaryUnsignedSatQSub(ArmEmitterContext context, Operand op1, Operand op2) + { + Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64); + + Operand lblEnd = Label(); + + Operand zeroL = Const(0L); + + Operand sub = context.Subtract(op1, op2); + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), sub); + + context.BranchIf(lblEnd, op1, op2, Comparison.GreaterOrEqualUI); + context.Copy(res, zeroL); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // long BinarySignedSatQAcc(ulong op1, long op2); + private static Operand EmitBinarySignedSatQAcc(ArmEmitterContext context, Operand op1, Operand op2) + { + Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64); + + Operand lbl1 = Label(); + Operand lbl2 = Label(); + Operand lblEnd = Label(); + + Operand maxL = Const(long.MaxValue); + Operand zeroL = Const(0L); + + Operand add = context.Add(op1, op2); + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), add); + + context.BranchIf(lbl1, op1, maxL, Comparison.GreaterUI); + Operand notOp2AndRes = context.BitwiseAnd(context.BitwiseNot(op2), add); + context.BranchIf(lblEnd, notOp2AndRes, zeroL, Comparison.GreaterOrEqual); + context.Copy(res, maxL); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lbl1); + context.BranchIf(lbl2, op2, zeroL, Comparison.Less); + context.Copy(res, maxL); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lbl2); + context.BranchIf(lblEnd, add, maxL, Comparison.LessOrEqualUI); + context.Copy(res, maxL); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // ulong BinaryUnsignedSatQAcc(long op1, ulong op2); + private static Operand EmitBinaryUnsignedSatQAcc(ArmEmitterContext context, Operand op1, Operand op2) + { + Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64); + + Operand lbl1 = Label(); + Operand lblEnd = Label(); + + Operand maxUL = Const(ulong.MaxValue); + Operand maxL = Const(long.MaxValue); + Operand zeroL = Const(0L); + + Operand add = context.Add(op1, op2); + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), add); + + context.BranchIf(lbl1, op1, zeroL, Comparison.Less); + context.BranchIf(lblEnd, add, op1, Comparison.GreaterOrEqualUI); + context.Copy(res, maxUL); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lbl1); + context.BranchIf(lblEnd, op2, maxL, Comparison.GreaterUI); + context.BranchIf(lblEnd, add, zeroL, Comparison.GreaterOrEqual); + context.Copy(res, zeroL); + SetFpFlag(context, FPState.QcFlag, Const(1)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + public static Operand EmitFloatAbs(ArmEmitterContext context, Operand value, bool single, bool vector) + { + Operand mask; + if (single) + { + 
mask = vector ? X86GetAllElements(context, -0f) : X86GetScalar(context, -0f); + } + else + { + mask = vector ? X86GetAllElements(context, -0d) : X86GetScalar(context, -0d); + } + + return context.AddIntrinsic(single ? Intrinsic.X86Andnps : Intrinsic.X86Andnpd, mask, value); + } + + public static Operand EmitVectorExtractSx(ArmEmitterContext context, int reg, int index, int size) + { + return EmitVectorExtract(context, reg, index, size, true); + } + + public static Operand EmitVectorExtractZx(ArmEmitterContext context, int reg, int index, int size) + { + return EmitVectorExtract(context, reg, index, size, false); + } + + public static Operand EmitVectorExtract(ArmEmitterContext context, int reg, int index, int size, bool signed) + { + ThrowIfInvalid(index, size); + + Operand res = default; + + switch (size) + { + case 0: + res = context.VectorExtract8(GetVec(reg), index); + break; + + case 1: + res = context.VectorExtract16(GetVec(reg), index); + break; + + case 2: + res = context.VectorExtract(OperandType.I32, GetVec(reg), index); + break; + + case 3: + res = context.VectorExtract(OperandType.I64, GetVec(reg), index); + break; + } + + if (signed) + { + switch (size) + { + case 0: res = context.SignExtend8 (OperandType.I64, res); break; + case 1: res = context.SignExtend16(OperandType.I64, res); break; + case 2: res = context.SignExtend32(OperandType.I64, res); break; + } + } + else + { + switch (size) + { + case 0: res = context.ZeroExtend8 (OperandType.I64, res); break; + case 1: res = context.ZeroExtend16(OperandType.I64, res); break; + case 2: res = context.ZeroExtend32(OperandType.I64, res); break; + } + } + + return res; + } + + public static Operand EmitVectorInsert(ArmEmitterContext context, Operand vector, Operand value, int index, int size) + { + ThrowIfInvalid(index, size); + + if (size < 3 && value.Type == OperandType.I64) + { + value = context.ConvertI64ToI32(value); + } + + switch (size) + { + case 0: vector = context.VectorInsert8 (vector, value, index); break; + case 1: vector = context.VectorInsert16(vector, value, index); break; + case 2: vector = context.VectorInsert (vector, value, index); break; + case 3: vector = context.VectorInsert (vector, value, index); break; + } + + return vector; + } + + public static void ThrowIfInvalid(int index, int size) + { + if ((uint)size > 3u) + { + throw new ArgumentOutOfRangeException(nameof(size)); + } + + if ((uint)index >= 16u >> size) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/src/ARMeilleure/Instructions/InstEmitSimdHelper32.cs new file mode 100644 index 00000000..36d27d42 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdHelper32.cs @@ -0,0 +1,1286 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + using Func1I = Func<Operand, Operand>; + using Func2I = Func<Operand, Operand, Operand>; + using Func3I = Func<Operand, Operand, Operand, Operand>; + + static class InstEmitSimdHelper32 + { + public static (int, int) GetQuadwordAndSubindex(int index, RegisterSize size) + { + switch (size) + { + case RegisterSize.Simd128: + return (index >> 1, 0); + case RegisterSize.Simd64: + 
case RegisterSize.Int64: + return (index >> 1, index & 1); + case RegisterSize.Int32: + return (index >> 2, index & 3); + } + + throw new ArgumentException("Unrecognized Vector Register Size."); + } + + public static Operand ExtractScalar(ArmEmitterContext context, OperandType type, int reg) + { + Debug.Assert(type != OperandType.V128); + + if (type == OperandType.FP64 || type == OperandType.I64) + { + // From dreg. + return context.VectorExtract(type, GetVecA32(reg >> 1), reg & 1); + } + else + { + // From sreg. + return context.VectorExtract(type, GetVecA32(reg >> 2), reg & 3); + } + } + + public static void InsertScalar(ArmEmitterContext context, int reg, Operand value) + { + Debug.Assert(value.Type != OperandType.V128); + + Operand vec, insert; + if (value.Type == OperandType.FP64 || value.Type == OperandType.I64) + { + // From dreg. + vec = GetVecA32(reg >> 1); + insert = context.VectorInsert(vec, value, reg & 1); + } + else + { + // From sreg. + vec = GetVecA32(reg >> 2); + insert = context.VectorInsert(vec, value, reg & 3); + } + + context.Copy(vec, insert); + } + + public static Operand ExtractScalar16(ArmEmitterContext context, int reg, bool top) + { + return context.VectorExtract16(GetVecA32(reg >> 2), ((reg & 3) << 1) | (top ? 1 : 0)); + } + + public static void InsertScalar16(ArmEmitterContext context, int reg, bool top, Operand value) + { + Debug.Assert(value.Type == OperandType.FP32 || value.Type == OperandType.I32); + + Operand vec, insert; + vec = GetVecA32(reg >> 2); + insert = context.VectorInsert16(vec, value, ((reg & 3) << 1) | (top ? 1 : 0)); + + context.Copy(vec, insert); + } + + public static Operand ExtractElement(ArmEmitterContext context, int reg, int size, bool signed) + { + return EmitVectorExtract32(context, reg >> (4 - size), reg & ((16 >> size) - 1), size, signed); + } + + public static void EmitVectorImmUnaryOp32(ArmEmitterContext context, Func1I emit) + { + IOpCode32SimdImm op = (IOpCode32SimdImm)context.CurrOp; + + Operand imm = Const(op.Immediate); + + int elems = op.Elems; + (int index, int subIndex) = GetQuadwordAndSubindex(op.Vd, op.RegisterSize); + + Operand vec = GetVecA32(index); + Operand res = vec; + + for (int item = 0; item < elems; item++) + { + res = EmitVectorInsert(context, res, emit(imm), item + subIndex * elems, op.Size); + } + + context.Copy(vec, res); + } + + public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Func1I emit) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand m = ExtractScalar(context, type, op.Vm); + + InsertScalar(context, op.Vd, emit(m)); + } + + public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Func2I emit) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand n = ExtractScalar(context, type, op.Vn); + Operand m = ExtractScalar(context, type, op.Vm); + + InsertScalar(context, op.Vd, emit(n, m)); + } + + public static void EmitScalarBinaryOpI32(ArmEmitterContext context, Func2I emit) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? 
OperandType.I64 : OperandType.I32; + + if (op.Size < 2) + { + throw new NotSupportedException("Cannot perform a scalar SIMD operation on integers smaller than 32 bits."); + } + + Operand n = ExtractScalar(context, type, op.Vn); + Operand m = ExtractScalar(context, type, op.Vm); + + InsertScalar(context, op.Vd, emit(n, m)); + } + + public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Func3I emit) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand a = ExtractScalar(context, type, op.Vd); + Operand n = ExtractScalar(context, type, op.Vn); + Operand m = ExtractScalar(context, type, op.Vm); + + InsertScalar(context, op.Vd, emit(a, n, m)); + } + + public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Func1I emit) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + Operand res = GetVecA32(op.Qd); + + for (int index = 0; index < elems; index++) + { + Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index); + + res = context.VectorInsert(res, emit(me), op.Fd + index); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorBinaryOpF32(ArmEmitterContext context, Func2I emit) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> (sizeF + 2); + + Operand res = GetVecA32(op.Qd); + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index); + Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index); + + res = context.VectorInsert(res, emit(ne, me), op.Fd + index); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Func3I emit) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? 
OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + Operand res = GetVecA32(op.Qd); + + for (int index = 0; index < elems; index++) + { + Operand de = context.VectorExtract(type, GetVecA32(op.Qd), op.Fd + index); + Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index); + Operand me = context.VectorExtract(type, GetVecA32(op.Qm), op.Fm + index); + + res = context.VectorInsert(res, emit(de, ne, me), op.Fd + index); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + // Integer + + public static void EmitVectorUnaryAccumulateOpI32(ArmEmitterContext context, Func1I emit, bool signed) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size, signed); + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed); + + res = EmitVectorInsert(context, res, context.Add(de, emit(me)), op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorUnaryOpI32(ArmEmitterContext context, Func1I emit, bool signed) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(me), op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorBinaryOpI32(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed); + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(ne, me), op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorBinaryLongOpI32(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed); + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed); + + if (op.Size == 2) + { + ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne); + me = signed ? 
context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me); + } + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorBinaryWideOpI32(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size + 1, signed); + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed); + + if (op.Size == 2) + { + me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me); + } + + res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorImmBinaryQdQmOpZx32(ArmEmitterContext context, Func2I emit) + { + EmitVectorImmBinaryQdQmOpI32(context, emit, false); + } + + public static void EmitVectorImmBinaryQdQmOpSx32(ArmEmitterContext context, Func2I emit) + { + EmitVectorImmBinaryQdQmOpI32(context, emit, true); + } + + public static void EmitVectorImmBinaryQdQmOpI32(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size, signed); + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(de, me), op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorTernaryLongOpI32(ArmEmitterContext context, Func3I emit, bool signed) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size + 1, signed); + Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed); + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed); + + if (op.Size == 2) + { + ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne); + me = signed ? 
context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me); + } + + res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size + 1); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorTernaryOpI32(ArmEmitterContext context, Func3I emit, bool signed) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size, signed); + Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed); + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(de, ne, me), op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorUnaryOpSx32(ArmEmitterContext context, Func1I emit) + { + EmitVectorUnaryOpI32(context, emit, true); + } + + public static void EmitVectorUnaryOpSx32(ArmEmitterContext context, Func1I emit, bool accumulate) + { + if (accumulate) + { + EmitVectorUnaryAccumulateOpI32(context, emit, true); + } + else + { + EmitVectorUnaryOpI32(context, emit, true); + } + } + + public static void EmitVectorBinaryOpSx32(ArmEmitterContext context, Func2I emit) + { + EmitVectorBinaryOpI32(context, emit, true); + } + + public static void EmitVectorTernaryOpSx32(ArmEmitterContext context, Func3I emit) + { + EmitVectorTernaryOpI32(context, emit, true); + } + + public static void EmitVectorUnaryOpZx32(ArmEmitterContext context, Func1I emit) + { + EmitVectorUnaryOpI32(context, emit, false); + } + + public static void EmitVectorUnaryOpZx32(ArmEmitterContext context, Func1I emit, bool accumulate) + { + if (accumulate) + { + EmitVectorUnaryAccumulateOpI32(context, emit, false); + } + else + { + EmitVectorUnaryOpI32(context, emit, false); + } + } + + public static void EmitVectorBinaryOpZx32(ArmEmitterContext context, Func2I emit) + { + EmitVectorBinaryOpI32(context, emit, false); + } + + public static void EmitVectorTernaryOpZx32(ArmEmitterContext context, Func3I emit) + { + EmitVectorTernaryOpI32(context, emit, false); + } + + // Vector by scalar + + public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Func2I emit) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? 
OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + Operand m = ExtractScalar(context, type, op.Vm); + + Operand res = GetVecA32(op.Qd); + + for (int index = 0; index < elems; index++) + { + Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index); + + res = context.VectorInsert(res, emit(ne, m), op.Fd + index); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorByScalarOpI32(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + Operand m = ExtractElement(context, op.Vm, op.Size, signed); + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(ne, m), op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorByScalarLongOpI32(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + Operand m = ExtractElement(context, op.Vm, op.Size, signed); + + if (op.Size == 2) + { + m = signed ? context.SignExtend32(OperandType.I64, m) : context.ZeroExtend32(OperandType.I64, m); + } + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed); + + if (op.Size == 2) + { + ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne); + } + + res = EmitVectorInsert(context, res, emit(ne, m), index, op.Size + 1); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Func3I emit) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> sizeF + 2; + + Operand m = ExtractScalar(context, type, op.Vm); + + Operand res = GetVecA32(op.Qd); + + for (int index = 0; index < elems; index++) + { + Operand de = context.VectorExtract(type, GetVecA32(op.Qd), op.Fd + index); + Operand ne = context.VectorExtract(type, GetVecA32(op.Qn), op.Fn + index); + + res = context.VectorInsert(res, emit(de, ne, m), op.Fd + index); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorsByScalarOpI32(ArmEmitterContext context, Func3I emit, bool signed) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + Operand m = EmitVectorExtract32(context, op.Vm >> (4 - op.Size), op.Vm & ((1 << (4 - op.Size)) - 1), op.Size, signed); + + Operand res = GetVecA32(op.Qd); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand de = EmitVectorExtract32(context, op.Qd, op.Id + index, op.Size, signed); + Operand ne = EmitVectorExtract32(context, op.Qn, op.In + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(de, ne, m), op.Id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + // Pairwise + + public static void EmitVectorPairwiseOpF32(ArmEmitterContext context, Func2I emit) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + int sizeF = op.Size & 1; + + OperandType type = sizeF != 0 ? 
OperandType.FP64 : OperandType.FP32; + + int elems = op.GetBytesCount() >> (sizeF + 2); + int pairs = elems >> 1; + + Operand res = GetVecA32(op.Qd); + Operand mvec = GetVecA32(op.Qm); + Operand nvec = GetVecA32(op.Qn); + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + + Operand n1 = context.VectorExtract(type, nvec, op.Fn + pairIndex); + Operand n2 = context.VectorExtract(type, nvec, op.Fn + pairIndex + 1); + + res = context.VectorInsert(res, emit(n1, n2), op.Fd + index); + + Operand m1 = context.VectorExtract(type, mvec, op.Fm + pairIndex); + Operand m2 = context.VectorExtract(type, mvec, op.Fm + pairIndex + 1); + + res = context.VectorInsert(res, emit(m1, m2), op.Fd + index + pairs); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorPairwiseOpI32(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + int elems = op.GetBytesCount() >> op.Size; + int pairs = elems >> 1; + + Operand res = GetVecA32(op.Qd); + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + Operand n1 = EmitVectorExtract32(context, op.Qn, op.In + pairIndex, op.Size, signed); + Operand n2 = EmitVectorExtract32(context, op.Qn, op.In + pairIndex + 1, op.Size, signed); + + Operand m1 = EmitVectorExtract32(context, op.Qm, op.Im + pairIndex, op.Size, signed); + Operand m2 = EmitVectorExtract32(context, op.Qm, op.Im + pairIndex + 1, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(n1, n2), op.Id + index, op.Size); + res = EmitVectorInsert(context, res, emit(m1, m2), op.Id + index + pairs, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void EmitVectorPairwiseLongOpI32(ArmEmitterContext context, Func2I emit, bool signed) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + int elems = (op.Q ? 16 : 8) >> op.Size; + int pairs = elems >> 1; + int id = (op.Vd & 1) * pairs; + + Operand res = GetVecA32(op.Qd); + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + Operand m1 = EmitVectorExtract32(context, op.Qm, op.Im + pairIndex, op.Size, signed); + Operand m2 = EmitVectorExtract32(context, op.Qm, op.Im + pairIndex + 1, op.Size, signed); + + if (op.Size == 2) + { + m1 = signed ? context.SignExtend32(OperandType.I64, m1) : context.ZeroExtend32(OperandType.I64, m1); + m2 = signed ? context.SignExtend32(OperandType.I64, m2) : context.ZeroExtend32(OperandType.I64, m2); + } + + res = EmitVectorInsert(context, res, emit(m1, m2), id + index, op.Size + 1); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + // Narrow + + public static void EmitVectorUnaryNarrowOp32(ArmEmitterContext context, Func1I emit, bool signed = false) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + int elems = 8 >> op.Size; // Size contains the target element size. (for when it becomes a doubleword) + + Operand res = GetVecA32(op.Qd); + int id = (op.Vd & 1) << (3 - op.Size); // Target doubleword base. 
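// Editor's note, worked example of the indexing above: with op.Size == 1 (16-bit destination
// elements) the loop produces elems = 8 >> 1 = 4 results from 32-bit source elements, and
// id = (Vd & 1) << 2 is either 0 or 4, so the four narrowed values land in halfword lanes
// 0..3 or 4..7 of the destination Q register, i.e. in its low or high doubleword.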
+ + for (int index = 0; index < elems; index++) + { + Operand m = EmitVectorExtract32(context, op.Qm, index, op.Size + 1, signed); + + res = EmitVectorInsert(context, res, emit(m), id + index, op.Size); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + // Intrinsic Helpers + + public static Operand EmitMoveDoubleWordToSide(ArmEmitterContext context, Operand input, int originalV, int targetV) + { + Debug.Assert(input.Type == OperandType.V128); + + int originalSide = originalV & 1; + int targetSide = targetV & 1; + + if (originalSide == targetSide) + { + return input; + } + + if (targetSide == 1) + { + return context.AddIntrinsic(Intrinsic.X86Movlhps, input, input); // Low to high. + } + else + { + return context.AddIntrinsic(Intrinsic.X86Movhlps, input, input); // High to low. + } + } + + public static Operand EmitDoubleWordInsert(ArmEmitterContext context, Operand target, Operand value, int targetV) + { + Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128); + + int targetSide = targetV & 1; + int shuffleMask = 2; + + if (targetSide == 1) + { + return context.AddIntrinsic(Intrinsic.X86Shufpd, target, value, Const(shuffleMask)); + } + else + { + return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(shuffleMask)); + } + } + + public static Operand EmitScalarInsert(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth) + { + Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128); + + // Insert from index 0 in value to index in target. + int index = reg & (doubleWidth ? 1 : 3); + + if (doubleWidth) + { + if (index == 1) + { + return context.AddIntrinsic(Intrinsic.X86Movlhps, target, value); // Low to high. + } + else + { + return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(2)); // Low to low, keep high from original. + } + } + else + { + if (Optimizations.UseSse41) + { + return context.AddIntrinsic(Intrinsic.X86Insertps, target, value, Const(index << 4)); + } + else + { + target = EmitSwapScalar(context, target, index, doubleWidth); // Swap value to replace into element 0. + target = context.AddIntrinsic(Intrinsic.X86Movss, target, value); // Move the value into element 0 of the vector. + return EmitSwapScalar(context, target, index, doubleWidth); // Swap new value back to the correct index. + } + } + } + + public static Operand EmitSwapScalar(ArmEmitterContext context, Operand target, int reg, bool doubleWidth) + { + // Index into 0, 0 into index. This swap happens at the start of an A32 scalar op if required. + int index = reg & (doubleWidth ? 1 : 3); + if (index == 0) return target; + + if (doubleWidth) + { + int shuffleMask = 1; // Swap top and bottom. (b0 = 1, b1 = 0) + return context.AddIntrinsic(Intrinsic.X86Shufpd, target, target, Const(shuffleMask)); + } + else + { + int shuffleMask = (3 << 6) | (2 << 4) | (1 << 2) | index; // Swap index and 0. (others remain) + shuffleMask &= ~(3 << (index * 2)); + + return context.AddIntrinsic(Intrinsic.X86Shufps, target, target, Const(shuffleMask)); + } + } + + // Vector Operand Templates + + public static void EmitVectorUnaryOpSimd32(ArmEmitterContext context, Func1I vectorFunc) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand m = GetVecA32(op.Qm); + Operand d = GetVecA32(op.Qd); + + if (!op.Q) // Register swap: move relevant doubleword to destination side. + { + m = EmitMoveDoubleWordToSide(context, m, op.Vm, op.Vd); + } + + Operand res = vectorFunc(m); + + if (!op.Q) // Register insert. 
+ { + res = EmitDoubleWordInsert(context, d, res, op.Vd); + } + + context.Copy(d, res); + } + + public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m)); + } + + public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc, int side = -1) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand n = GetVecA32(op.Qn); + Operand m = GetVecA32(op.Qm); + Operand d = GetVecA32(op.Qd); + + if (side == -1) + { + side = op.Vd; + } + + if (!op.Q) // Register swap: move relevant doubleword to destination side. + { + n = EmitMoveDoubleWordToSide(context, n, op.Vn, side); + m = EmitMoveDoubleWordToSide(context, m, op.Vm, side); + } + + Operand res = vectorFunc(n, m); + + if (!op.Q) // Register insert. + { + if (side != op.Vd) + { + res = EmitMoveDoubleWordToSide(context, res, side, op.Vd); + } + res = EmitDoubleWordInsert(context, d, res, op.Vd); + } + + context.Copy(d, res); + } + + public static void EmitVectorBinaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m)); + } + + public static void EmitVectorTernaryOpSimd32(ArmEmitterContext context, Func3I vectorFunc) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand n = GetVecA32(op.Qn); + Operand m = GetVecA32(op.Qm); + Operand d = GetVecA32(op.Qd); + Operand initialD = d; + + if (!op.Q) // Register swap: move relevant doubleword to destination side. + { + n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd); + m = EmitMoveDoubleWordToSide(context, m, op.Vm, op.Vd); + } + + Operand res = vectorFunc(d, n, m); + + if (!op.Q) // Register insert. + { + res = EmitDoubleWordInsert(context, initialD, res, op.Vd); + } + + context.Copy(initialD, res); + } + + public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1; + Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2; + + EmitVectorTernaryOpSimd32(context, (d, n, m) => + { + Operand res = context.AddIntrinsic(inst1, n, m); + return res = context.AddIntrinsic(inst2, d, res); + }); + } + + public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Debug.Assert((op.Size & 1) == 0); + + EmitVectorTernaryOpSimd32(context, (d, n, m) => + { + return context.AddIntrinsic(inst32, d, n, m); + }); + } + + public static void EmitScalarUnaryOpSimd32(ArmEmitterContext context, Func1I scalarFunc) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 1 : 2; + Operand m = GetVecA32(op.Vm >> shift); + Operand d = GetVecA32(op.Vd >> shift); + + m = EmitSwapScalar(context, m, op.Vm, doubleSize); + + Operand res = scalarFunc(m); + + // Insert scalar into vector. 
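// Editor's note on the A32 scalar pattern used by these helpers: the selected S/D lane was
// moved to element 0 by EmitSwapScalar above, the operation runs on element 0, and
// EmitScalarInsert below writes element 0 of the result into the destination lane selected
// by Vd (Insertps on SSE4.1, otherwise a swap/Movss/swap sequence), leaving the remaining
// lanes of the destination vector unchanged.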
+ res = EmitScalarInsert(context, d, res, op.Vd, doubleSize); + + context.Copy(d, res); + } + + public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + EmitScalarUnaryOpSimd32(context, (m) => (inst == 0) ? m : context.AddIntrinsic(inst, m)); + } + + public static void EmitScalarBinaryOpSimd32(ArmEmitterContext context, Func2I scalarFunc) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 1 : 2; + Operand n = GetVecA32(op.Vn >> shift); + Operand m = GetVecA32(op.Vm >> shift); + Operand d = GetVecA32(op.Vd >> shift); + + n = EmitSwapScalar(context, n, op.Vn, doubleSize); + m = EmitSwapScalar(context, m, op.Vm, doubleSize); + + Operand res = scalarFunc(n, m); + + // Insert scalar into vector. + res = EmitScalarInsert(context, d, res, op.Vd, doubleSize); + + context.Copy(d, res); + } + + public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + EmitScalarBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m)); + } + + public static void EmitScalarTernaryOpSimd32(ArmEmitterContext context, Func3I scalarFunc) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 1 : 2; + Operand n = GetVecA32(op.Vn >> shift); + Operand m = GetVecA32(op.Vm >> shift); + Operand d = GetVecA32(op.Vd >> shift); + Operand initialD = d; + + n = EmitSwapScalar(context, n, op.Vn, doubleSize); + m = EmitSwapScalar(context, m, op.Vm, doubleSize); + d = EmitSwapScalar(context, d, op.Vd, doubleSize); + + Operand res = scalarFunc(d, n, m); + + // Insert scalar into vector. + res = EmitScalarInsert(context, initialD, res, op.Vd, doubleSize); + + context.Copy(initialD, res); + } + + public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + + Intrinsic inst = doubleSize ? inst64 : inst32; + + EmitScalarTernaryOpSimd32(context, (d, n, m) => + { + return context.AddIntrinsic(inst, d, n, m); + }); + } + + public static void EmitScalarTernaryOpF32( + ArmEmitterContext context, + Intrinsic inst32pt1, + Intrinsic inst64pt1, + Intrinsic inst32pt2, + Intrinsic inst64pt2, + bool isNegD = false) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + + Intrinsic inst1 = doubleSize ? inst64pt1 : inst32pt1; + Intrinsic inst2 = doubleSize ? inst64pt2 : inst32pt2; + + EmitScalarTernaryOpSimd32(context, (d, n, m) => + { + Operand res = context.AddIntrinsic(inst1, n, m); + + if (isNegD) + { + Operand mask = doubleSize + ? X86GetScalar(context, -0d) + : X86GetScalar(context, -0f); + + d = doubleSize + ? 
context.AddIntrinsic(Intrinsic.X86Xorpd, mask, d) + : context.AddIntrinsic(Intrinsic.X86Xorps, mask, d); + } + + return context.AddIntrinsic(inst2, d, res); + }); + } + + // By Scalar + + public static void EmitVectorByScalarOpSimd32(ArmEmitterContext context, Func2I vectorFunc) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + Operand n = GetVecA32(op.Qn); + Operand d = GetVecA32(op.Qd); + + int index = op.Vm & 3; + int dupeMask = (index << 6) | (index << 4) | (index << 2) | index; + Operand m = GetVecA32(op.Vm >> 2); + m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask)); + + if (!op.Q) // Register swap: move relevant doubleword to destination side. + { + n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd); + } + + Operand res = vectorFunc(n, m); + + if (!op.Q) // Register insert. + { + res = EmitDoubleWordInsert(context, d, res, op.Vd); + } + + context.Copy(d, res); + } + + public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + EmitVectorByScalarOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m)); + } + + public static void EmitVectorsByScalarOpSimd32(ArmEmitterContext context, Func3I vectorFunc) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + Operand n = GetVecA32(op.Qn); + Operand d = GetVecA32(op.Qd); + Operand initialD = d; + + int index = op.Vm & 3; + int dupeMask = (index << 6) | (index << 4) | (index << 2) | index; + Operand m = GetVecA32(op.Vm >> 2); + m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask)); + + if (!op.Q) // Register swap: move relevant doubleword to destination side. + { + n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd); + } + + Operand res = vectorFunc(d, n, m); + + if (!op.Q) // Register insert. + { + res = EmitDoubleWordInsert(context, initialD, res, op.Vd); + } + + context.Copy(initialD, res); + } + + public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2) + { + OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp; + + Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1; + Intrinsic inst2 = (op.Size & 1) != 0 ? 
inst64pt2 : inst32pt2; + + EmitVectorsByScalarOpSimd32(context, (d, n, m) => + { + Operand res = context.AddIntrinsic(inst1, n, m); + return res = context.AddIntrinsic(inst2, d, res); + }); + } + + // Pairwise + + public static void EmitSse2VectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitVectorBinaryOpSimd32(context, (n, m) => + { + Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m); + + Operand part0 = unpck; + Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, unpck, unpck); + + return context.AddIntrinsic(inst32, part0, part1); + }, 0); + } + + public static void EmitSsse3VectorPairwiseOp32(ArmEmitterContext context, Intrinsic[] inst) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + EmitVectorBinaryOpSimd32(context, (n, m) => + { + if (op.RegisterSize == RegisterSize.Simd64) + { + Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]); + Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks[op.Size]); + + Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n + + Operand left = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n + Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask); // 0:odd from m:n + + return context.AddIntrinsic(inst[op.Size], left, right); + } + else if (op.Size < 3) + { + Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]); + + Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n + Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even from m + + Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM); + Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM); + + return context.AddIntrinsic(inst[op.Size], left, right); + } + else + { + Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); + Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m); + + return context.AddIntrinsic(inst[3], left, right); + } + }, 0); + } + + // Generic Functions + + public static Operand EmitSoftFloatCallDefaultFpscr(ArmEmitterContext context, string name, params Operand[] callArgs) + { + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + + MethodInfo info = (op.Size & 1) == 0 + ? 
typeof(SoftFloat32).GetMethod(name) + : typeof(SoftFloat64).GetMethod(name); + + Array.Resize(ref callArgs, callArgs.Length + 1); + callArgs[callArgs.Length - 1] = Const(1); + + context.ExitArmFpMode(); + context.StoreToContext(); + Operand res = context.Call(info, callArgs); + context.LoadFromContext(); + context.EnterArmFpMode(); + + return res; + } + + public static Operand EmitVectorExtractSx32(ArmEmitterContext context, int reg, int index, int size) + { + return EmitVectorExtract32(context, reg, index, size, true); + } + + public static Operand EmitVectorExtractZx32(ArmEmitterContext context, int reg, int index, int size) + { + return EmitVectorExtract32(context, reg, index, size, false); + } + + public static Operand EmitVectorExtract32(ArmEmitterContext context, int reg, int index, int size, bool signed) + { + ThrowIfInvalid(index, size); + + Operand res = default; + + switch (size) + { + case 0: + res = context.VectorExtract8(GetVec(reg), index); + break; + + case 1: + res = context.VectorExtract16(GetVec(reg), index); + break; + + case 2: + res = context.VectorExtract(OperandType.I32, GetVec(reg), index); + break; + + case 3: + res = context.VectorExtract(OperandType.I64, GetVec(reg), index); + break; + } + + if (signed) + { + switch (size) + { + case 0: res = context.SignExtend8(OperandType.I32, res); break; + case 1: res = context.SignExtend16(OperandType.I32, res); break; + } + } + else + { + switch (size) + { + case 0: res = context.ZeroExtend8(OperandType.I32, res); break; + case 1: res = context.ZeroExtend16(OperandType.I32, res); break; + } + } + + return res; + } + + public static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize) + { + Debug.Assert(eSize <= 32); + + Operand result = eSize == 32 ? 
Const(0L) : Const(0); + + if (eSize == 32) + { + op1 = context.ZeroExtend32(OperandType.I64, op1); + op2 = context.ZeroExtend32(OperandType.I64, op2); + } + + for (int i = 0; i < eSize; i++) + { + Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i)); + + result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask)); + } + + return result; + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdHelper32Arm64.cs b/src/ARMeilleure/Instructions/InstEmitSimdHelper32Arm64.cs new file mode 100644 index 00000000..98236be6 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdHelper32Arm64.cs @@ -0,0 +1,366 @@ + +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + using Func1I = Func<Operand, Operand>; + using Func2I = Func<Operand, Operand, Operand>; + using Func3I = Func<Operand, Operand, Operand, Operand>; + + static class InstEmitSimdHelper32Arm64 + { + // Intrinsic Helpers + + public static Operand EmitMoveDoubleWordToSide(ArmEmitterContext context, Operand input, int originalV, int targetV) + { + Debug.Assert(input.Type == OperandType.V128); + + int originalSide = originalV & 1; + int targetSide = targetV & 1; + + if (originalSide == targetSide) + { + return input; + } + + Intrinsic vType = Intrinsic.Arm64VDWord | Intrinsic.Arm64V128; + + if (targetSide == 1) + { + return context.AddIntrinsic(Intrinsic.Arm64DupVe | vType, input, Const(OperandType.I32, 0)); // Low to high. + } + else + { + return context.AddIntrinsic(Intrinsic.Arm64DupVe | vType, input, Const(OperandType.I32, 1)); // High to low. + } + } + + public static Operand EmitDoubleWordInsert(ArmEmitterContext context, Operand target, Operand value, int targetV) + { + Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128); + + int targetSide = targetV & 1; + Operand idx = Const(targetSide); + + return context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VDWord, target, idx, value, idx); + } + + public static Operand EmitScalarInsert(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth) + { + Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128); + + // Insert from index 0 in value to index in target. + int index = reg & (doubleWidth ? 1 : 3); + + if (doubleWidth) + { + return context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VDWord, target, Const(index), value, Const(0)); + } + else + { + return context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VWord, target, Const(index), value, Const(0)); + } + } + + public static Operand EmitExtractScalar(ArmEmitterContext context, Operand target, int reg, bool doubleWidth) + { + int index = reg & (doubleWidth ? 1 : 3); + if (index == 0) return target; // Element is already at index 0, so just return the vector directly. + + if (doubleWidth) + { + return context.AddIntrinsic(Intrinsic.Arm64DupSe | Intrinsic.Arm64VDWord, target, Const(1)); // Extract high (index 1). + } + else + { + return context.AddIntrinsic(Intrinsic.Arm64DupSe | Intrinsic.Arm64VWord, target, Const(index)); // Extract element at index. 
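// Editor's note: Arm64DupSe copies the selected element into element 0 of a scalar register,
// playing the same role as EmitSwapScalar in the x86 helpers; the scalar operation that
// follows always reads lane 0, and EmitScalarInsert above then moves lane 0 of the result
// into the destination lane with InsVe.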
+ } + } + + // Vector Operand Templates + + public static void EmitVectorUnaryOpSimd32(ArmEmitterContext context, Func1I vectorFunc) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand m = GetVecA32(op.Qm); + Operand d = GetVecA32(op.Qd); + + if (!op.Q) // Register swap: move relevant doubleword to destination side. + { + m = EmitMoveDoubleWordToSide(context, m, op.Vm, op.Vd); + } + + Operand res = vectorFunc(m); + + if (!op.Q) // Register insert. + { + res = EmitDoubleWordInsert(context, d, res, op.Vd); + } + + context.Copy(d, res); + } + + public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Intrinsic inst) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + inst |= ((op.Size & 1) != 0 ? Intrinsic.Arm64VDouble : Intrinsic.Arm64VFloat) | Intrinsic.Arm64V128; + EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m)); + } + + public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc, int side = -1) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand n = GetVecA32(op.Qn); + Operand m = GetVecA32(op.Qm); + Operand d = GetVecA32(op.Qd); + + if (side == -1) + { + side = op.Vd; + } + + if (!op.Q) // Register swap: move relevant doubleword to destination side. + { + n = EmitMoveDoubleWordToSide(context, n, op.Vn, side); + m = EmitMoveDoubleWordToSide(context, m, op.Vm, side); + } + + Operand res = vectorFunc(n, m); + + if (!op.Q) // Register insert. + { + if (side != op.Vd) + { + res = EmitMoveDoubleWordToSide(context, res, side, op.Vd); + } + res = EmitDoubleWordInsert(context, d, res, op.Vd); + } + + context.Copy(d, res); + } + + public static void EmitVectorBinaryOpF32(ArmEmitterContext context, Intrinsic inst) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + inst |= ((op.Size & 1) != 0 ? Intrinsic.Arm64VDouble : Intrinsic.Arm64VFloat) | Intrinsic.Arm64V128; + EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m)); + } + + public static void EmitVectorTernaryOpSimd32(ArmEmitterContext context, Func3I vectorFunc) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand n = GetVecA32(op.Qn); + Operand m = GetVecA32(op.Qm); + Operand d = GetVecA32(op.Qd); + Operand initialD = d; + + if (!op.Q) // Register swap: move relevant doubleword to destination side. + { + n = EmitMoveDoubleWordToSide(context, n, op.Vn, op.Vd); + m = EmitMoveDoubleWordToSide(context, m, op.Vm, op.Vd); + } + + Operand res = vectorFunc(d, n, m); + + if (!op.Q) // Register insert. + { + res = EmitDoubleWordInsert(context, initialD, res, op.Vd); + } + + context.Copy(initialD, res); + } + + public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + inst |= ((op.Size & 1) != 0 ? Intrinsic.Arm64VDouble : Intrinsic.Arm64VFloat) | Intrinsic.Arm64V128; + EmitVectorTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(inst, d, n, m)); + } + + public static void EmitScalarUnaryOpSimd32(ArmEmitterContext context, Func1I scalarFunc) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 1 : 2; + Operand m = GetVecA32(op.Vm >> shift); + Operand d = GetVecA32(op.Vd >> shift); + + m = EmitExtractScalar(context, m, op.Vm, doubleSize); + + Operand res = scalarFunc(m); + + // Insert scalar into vector. 
+ res = EmitScalarInsert(context, d, res, op.Vd, doubleSize); + + context.Copy(d, res); + } + + public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Intrinsic inst) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + inst |= ((op.Size & 1) != 0 ? Intrinsic.Arm64VDouble : Intrinsic.Arm64VFloat) | Intrinsic.Arm64V128; + EmitScalarUnaryOpSimd32(context, (m) => (inst == 0) ? m : context.AddIntrinsic(inst, m)); + } + + public static void EmitScalarBinaryOpSimd32(ArmEmitterContext context, Func2I scalarFunc) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 1 : 2; + Operand n = GetVecA32(op.Vn >> shift); + Operand m = GetVecA32(op.Vm >> shift); + Operand d = GetVecA32(op.Vd >> shift); + + n = EmitExtractScalar(context, n, op.Vn, doubleSize); + m = EmitExtractScalar(context, m, op.Vm, doubleSize); + + Operand res = scalarFunc(n, m); + + // Insert scalar into vector. + res = EmitScalarInsert(context, d, res, op.Vd, doubleSize); + + context.Copy(d, res); + } + + public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Intrinsic inst) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + inst |= ((op.Size & 1) != 0 ? Intrinsic.Arm64VDouble : Intrinsic.Arm64VFloat) | Intrinsic.Arm64V128; + EmitScalarBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m)); + } + + public static void EmitScalarTernaryOpSimd32(ArmEmitterContext context, Func3I scalarFunc) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 1 : 2; + Operand n = GetVecA32(op.Vn >> shift); + Operand m = GetVecA32(op.Vm >> shift); + Operand d = GetVecA32(op.Vd >> shift); + Operand initialD = d; + + n = EmitExtractScalar(context, n, op.Vn, doubleSize); + m = EmitExtractScalar(context, m, op.Vm, doubleSize); + d = EmitExtractScalar(context, d, op.Vd, doubleSize); + + Operand res = scalarFunc(d, n, m); + + // Insert scalar into vector. + res = EmitScalarInsert(context, initialD, res, op.Vd, doubleSize); + + context.Copy(initialD, res); + } + + public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + inst |= ((op.Size & 1) != 0 ? Intrinsic.Arm64VDouble : Intrinsic.Arm64VFloat) | Intrinsic.Arm64V128; + EmitScalarTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(inst, d, n, m)); + } + + // Pairwise + + public static void EmitVectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + inst32 |= Intrinsic.Arm64V64 | Intrinsic.Arm64VFloat; + EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst32, n, m), 0); + } + + public static void EmitVcmpOrVcmpe(ArmEmitterContext context, bool signalNaNs) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + bool cmpWithZero = (op.Opc & 2) != 0; + + Intrinsic inst = signalNaNs ? Intrinsic.Arm64FcmpeS : Intrinsic.Arm64FcmpS; + inst |= ((op.Size & 1) != 0 ? Intrinsic.Arm64VDouble : Intrinsic.Arm64VFloat) | Intrinsic.Arm64V128; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 1 : 2; + Operand n = GetVecA32(op.Vd >> shift); + Operand m = GetVecA32(op.Vm >> shift); + + n = EmitExtractScalar(context, n, op.Vd, doubleSize); + m = cmpWithZero ? 
Const(0) : EmitExtractScalar(context, m, op.Vm, doubleSize); + + Operand nzcv = context.AddIntrinsicInt(inst, n, m); + + Operand one = Const(1); + + SetFpFlag(context, FPState.VFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const(28)), one)); + SetFpFlag(context, FPState.CFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const(29)), one)); + SetFpFlag(context, FPState.ZFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const(30)), one)); + SetFpFlag(context, FPState.NFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const(31)), one)); + } + + public static void EmitCmpOpF32(ArmEmitterContext context, CmpCondition cond, bool zero) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + int sizeF = op.Size & 1; + + Intrinsic inst; + if (zero) + { + inst = cond switch + { + CmpCondition.Equal => Intrinsic.Arm64FcmeqVz, + CmpCondition.GreaterThan => Intrinsic.Arm64FcmgtVz, + CmpCondition.GreaterThanOrEqual => Intrinsic.Arm64FcmgeVz, + CmpCondition.LessThan => Intrinsic.Arm64FcmltVz, + CmpCondition.LessThanOrEqual => Intrinsic.Arm64FcmleVz, + _ => throw new InvalidOperationException() + }; + } + else { + inst = cond switch + { + CmpCondition.Equal => Intrinsic.Arm64FcmeqV, + CmpCondition.GreaterThan => Intrinsic.Arm64FcmgtV, + CmpCondition.GreaterThanOrEqual => Intrinsic.Arm64FcmgeV, + _ => throw new InvalidOperationException() + }; + } + + inst |= (sizeF != 0 ? Intrinsic.Arm64VDouble : Intrinsic.Arm64VFloat) | Intrinsic.Arm64V128; + + if (zero) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + return context.AddIntrinsic(inst, m); + }); + } + else + { + EmitVectorBinaryOpSimd32(context, (n, m) => + { + return context.AddIntrinsic(inst, n, m); + }); + } + } + } +}
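// Editor's sketch (not part of this commit): FcmpS/FcmpeS leave the AArch64 NZCV result packed
// into bits 31..28 of the returned integer, which EmitVcmpOrVcmpe above unpacks into the
// emulated FPSCR flags. In plain C# (names illustrative):
static (bool N, bool Z, bool C, bool V) UnpackNzcv(uint nzcv)
{
    return (((nzcv >> 31) & 1) != 0,  // N
            ((nzcv >> 30) & 1) != 0,  // Z
            ((nzcv >> 29) & 1) != 0,  // C
            ((nzcv >> 28) & 1) != 0); // V
}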
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitSimdHelperArm64.cs b/src/ARMeilleure/Instructions/InstEmitSimdHelperArm64.cs new file mode 100644 index 00000000..f0d242ae --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdHelperArm64.cs @@ -0,0 +1,720 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static class InstEmitSimdHelperArm64 + { + public static void EmitScalarUnaryOpF(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n)); + } + + public static void EmitScalarUnaryOpFFromGp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n)); + } + + public static void EmitScalarUnaryOpFToGp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdCvt op = (OpCodeSimdCvt)context.CurrOp; + + Operand n = GetVec(op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + SetIntOrZR(context, op.Rd, op.RegisterSize == RegisterSize.Int32 + ? context.AddIntrinsicInt (inst, n) + : context.AddIntrinsicLong(inst, n)); + } + + public static void EmitScalarBinaryOpF(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, m)); + } + + public static void EmitScalarBinaryOpFByElem(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, m, Const(op.Index))); + } + + public static void EmitScalarTernaryOpF(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + Operand a = GetVec(op.Ra); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, a, n, m)); + } + + public static void EmitScalarTernaryOpFRdByElem(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + context.Copy(d, context.AddIntrinsic(inst, d, n, m, Const(op.Index))); + } + + public static void EmitScalarUnaryOp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n)); + } + + public static void EmitScalarBinaryOp(ArmEmitterContext context, Intrinsic inst) + { + 
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, m)); + } + + public static void EmitScalarBinaryOpRd(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, d, n)); + } + + public static void EmitScalarTernaryOpRd(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + context.Copy(d, context.AddIntrinsic(inst, d, n, m)); + } + + public static void EmitScalarShiftBinaryOp(ArmEmitterContext context, Intrinsic inst, int shift) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, Const(shift))); + } + + public static void EmitScalarShiftTernaryOpRd(ArmEmitterContext context, Intrinsic inst, int shift) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, d, n, Const(shift))); + } + + public static void EmitScalarSaturatingShiftTernaryOpRd(ArmEmitterContext context, Intrinsic inst, int shift) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, d, n, Const(shift))); + + context.SetPendingQcFlagSync(); + } + + public static void EmitScalarSaturatingUnaryOp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + Operand result = context.AddIntrinsic(inst, n); + + context.Copy(GetVec(op.Rd), result); + + context.SetPendingQcFlagSync(); + } + + public static void EmitScalarSaturatingBinaryOp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + Operand result = context.AddIntrinsic(inst, n, m); + + context.Copy(GetVec(op.Rd), result); + + context.SetPendingQcFlagSync(); + } + + public static void EmitScalarSaturatingBinaryOpRd(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + Operand result = context.AddIntrinsic(inst, d, n); + + context.Copy(GetVec(op.Rd), result); + + context.SetPendingQcFlagSync(); + } + + public static void EmitScalarConvertBinaryOpF(ArmEmitterContext context, Intrinsic inst, int fBits) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + context.Copy(GetVec(op.Rd), 
context.AddIntrinsic(inst, n, Const(fBits))); + } + + public static void EmitScalarConvertBinaryOpFFromGp(ArmEmitterContext context, Intrinsic inst, int fBits) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, Const(fBits))); + } + + public static void EmitScalarConvertBinaryOpFToGp(ArmEmitterContext context, Intrinsic inst, int fBits) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + SetIntOrZR(context, op.Rd, op.RegisterSize == RegisterSize.Int32 + ? context.AddIntrinsicInt (inst, n, Const(fBits)) + : context.AddIntrinsicLong(inst, n, Const(fBits))); + } + + public static void EmitVectorUnaryOpF(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n)); + } + + public static void EmitVectorBinaryOpF(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, m)); + } + + public static void EmitVectorBinaryOpFRd(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, d, n)); + } + + public static void EmitVectorBinaryOpFByElem(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, m, Const(op.Index))); + } + + public static void EmitVectorTernaryOpFRd(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(d, context.AddIntrinsic(inst, d, n, m)); + } + + public static void EmitVectorTernaryOpFRdByElem(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(d, context.AddIntrinsic(inst, d, n, m, Const(op.Index))); + } + + public static void 
EmitVectorUnaryOp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n)); + } + + public static void EmitVectorBinaryOp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, m)); + } + + public static void EmitVectorBinaryOpRd(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, d, n)); + } + + public static void EmitVectorBinaryOpByElem(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, m, Const(op.Index))); + } + + public static void EmitVectorTernaryOpRd(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(d, context.AddIntrinsic(inst, d, n, m)); + } + + public static void EmitVectorTernaryOpRdByElem(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(d, context.AddIntrinsic(inst, d, n, m, Const(op.Index))); + } + + public static void EmitVectorShiftBinaryOp(ArmEmitterContext context, Intrinsic inst, int shift) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, Const(shift))); + } + + public static void EmitVectorShiftTernaryOpRd(ArmEmitterContext context, Intrinsic inst, int shift) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, d, n, Const(shift))); + } + + public static void 
EmitVectorSaturatingShiftTernaryOpRd(ArmEmitterContext context, Intrinsic inst, int shift) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, d, n, Const(shift))); + + context.SetPendingQcFlagSync(); + } + + public static void EmitVectorSaturatingUnaryOp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + Operand result = context.AddIntrinsic(inst, n); + + context.Copy(GetVec(op.Rd), result); + + context.SetPendingQcFlagSync(); + } + + public static void EmitVectorSaturatingBinaryOp(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + Operand result = context.AddIntrinsic(inst, n, m); + + context.Copy(GetVec(op.Rd), result); + + context.SetPendingQcFlagSync(); + } + + public static void EmitVectorSaturatingBinaryOpRd(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + Operand result = context.AddIntrinsic(inst, d, n); + + context.Copy(GetVec(op.Rd), result); + + context.SetPendingQcFlagSync(); + } + + public static void EmitVectorSaturatingBinaryOpByElem(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + inst |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + Operand result = context.AddIntrinsic(inst, n, m, Const(op.Index)); + + context.Copy(GetVec(op.Rd), result); + + context.SetPendingQcFlagSync(); + } + + public static void EmitVectorConvertBinaryOpF(ArmEmitterContext context, Intrinsic inst, int fBits) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, n, Const(fBits))); + } + + public static void EmitVectorLookupTable(ArmEmitterContext context, Intrinsic inst) + { + OpCodeSimdTbl op = (OpCodeSimdTbl)context.CurrOp; + + Operand[] operands = new Operand[op.Size + 1]; + + operands[op.Size] = GetVec(op.Rm); + + for (int index = 0; index < op.Size; index++) + { + operands[index] = GetVec((op.Rn + index) & 0x1F); + } + + if (op.RegisterSize == RegisterSize.Simd128) + { + inst |= Intrinsic.Arm64V128; + } + + context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst, operands)); + } + + public static void EmitFcmpOrFcmpe(ArmEmitterContext context, bool signalNaNs) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + 
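+            // When the opcode encodes a compare with #0.0 (Bit3 set on the non-conditional
+            // form), a constant zero is used as the second operand. The compare intrinsic
+            // returns NZCV packed into bits 31:28 of an integer (N = bit 31, Z = 30, C = 29,
+            // V = 28), which is unpacked into the emulated PState flags below.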
bool cmpWithZero = !(op is OpCodeSimdFcond) ? op.Bit3 : false; + + Intrinsic inst = signalNaNs ? Intrinsic.Arm64FcmpeS : Intrinsic.Arm64FcmpS; + + if ((op.Size & 1) != 0) + { + inst |= Intrinsic.Arm64VDouble; + } + + Operand n = GetVec(op.Rn); + Operand m = cmpWithZero ? Const(0) : GetVec(op.Rm); + + Operand nzcv = context.AddIntrinsicInt(inst, n, m); + + Operand one = Const(1); + + SetFlag(context, PState.VFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const(28)), one)); + SetFlag(context, PState.CFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const(29)), one)); + SetFlag(context, PState.ZFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const(30)), one)); + SetFlag(context, PState.NFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const(31)), one)); + } + } +}
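The helpers above form the Arm64-host fast path used by the instruction emitters added later in this commit: each emitter checks Optimizations.UseAdvSimd and, if set, forwards its opcode to the matching helper together with an Arm64 intrinsic, otherwise it falls back to a per-element IR implementation. The following is only an illustrative sketch of that dispatch pattern, mirroring And_V in InstEmitSimdLogical.cs below; Example_V is a hypothetical emitter name, while the helper, optimization flag and Arm64AndV intrinsic names are taken from this commit, and the method is assumed to live in the same InstEmit partial class with its using-static directives:

    public static void Example_V(ArmEmitterContext context)
    {
        if (Optimizations.UseAdvSimd)
        {
            // On an Arm64 host, emit the equivalent AdvSIMD instruction directly.
            InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64AndV);
        }
        else
        {
            // Otherwise fall back to a per-element IR implementation.
            EmitVectorBinaryOpZx(context, (op1, op2) => context.BitwiseAnd(op1, op2));
        }
    }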
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitSimdLogical.cs b/src/ARMeilleure/Instructions/InstEmitSimdLogical.cs new file mode 100644 index 00000000..2bf531e6 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdLogical.cs @@ -0,0 +1,612 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void And_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64AndV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pand, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => context.BitwiseAnd(op1, op2)); + } + } + + public static void Bic_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64BicV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pandn, m, n); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => + { + return context.BitwiseAnd(op1, context.BitwiseNot(op2)); + }); + } + } + + public static void Bic_Vi(ArmEmitterContext context) + { + if (Optimizations.UseSse2) + { + OpCodeSimdImm op = (OpCodeSimdImm)context.CurrOp; + + int eSize = 8 << op.Size; + + Operand d = GetVec(op.Rd); + Operand imm = eSize switch { + 16 => X86GetAllElements(context, (short)~op.Immediate), + 32 => X86GetAllElements(context, (int)~op.Immediate), + _ => throw new InvalidOperationException($"Invalid element size {eSize}.") + }; + + Operand res = context.AddIntrinsic(Intrinsic.X86Pand, d, imm); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorImmBinaryOp(context, (op1, op2) => + { + return context.BitwiseAnd(op1, context.BitwiseNot(op2)); + }); + } + } + + public static void Bif_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64BifV); + } + else + { + EmitBifBit(context, notRm: true); + } + } + + public static void Bit_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64BitV); + } + else + { + EmitBifBit(context, notRm: false); + } + } + + private static void EmitBifBit(ArmEmitterContext context, bool notRm) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseSse2) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, d); + + if 
(notRm) + { + res = context.AddIntrinsic(Intrinsic.X86Pandn, m, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Pand, m, res); + } + + res = context.AddIntrinsic(Intrinsic.X86Pxor, d, res); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + Operand res = context.VectorZero(); + + int elems = op.RegisterSize == RegisterSize.Simd128 ? 2 : 1; + + for (int index = 0; index < elems; index++) + { + Operand d = EmitVectorExtractZx(context, op.Rd, index, 3); + Operand n = EmitVectorExtractZx(context, op.Rn, index, 3); + Operand m = EmitVectorExtractZx(context, op.Rm, index, 3); + + if (notRm) + { + m = context.BitwiseNot(m); + } + + Operand e = context.BitwiseExclusiveOr(d, n); + + e = context.BitwiseAnd(e, m); + e = context.BitwiseExclusiveOr(e, d); + + res = EmitVectorInsert(context, res, e, index, 3); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Bsl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64BslV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, m); + + res = context.AddIntrinsic(Intrinsic.X86Pand, res, d); + res = context.AddIntrinsic(Intrinsic.X86Pxor, res, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + EmitVectorTernaryOpZx(context, (op1, op2, op3) => + { + return context.BitwiseExclusiveOr( + context.BitwiseAnd(op1, + context.BitwiseExclusiveOr(op2, op3)), op3); + }); + } + } + + public static void Eor_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64EorV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => context.BitwiseExclusiveOr(op1, op2)); + } + } + + public static void Not_V(ArmEmitterContext context) + { + if (Optimizations.UseAvx512Ortho) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand res = context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, n, Const(~0b10101010)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseSse2) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand mask = X86GetAllElements(context, -1L); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pandn, n, mask); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorUnaryOpZx(context, (op1) => context.BitwiseNot(op1)); + } + } + + public static void Orn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64OrnV); + } + else if (Optimizations.UseAvx512Ortho) + { + OpCodeSimdReg op 
= (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand mask = X86GetAllElements(context, -1L); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pandn, m, mask); + + res = context.AddIntrinsic(Intrinsic.X86Por, res, n); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => + { + return context.BitwiseOr(op1, context.BitwiseNot(op2)); + }); + } + } + + public static void Orr_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64OrrV); + } + else if (Optimizations.UseSse2) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorBinaryOpZx(context, (op1, op2) => context.BitwiseOr(op1, op2)); + } + } + + public static void Orr_Vi(ArmEmitterContext context) + { + if (Optimizations.UseSse2) + { + OpCodeSimdImm op = (OpCodeSimdImm)context.CurrOp; + + int eSize = 8 << op.Size; + + Operand d = GetVec(op.Rd); + Operand imm = eSize switch { + 16 => X86GetAllElements(context, (short)op.Immediate), + 32 => X86GetAllElements(context, (int)op.Immediate), + _ => throw new InvalidOperationException($"Invalid element size {eSize}.") + }; + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, d, imm); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorImmBinaryOp(context, (op1, op2) => context.BitwiseOr(op1, op2)); + } + } + + public static void Rbit_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + if (Optimizations.UseGfni) + { + const long bitMatrix = + (0b10000000L << 56) | + (0b01000000L << 48) | + (0b00100000L << 40) | + (0b00010000L << 32) | + (0b00001000L << 24) | + (0b00000100L << 16) | + (0b00000010L << 8) | + (0b00000001L << 0); + + Operand vBitMatrix = X86GetAllElements(context, bitMatrix); + + Operand res = context.AddIntrinsic(Intrinsic.X86Gf2p8affineqb, GetVec(op.Rn), vBitMatrix, Const(0)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand res = context.VectorZero(); + int elems = op.RegisterSize == RegisterSize.Simd128 ? 
16 : 8; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, 0); + + Operand de = EmitReverseBits8Op(context, ne); + + res = EmitVectorInsert(context, res, de, index, 0); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + private static Operand EmitReverseBits8Op(ArmEmitterContext context, Operand op) + { + Debug.Assert(op.Type == OperandType.I64); + + Operand val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op, Const(0xaaul)), Const(1)), + context.ShiftLeft (context.BitwiseAnd(op, Const(0x55ul)), Const(1))); + + val = context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(val, Const(0xccul)), Const(2)), + context.ShiftLeft (context.BitwiseAnd(val, Const(0x33ul)), Const(2))); + + return context.BitwiseOr(context.ShiftRightUI(val, Const(4)), + context.ShiftLeft (context.BitwiseAnd(val, Const(0x0ful)), Const(4))); + } + + public static void Rev16_V(ArmEmitterContext context) + { + if (Optimizations.UseSsse3) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + const long maskE0 = 06L << 56 | 07L << 48 | 04L << 40 | 05L << 32 | 02L << 24 | 03L << 16 | 00L << 8 | 01L << 0; + const long maskE1 = 14L << 56 | 15L << 48 | 12L << 40 | 13L << 32 | 10L << 24 | 11L << 16 | 08L << 8 | 09L << 0; + + Operand mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mask); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitRev_V(context, containerSize: 1); + } + } + + public static void Rev32_V(ArmEmitterContext context) + { + if (Optimizations.UseSsse3) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand mask; + + if (op.Size == 0) + { + const long maskE0 = 04L << 56 | 05L << 48 | 06L << 40 | 07L << 32 | 00L << 24 | 01L << 16 | 02L << 8 | 03L << 0; + const long maskE1 = 12L << 56 | 13L << 48 | 14L << 40 | 15L << 32 | 08L << 24 | 09L << 16 | 10L << 8 | 11L << 0; + + mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + } + else /* if (op.Size == 1) */ + { + const long maskE0 = 05L << 56 | 04L << 48 | 07L << 40 | 06L << 32 | 01L << 24 | 00L << 16 | 03L << 8 | 02L << 0; + const long maskE1 = 13L << 56 | 12L << 48 | 15L << 40 | 14L << 32 | 09L << 24 | 08L << 16 | 11L << 8 | 10L << 0; + + mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + } + + Operand res = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mask); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitRev_V(context, containerSize: 2); + } + } + + public static void Rev64_V(ArmEmitterContext context) + { + if (Optimizations.UseSsse3) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand mask; + + if (op.Size == 0) + { + const long maskE0 = 00L << 56 | 01L << 48 | 02L << 40 | 03L << 32 | 04L << 24 | 05L << 16 | 06L << 8 | 07L << 0; + const long maskE1 = 08L << 56 | 09L << 48 | 10L << 40 | 11L << 32 | 12L << 24 | 13L << 16 | 14L << 8 | 15L << 0; + + mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + } + else if (op.Size == 1) + { + const long maskE0 = 01L << 56 | 00L << 48 | 03L << 40 | 02L << 32 | 05L 
<< 24 | 04L << 16 | 07L << 8 | 06L << 0; + const long maskE1 = 09L << 56 | 08L << 48 | 11L << 40 | 10L << 32 | 13L << 24 | 12L << 16 | 15L << 8 | 14L << 0; + + mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + } + else /* if (op.Size == 2) */ + { + const long maskE0 = 03L << 56 | 02L << 48 | 01L << 40 | 00L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0; + const long maskE1 = 11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 15L << 24 | 14L << 16 | 13L << 8 | 12L << 0; + + mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + } + + Operand res = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mask); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitRev_V(context, containerSize: 3); + } + } + + private static void EmitRev_V(ArmEmitterContext context, int containerSize) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + int containerMask = (1 << (containerSize - op.Size)) - 1; + + for (int index = 0; index < elems; index++) + { + int revIndex = index ^ containerMask; + + Operand ne = EmitVectorExtractZx(context, op.Rn, revIndex, op.Size); + + res = EmitVectorInsert(context, res, ne, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdLogical32.cs b/src/ARMeilleure/Instructions/InstEmitSimdLogical32.cs new file mode 100644 index 00000000..68ef4ed1 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdLogical32.cs @@ -0,0 +1,266 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper32; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Vand_I(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64AndV | Intrinsic.Arm64V128, n, m)); + } + else if (Optimizations.UseSse2) + { + EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.X86Pand, n, m)); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, op2)); + } + } + + public static void Vbic_I(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64BicV | Intrinsic.Arm64V128, n, m)); + } + else if (Optimizations.UseSse2) + { + EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.X86Pandn, m, n)); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, context.BitwiseNot(op2))); + } + } + + public static void Vbic_II(ArmEmitterContext context) + { + OpCode32SimdImm op = (OpCode32SimdImm)context.CurrOp; + + long immediate = op.Immediate; + + // Replicate fields to fill the 64-bits, if size is < 64-bits. 
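+            // For example, with Size == 0 an 8-bit immediate 0xAB becomes 0xABABABABABABABAB,
+            // and with Size == 1 a 16-bit immediate 0x00CD becomes 0x00CD00CD00CD00CD.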
+ switch (op.Size) + { + case 0: immediate *= 0x0101010101010101L; break; + case 1: immediate *= 0x0001000100010001L; break; + case 2: immediate *= 0x0000000100000001L; break; + } + + Operand imm = Const(immediate); + Operand res = GetVecA32(op.Qd); + + if (op.Q) + { + for (int elem = 0; elem < 2; elem++) + { + Operand de = EmitVectorExtractZx(context, op.Qd, elem, 3); + + res = EmitVectorInsert(context, res, context.BitwiseAnd(de, context.BitwiseNot(imm)), elem, 3); + } + } + else + { + Operand de = EmitVectorExtractZx(context, op.Qd, op.Vd & 1, 3); + + res = EmitVectorInsert(context, res, context.BitwiseAnd(de, context.BitwiseNot(imm)), op.Vd & 1, 3); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Vbif(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(Intrinsic.Arm64BifV | Intrinsic.Arm64V128, d, n, m)); + } + else + { + EmitBifBit(context, true); + } + } + + public static void Vbit(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(Intrinsic.Arm64BitV | Intrinsic.Arm64V128, d, n, m)); + } + else + { + EmitBifBit(context, false); + } + } + + public static void Vbsl(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorTernaryOpSimd32(context, (d, n, m) => context.AddIntrinsic(Intrinsic.Arm64BslV | Intrinsic.Arm64V128, d, n, m)); + } + else if (Optimizations.UseSse2) + { + EmitVectorTernaryOpSimd32(context, (d, n, m) => + { + Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, m); + res = context.AddIntrinsic(Intrinsic.X86Pand, res, d); + return context.AddIntrinsic(Intrinsic.X86Pxor, res, m); + }); + } + else + { + EmitVectorTernaryOpZx32(context, (op1, op2, op3) => + { + return context.BitwiseExclusiveOr( + context.BitwiseAnd(op1, + context.BitwiseExclusiveOr(op2, op3)), op3); + }); + } + } + + public static void Veor_I(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64EorV | Intrinsic.Arm64V128, n, m)); + } + else if (Optimizations.UseSse2) + { + EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.X86Pxor, n, m)); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseExclusiveOr(op1, op2)); + } + } + + public static void Vorn_I(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64OrnV | Intrinsic.Arm64V128, n, m)); + } + else if (Optimizations.UseAvx512Ortho) + { + EmitVectorBinaryOpSimd32(context, (n, m) => + { + return context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010)); + }); + } + else if (Optimizations.UseSse2) + { + Operand mask = context.VectorOne(); + + EmitVectorBinaryOpSimd32(context, (n, m) => + { + m = context.AddIntrinsic(Intrinsic.X86Pandn, m, mask); + return context.AddIntrinsic(Intrinsic.X86Por, n, m); + }); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, context.BitwiseNot(op2))); + } + } + + public static void Vorr_I(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64OrrV | 
Intrinsic.Arm64V128, n, m)); + } + else if (Optimizations.UseSse2) + { + EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.X86Por, n, m)); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, op2)); + } + } + + public static void Vorr_II(ArmEmitterContext context) + { + OpCode32SimdImm op = (OpCode32SimdImm)context.CurrOp; + + long immediate = op.Immediate; + + // Replicate fields to fill the 64-bits, if size is < 64-bits. + switch (op.Size) + { + case 0: immediate *= 0x0101010101010101L; break; + case 1: immediate *= 0x0001000100010001L; break; + case 2: immediate *= 0x0000000100000001L; break; + } + + Operand imm = Const(immediate); + Operand res = GetVecA32(op.Qd); + + if (op.Q) + { + for (int elem = 0; elem < 2; elem++) + { + Operand de = EmitVectorExtractZx(context, op.Qd, elem, 3); + + res = EmitVectorInsert(context, res, context.BitwiseOr(de, imm), elem, 3); + } + } + else + { + Operand de = EmitVectorExtractZx(context, op.Qd, op.Vd & 1, 3); + + res = EmitVectorInsert(context, res, context.BitwiseOr(de, imm), op.Vd & 1, 3); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Vtst(ArmEmitterContext context) + { + EmitVectorBinaryOpZx32(context, (op1, op2) => + { + Operand isZero = context.ICompareEqual(context.BitwiseAnd(op1, op2), Const(0)); + return context.ConditionalSelect(isZero, Const(0), Const(-1)); + }); + } + + private static void EmitBifBit(ArmEmitterContext context, bool notRm) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (Optimizations.UseSse2) + { + EmitVectorTernaryOpSimd32(context, (d, n, m) => + { + Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, d); + res = context.AddIntrinsic((notRm) ? Intrinsic.X86Pandn : Intrinsic.X86Pand, m, res); + return context.AddIntrinsic(Intrinsic.X86Pxor, d, res); + }); + } + else + { + EmitVectorTernaryOpZx32(context, (d, n, m) => + { + if (notRm) + { + m = context.BitwiseNot(m); + } + return context.BitwiseExclusiveOr( + context.BitwiseAnd(m, + context.BitwiseExclusiveOr(d, n)), d); + }); + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdMemory.cs b/src/ARMeilleure/Instructions/InstEmitSimdMemory.cs new file mode 100644 index 00000000..9b19872a --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdMemory.cs @@ -0,0 +1,160 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System.Diagnostics; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitMemoryHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + public static void Ld__Vms(ArmEmitterContext context) + { + EmitSimdMemMs(context, isLoad: true); + } + + public static void Ld__Vss(ArmEmitterContext context) + { + EmitSimdMemSs(context, isLoad: true); + } + + public static void St__Vms(ArmEmitterContext context) + { + EmitSimdMemMs(context, isLoad: false); + } + + public static void St__Vss(ArmEmitterContext context) + { + EmitSimdMemSs(context, isLoad: false); + } + + private static void EmitSimdMemMs(ArmEmitterContext context, bool isLoad) + { + OpCodeSimdMemMs op = (OpCodeSimdMemMs)context.CurrOp; + + Operand n = GetIntOrSP(context, op.Rn); + + long offset = 0; + + for (int rep = 0; rep < op.Reps; rep++) + for (int elem = 0; elem < op.Elems; elem++) + for (int sElem = 0; sElem < op.SElems; sElem++) + { + int 
rtt = (op.Rt + rep + sElem) & 0x1f; + + Operand tt = GetVec(rtt); + + Operand address = context.Add(n, Const(offset)); + + if (isLoad) + { + EmitLoadSimd(context, address, tt, rtt, elem, op.Size); + + if (op.RegisterSize == RegisterSize.Simd64 && elem == op.Elems - 1) + { + context.Copy(tt, context.VectorZeroUpper64(tt)); + } + } + else + { + EmitStoreSimd(context, address, rtt, elem, op.Size); + } + + offset += 1 << op.Size; + } + + if (op.WBack) + { + EmitSimdMemWBack(context, offset); + } + } + + private static void EmitSimdMemSs(ArmEmitterContext context, bool isLoad) + { + OpCodeSimdMemSs op = (OpCodeSimdMemSs)context.CurrOp; + + Operand n = GetIntOrSP(context, op.Rn); + + long offset = 0; + + if (op.Replicate) + { + // Only loads uses the replicate mode. + Debug.Assert(isLoad, "Replicate mode is not valid for stores."); + + int elems = op.GetBytesCount() >> op.Size; + + for (int sElem = 0; sElem < op.SElems; sElem++) + { + int rt = (op.Rt + sElem) & 0x1f; + + Operand t = GetVec(rt); + + Operand address = context.Add(n, Const(offset)); + + for (int index = 0; index < elems; index++) + { + EmitLoadSimd(context, address, t, rt, index, op.Size); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + context.Copy(t, context.VectorZeroUpper64(t)); + } + + offset += 1 << op.Size; + } + } + else + { + for (int sElem = 0; sElem < op.SElems; sElem++) + { + int rt = (op.Rt + sElem) & 0x1f; + + Operand t = GetVec(rt); + + Operand address = context.Add(n, Const(offset)); + + if (isLoad) + { + EmitLoadSimd(context, address, t, rt, op.Index, op.Size); + } + else + { + EmitStoreSimd(context, address, rt, op.Index, op.Size); + } + + offset += 1 << op.Size; + } + } + + if (op.WBack) + { + EmitSimdMemWBack(context, offset); + } + } + + private static void EmitSimdMemWBack(ArmEmitterContext context, long offset) + { + OpCodeMemReg op = (OpCodeMemReg)context.CurrOp; + + Operand n = GetIntOrSP(context, op.Rn); + Operand m; + + if (op.Rm != RegisterAlias.Zr) + { + m = GetIntOrZR(context, op.Rm); + } + else + { + m = Const(offset); + } + + context.Copy(n, context.Add(n, m)); + } + } +}
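The triple loop in EmitSimdMemMs walks memory strictly sequentially while rotating through the structure registers, which is what de-interleaves the LD2/LD3/LD4 forms (and interleaves them again on store). Below is a small, self-contained sketch of the access order that loop produces; the rep, structure and element counts are assumed values for a hypothetical LD4 of 32-bit lanes, chosen only for illustration and not taken from the decoder:

    using System;

    class SimdMemWalkExample
    {
        static void Main()
        {
            // Assumed values for illustration: one rep, 4 structure registers (sElems),
            // 4 elements per register, 4-byte lanes (size == 2), starting at Vt = V0.
            int rt = 0, reps = 1, selems = 4, elems = 4, size = 2;
            long offset = 0;

            for (int rep = 0; rep < reps; rep++)
            for (int elem = 0; elem < elems; elem++)
            for (int sElem = 0; sElem < selems; sElem++)
            {
                int reg = (rt + rep + sElem) & 0x1f;  // same register rotation as the emitter
                Console.WriteLine($"[Xn + {offset,2}] <-> V{reg}.S[{elem}]");
                offset += 1L << size;                 // one element per memory access
            }
        }
    }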
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/InstEmitSimdMemory32.cs b/src/ARMeilleure/Instructions/InstEmitSimdMemory32.cs new file mode 100644 index 00000000..b774bd06 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdMemory32.cs @@ -0,0 +1,352 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitMemoryHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Vld1(ArmEmitterContext context) + { + EmitVStoreOrLoadN(context, 1, true); + } + + public static void Vld2(ArmEmitterContext context) + { + EmitVStoreOrLoadN(context, 2, true); + } + + public static void Vld3(ArmEmitterContext context) + { + EmitVStoreOrLoadN(context, 3, true); + } + + public static void Vld4(ArmEmitterContext context) + { + EmitVStoreOrLoadN(context, 4, true); + } + + public static void Vst1(ArmEmitterContext context) + { + EmitVStoreOrLoadN(context, 1, false); + } + + public static void Vst2(ArmEmitterContext context) + { + EmitVStoreOrLoadN(context, 2, false); + } + + public static void Vst3(ArmEmitterContext context) + { + EmitVStoreOrLoadN(context, 3, false); + } + + public static void Vst4(ArmEmitterContext context) + { + EmitVStoreOrLoadN(context, 4, false); + } + + public static void EmitVStoreOrLoadN(ArmEmitterContext context, int count, bool load) + { + if (context.CurrOp is OpCode32SimdMemSingle) + { + OpCode32SimdMemSingle op = (OpCode32SimdMemSingle)context.CurrOp; + + int eBytes = 1 << op.Size; + + Operand n = context.Copy(GetIntA32(context, op.Rn)); + + // TODO: Check alignment. + int offset = 0; + int d = op.Vd; + + for (int i = 0; i < count; i++) + { + // Accesses an element from a double simd register. + Operand address = context.Add(n, Const(offset)); + if (eBytes == 8) + { + if (load) + { + EmitDVectorLoad(context, address, d); + } + else + { + EmitDVectorStore(context, address, d); + } + } + else + { + int index = ((d & 1) << (3 - op.Size)) + op.Index; + if (load) + { + if (op.Replicate) + { + var regs = (count > 1) ? 1 : op.Increment; + for (int reg = 0; reg < regs; reg++) + { + int dreg = reg + d; + int rIndex = ((dreg & 1) << (3 - op.Size)); + int limit = rIndex + (1 << (3 - op.Size)); + + while (rIndex < limit) + { + EmitLoadSimd(context, address, GetVecA32(dreg >> 1), dreg >> 1, rIndex++, op.Size); + } + } + } + else + { + EmitLoadSimd(context, address, GetVecA32(d >> 1), d >> 1, index, op.Size); + } + } + else + { + EmitStoreSimd(context, address, d >> 1, index, op.Size); + } + } + offset += eBytes; + d += op.Increment; + } + + if (op.WBack) + { + if (op.RegisterIndex) + { + Operand m = GetIntA32(context, op.Rm); + SetIntA32(context, op.Rn, context.Add(n, m)); + } + else + { + SetIntA32(context, op.Rn, context.Add(n, Const(count * eBytes))); + } + } + } + else + { + OpCode32SimdMemPair op = (OpCode32SimdMemPair)context.CurrOp; + + int increment = count > 1 ? op.Increment : 1; + int eBytes = 1 << op.Size; + + Operand n = context.Copy(GetIntA32(context, op.Rn)); + int offset = 0; + int d = op.Vd; + + for (int reg = 0; reg < op.Regs; reg++) + { + for (int elem = 0; elem < op.Elems; elem++) + { + int elemD = d + reg; + for (int i = 0; i < count; i++) + { + // Accesses an element from a double simd register, + // add ebytes for each element. 
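+                                // D registers are modelled as halves of Q vectors, so
+                                // elemD >> 1 selects the backing Q register and
+                                // (elemD & 1) << (3 - op.Size) rebases the element index
+                                // into the low or high half before adding the lane offset.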
+ Operand address = context.Add(n, Const(offset)); + int index = ((elemD & 1) << (3 - op.Size)) + elem; + if (eBytes == 8) + { + if (load) + { + EmitDVectorLoad(context, address, elemD); + } + else + { + EmitDVectorStore(context, address, elemD); + } + } + else + { + if (load) + { + EmitLoadSimd(context, address, GetVecA32(elemD >> 1), elemD >> 1, index, op.Size); + } + else + { + EmitStoreSimd(context, address, elemD >> 1, index, op.Size); + } + } + + offset += eBytes; + elemD += increment; + } + } + } + + if (op.WBack) + { + if (op.RegisterIndex) + { + Operand m = GetIntA32(context, op.Rm); + SetIntA32(context, op.Rn, context.Add(n, m)); + } + else + { + SetIntA32(context, op.Rn, context.Add(n, Const(count * 8 * op.Regs))); + } + } + } + } + + public static void Vldm(ArmEmitterContext context) + { + OpCode32SimdMemMult op = (OpCode32SimdMemMult)context.CurrOp; + + Operand n = context.Copy(GetIntA32(context, op.Rn)); + + Operand baseAddress = context.Add(n, Const(op.Offset)); + + bool writeBack = op.PostOffset != 0; + + if (writeBack) + { + SetIntA32(context, op.Rn, context.Add(n, Const(op.PostOffset))); + } + + int range = op.RegisterRange; + + int sReg = (op.DoubleWidth) ? (op.Vd << 1) : op.Vd; + int offset = 0; + int byteSize = 4; + + for (int num = 0; num < range; num++, sReg++) + { + Operand address = context.Add(baseAddress, Const(offset)); + Operand vec = GetVecA32(sReg >> 2); + + EmitLoadSimd(context, address, vec, sReg >> 2, sReg & 3, WordSizeLog2); + offset += byteSize; + } + } + + public static void Vstm(ArmEmitterContext context) + { + OpCode32SimdMemMult op = (OpCode32SimdMemMult)context.CurrOp; + + Operand n = context.Copy(GetIntA32(context, op.Rn)); + + Operand baseAddress = context.Add(n, Const(op.Offset)); + + bool writeBack = op.PostOffset != 0; + + if (writeBack) + { + SetIntA32(context, op.Rn, context.Add(n, Const(op.PostOffset))); + } + + int offset = 0; + + int range = op.RegisterRange; + int sReg = (op.DoubleWidth) ? 
(op.Vd << 1) : op.Vd; + int byteSize = 4; + + for (int num = 0; num < range; num++, sReg++) + { + Operand address = context.Add(baseAddress, Const(offset)); + + EmitStoreSimd(context, address, sReg >> 2, sReg & 3, WordSizeLog2); + + offset += byteSize; + } + } + + public static void Vldr(ArmEmitterContext context) + { + EmitVLoadOrStore(context, AccessType.Load); + } + + public static void Vstr(ArmEmitterContext context) + { + EmitVLoadOrStore(context, AccessType.Store); + } + + private static void EmitDVectorStore(ArmEmitterContext context, Operand address, int vecD) + { + int vecQ = vecD >> 1; + int vecSElem = (vecD & 1) << 1; + Operand lblBigEndian = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblBigEndian, GetFlag(PState.EFlag)); + + EmitStoreSimd(context, address, vecQ, vecSElem, WordSizeLog2); + EmitStoreSimd(context, context.Add(address, Const(4)), vecQ, vecSElem | 1, WordSizeLog2); + + context.Branch(lblEnd); + + context.MarkLabel(lblBigEndian); + + EmitStoreSimd(context, address, vecQ, vecSElem | 1, WordSizeLog2); + EmitStoreSimd(context, context.Add(address, Const(4)), vecQ, vecSElem, WordSizeLog2); + + context.MarkLabel(lblEnd); + } + + private static void EmitDVectorLoad(ArmEmitterContext context, Operand address, int vecD) + { + int vecQ = vecD >> 1; + int vecSElem = (vecD & 1) << 1; + Operand vec = GetVecA32(vecQ); + + Operand lblBigEndian = Label(); + Operand lblEnd = Label(); + + context.BranchIfTrue(lblBigEndian, GetFlag(PState.EFlag)); + + EmitLoadSimd(context, address, vec, vecQ, vecSElem, WordSizeLog2); + EmitLoadSimd(context, context.Add(address, Const(4)), vec, vecQ, vecSElem | 1, WordSizeLog2); + + context.Branch(lblEnd); + + context.MarkLabel(lblBigEndian); + + EmitLoadSimd(context, address, vec, vecQ, vecSElem | 1, WordSizeLog2); + EmitLoadSimd(context, context.Add(address, Const(4)), vec, vecQ, vecSElem, WordSizeLog2); + + context.MarkLabel(lblEnd); + } + + private static void EmitVLoadOrStore(ArmEmitterContext context, AccessType accType) + { + OpCode32SimdMemImm op = (OpCode32SimdMemImm)context.CurrOp; + + Operand n = context.Copy(GetIntA32(context, op.Rn)); + Operand m = GetMemM(context, setCarry: false); + + Operand address = op.Add + ? 
context.Add(n, m) + : context.Subtract(n, m); + + int size = op.Size; + + if ((accType & AccessType.Load) != 0) + { + if (size == DWordSizeLog2) + { + EmitDVectorLoad(context, address, op.Vd); + } + else + { + Operand vec = GetVecA32(op.Vd >> 2); + EmitLoadSimd(context, address, vec, op.Vd >> 2, (op.Vd & 3) << (2 - size), size); + } + } + else + { + if (size == DWordSizeLog2) + { + EmitDVectorStore(context, address, op.Vd); + } + else + { + EmitStoreSimd(context, address, op.Vd >> 2, (op.Vd & 3) << (2 - size), size); + } + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdMove.cs b/src/ARMeilleure/Instructions/InstEmitSimdMove.cs new file mode 100644 index 00000000..b58a32f6 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdMove.cs @@ -0,0 +1,850 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System.Collections.Generic; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { +#region "Masks" + private static readonly long[] _masksE0_Uzp = new long[] + { + 13L << 56 | 09L << 48 | 05L << 40 | 01L << 32 | 12L << 24 | 08L << 16 | 04L << 8 | 00L << 0, + 11L << 56 | 10L << 48 | 03L << 40 | 02L << 32 | 09L << 24 | 08L << 16 | 01L << 8 | 00L << 0 + }; + + private static readonly long[] _masksE1_Uzp = new long[] + { + 15L << 56 | 11L << 48 | 07L << 40 | 03L << 32 | 14L << 24 | 10L << 16 | 06L << 8 | 02L << 0, + 15L << 56 | 14L << 48 | 07L << 40 | 06L << 32 | 13L << 24 | 12L << 16 | 05L << 8 | 04L << 0 + }; +#endregion + + public static void Dup_Gp(ArmEmitterContext context) + { + OpCodeSimdIns op = (OpCodeSimdIns)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + + if (Optimizations.UseSse2) + { + switch (op.Size) + { + case 0: n = context.ZeroExtend8 (n.Type, n); n = context.Multiply(n, Const(n.Type, 0x01010101)); break; + case 1: n = context.ZeroExtend16(n.Type, n); n = context.Multiply(n, Const(n.Type, 0x00010001)); break; + case 2: n = context.ZeroExtend32(n.Type, n); break; + } + + Operand res = context.VectorInsert(context.VectorZero(), n, 0); + + if (op.Size < 3) + { + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.AddIntrinsic(Intrinsic.X86Shufps, res, res, Const(0xf0)); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Shufps, res, res, Const(0)); + } + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Movlhps, res, res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + res = EmitVectorInsert(context, res, n, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Dup_S(ArmEmitterContext context) + { + OpCodeSimdIns op = (OpCodeSimdIns)context.CurrOp; + + Operand ne = EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size); + + context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), ne, 0, op.Size)); + } + + public static void Dup_V(ArmEmitterContext context) + { + OpCodeSimdIns op = (OpCodeSimdIns)context.CurrOp; + + if (Optimizations.UseSse2) + { + Operand res = GetVec(op.Rn); + + if (op.Size == 0) + { + if (op.DstIndex != 0) + { + res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(op.DstIndex)); + } + + res = 
context.AddIntrinsic(Intrinsic.X86Punpcklbw, res, res); + res = context.AddIntrinsic(Intrinsic.X86Punpcklwd, res, res); + res = context.AddIntrinsic(Intrinsic.X86Shufps, res, res, Const(0)); + } + else if (op.Size == 1) + { + if (op.DstIndex != 0) + { + res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(op.DstIndex * 2)); + } + + res = context.AddIntrinsic(Intrinsic.X86Punpcklwd, res, res); + res = context.AddIntrinsic(Intrinsic.X86Shufps, res, res, Const(0)); + } + else if (op.Size == 2) + { + int mask = op.DstIndex * 0b01010101; + + res = context.AddIntrinsic(Intrinsic.X86Shufps, res, res, Const(mask)); + } + else if (op.DstIndex == 0 && op.RegisterSize != RegisterSize.Simd64) + { + res = context.AddIntrinsic(Intrinsic.X86Movlhps, res, res); + } + else if (op.DstIndex == 1) + { + res = context.AddIntrinsic(Intrinsic.X86Movhlps, res, res); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand ne = EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size); + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + res = EmitVectorInsert(context, res, ne, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Ext_V(ArmEmitterContext context) + { + OpCodeSimdExt op = (OpCodeSimdExt)context.CurrOp; + + if (Optimizations.UseSse2) + { + Operand nShifted = GetVec(op.Rn); + + if (op.RegisterSize == RegisterSize.Simd64) + { + nShifted = context.VectorZeroUpper64(nShifted); + } + + nShifted = context.AddIntrinsic(Intrinsic.X86Psrldq, nShifted, Const(op.Imm4)); + + Operand mShifted = GetVec(op.Rm); + + mShifted = context.AddIntrinsic(Intrinsic.X86Pslldq, mShifted, Const(op.GetBytesCount() - op.Imm4)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + mShifted = context.VectorZeroUpper64(mShifted); + } + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, mShifted); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand res = context.VectorZero(); + + int bytes = op.GetBytesCount(); + + int position = op.Imm4 & (bytes - 1); + + for (int index = 0; index < bytes; index++) + { + int reg = op.Imm4 + index < bytes ? op.Rn : op.Rm; + + Operand e = EmitVectorExtractZx(context, reg, position, 0); + + position = (position + 1) & (bytes - 1); + + res = EmitVectorInsert(context, res, e, index, 0); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Fcsel_S(ArmEmitterContext context) + { + OpCodeSimdFcond op = (OpCodeSimdFcond)context.CurrOp; + + Operand lblTrue = Label(); + Operand lblEnd = Label(); + + Operand isTrue = InstEmitFlowHelper.GetCondTrue(context, op.Cond); + + context.BranchIfTrue(lblTrue, isTrue); + + OperandType type = op.Size == 0 ? 
OperandType.FP32 : OperandType.FP64; + + Operand me = context.VectorExtract(type, GetVec(op.Rm), 0); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), me, 0)); + + context.Branch(lblEnd); + + context.MarkLabel(lblTrue); + + Operand ne = context.VectorExtract(type, GetVec(op.Rn), 0); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), ne, 0)); + + context.MarkLabel(lblEnd); + } + + public static void Fmov_Ftoi(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand ne = EmitVectorExtractZx(context, op.Rn, 0, op.Size + 2); + + SetIntOrZR(context, op.Rd, ne); + } + + public static void Fmov_Ftoi1(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand ne = EmitVectorExtractZx(context, op.Rn, 1, 3); + + SetIntOrZR(context, op.Rd, ne); + } + + public static void Fmov_Itof(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetIntOrZR(context, op.Rn); + + context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), n, 0, op.Size + 2)); + } + + public static void Fmov_Itof1(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetIntOrZR(context, op.Rn); + + context.Copy(d, EmitVectorInsert(context, d, n, 1, 3)); + } + + public static void Fmov_S(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + OperandType type = op.Size == 0 ? OperandType.FP32 : OperandType.FP64; + + Operand ne = context.VectorExtract(type, GetVec(op.Rn), 0); + + context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), ne, 0)); + } + + public static void Fmov_Si(ArmEmitterContext context) + { + OpCodeSimdFmov op = (OpCodeSimdFmov)context.CurrOp; + + if (Optimizations.UseSse2) + { + if (op.Size == 0) + { + context.Copy(GetVec(op.Rd), X86GetScalar(context, (int)op.Immediate)); + } + else + { + context.Copy(GetVec(op.Rd), X86GetScalar(context, op.Immediate)); + } + } + else + { + Operand e = Const(op.Immediate); + + Operand res = context.VectorZero(); + + res = EmitVectorInsert(context, res, e, 0, op.Size + 2); + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Fmov_Vi(ArmEmitterContext context) + { + OpCodeSimdImm op = (OpCodeSimdImm)context.CurrOp; + + if (Optimizations.UseSse2) + { + if (op.RegisterSize == RegisterSize.Simd128) + { + context.Copy(GetVec(op.Rd), X86GetAllElements(context, op.Immediate)); + } + else + { + context.Copy(GetVec(op.Rd), X86GetScalar(context, op.Immediate)); + } + } + else + { + Operand e = Const(op.Immediate); + + Operand res = context.VectorZero(); + + int elems = op.RegisterSize == RegisterSize.Simd128 ? 
2 : 1; + + for (int index = 0; index < elems; index++) + { + res = EmitVectorInsert(context, res, e, index, 3); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + public static void Ins_Gp(ArmEmitterContext context) + { + OpCodeSimdIns op = (OpCodeSimdIns)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetIntOrZR(context, op.Rn); + + context.Copy(d, EmitVectorInsert(context, d, n, op.DstIndex, op.Size)); + } + + public static void Ins_V(ArmEmitterContext context) + { + OpCodeSimdIns op = (OpCodeSimdIns)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand ne = EmitVectorExtractZx(context, op.Rn, op.SrcIndex, op.Size); + + context.Copy(d, EmitVectorInsert(context, d, ne, op.DstIndex, op.Size)); + } + + public static void Movi_V(ArmEmitterContext context) + { + if (Optimizations.UseSse2) + { + EmitSse2VectorMoviMvniOp(context, not: false); + } + else + { + EmitVectorImmUnaryOp(context, (op1) => op1); + } + } + + public static void Mvni_V(ArmEmitterContext context) + { + if (Optimizations.UseSse2) + { + EmitSse2VectorMoviMvniOp(context, not: true); + } + else + { + EmitVectorImmUnaryOp(context, (op1) => context.BitwiseNot(op1)); + } + } + + public static void Smov_S(ArmEmitterContext context) + { + OpCodeSimdIns op = (OpCodeSimdIns)context.CurrOp; + + Operand ne = EmitVectorExtractSx(context, op.Rn, op.DstIndex, op.Size); + + if (op.RegisterSize == RegisterSize.Simd64) + { + ne = context.ZeroExtend32(OperandType.I64, ne); + } + + SetIntOrZR(context, op.Rd, ne); + } + + public static void Tbl_V(ArmEmitterContext context) + { + EmitTableVectorLookup(context, isTbl: true); + } + + public static void Tbx_V(ArmEmitterContext context) + { + EmitTableVectorLookup(context, isTbl: false); + } + + public static void Trn1_V(ArmEmitterContext context) + { + EmitVectorTranspose(context, part: 0); + } + + public static void Trn2_V(ArmEmitterContext context) + { + EmitVectorTranspose(context, part: 1); + } + + public static void Umov_S(ArmEmitterContext context) + { + OpCodeSimdIns op = (OpCodeSimdIns)context.CurrOp; + + Operand ne = EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size); + + SetIntOrZR(context, op.Rd, ne); + } + + public static void Uzp1_V(ArmEmitterContext context) + { + EmitVectorUnzip(context, part: 0); + } + + public static void Uzp2_V(ArmEmitterContext context) + { + EmitVectorUnzip(context, part: 1); + } + + public static void Xtn_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + if (Optimizations.UseSsse3) + { + Operand d = GetVec(op.Rd); + + Operand res = context.VectorZeroUpper64(d); + + Operand mask = X86GetAllElements(context, EvenMasks[op.Size]); + + Operand res2 = context.AddIntrinsic(Intrinsic.X86Pshufb, GetVec(op.Rn), mask); + + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 + ? Intrinsic.X86Movlhps + : Intrinsic.X86Movhlps; + + res = context.AddIntrinsic(movInst, res, res2); + + context.Copy(d, res); + } + else + { + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + Operand d = GetVec(op.Rd); + + Operand res = part == 0 ? 
context.VectorZero() : context.Copy(d); + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size + 1); + + res = EmitVectorInsert(context, res, ne, part + index, op.Size); + } + + context.Copy(d, res); + } + } + + public static void Zip1_V(ArmEmitterContext context) + { + EmitVectorZip(context, part: 0); + } + + public static void Zip2_V(ArmEmitterContext context) + { + EmitVectorZip(context, part: 1); + } + + private static void EmitSse2VectorMoviMvniOp(ArmEmitterContext context, bool not) + { + OpCodeSimdImm op = (OpCodeSimdImm)context.CurrOp; + + long imm = op.Immediate; + + switch (op.Size) + { + case 0: imm *= 0x01010101; break; + case 1: imm *= 0x00010001; break; + } + + if (not) + { + imm = ~imm; + } + + Operand mask; + + if (op.Size < 3) + { + mask = X86GetAllElements(context, (int)imm); + } + else + { + mask = X86GetAllElements(context, imm); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + mask = context.VectorZeroUpper64(mask); + } + + context.Copy(GetVec(op.Rd), mask); + } + + private static void EmitTableVectorLookup(ArmEmitterContext context, bool isTbl) + { + OpCodeSimdTbl op = (OpCodeSimdTbl)context.CurrOp; + + if (Optimizations.UseSsse3) + { + Operand d = GetVec(op.Rd); + Operand m = GetVec(op.Rm); + + Operand res; + + Operand mask = X86GetAllElements(context, 0x0F0F0F0F0F0F0F0FL); + + // Fast path for single register table. + { + Operand n = GetVec(op.Rn); + + Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, mask); + mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, m); + + res = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mMask); + } + + for (int index = 1; index < op.Size; index++) + { + Operand ni = GetVec((op.Rn + index) & 0x1F); + + Operand idxMask = X86GetAllElements(context, 0x1010101010101010L * index); + + Operand mSubMask = context.AddIntrinsic(Intrinsic.X86Psubb, m, idxMask); + + Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, mSubMask, mask); + mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, mSubMask); + + Operand res2 = context.AddIntrinsic(Intrinsic.X86Pshufb, ni, mMask); + + res = context.AddIntrinsic(Intrinsic.X86Por, res, res2); + } + + if (!isTbl) + { + Operand idxMask = X86GetAllElements(context, (0x1010101010101010L * op.Size) - 0x0101010101010101L); + Operand zeroMask = context.VectorZero(); + + Operand mPosMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, idxMask); + Operand mNegMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, zeroMask, m); + + Operand mMask = context.AddIntrinsic(Intrinsic.X86Por, mPosMask, mNegMask); + + Operand dMask = context.AddIntrinsic(Intrinsic.X86Pand, d, mMask); + + res = context.AddIntrinsic(Intrinsic.X86Por, res, dMask); + } + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + Operand d = GetVec(op.Rd); + + List<Operand> args = new List<Operand>(); + + if (!isTbl) + { + args.Add(d); + } + + args.Add(GetVec(op.Rm)); + + args.Add(Const(op.RegisterSize == RegisterSize.Simd64 ? 
8 : 16)); + + for (int index = 0; index < op.Size; index++) + { + args.Add(GetVec((op.Rn + index) & 0x1F)); + } + + MethodInfo info = null; + + if (isTbl) + { + switch (op.Size) + { + case 1: info = typeof(SoftFallback).GetMethod(nameof(SoftFallback.Tbl1)); break; + case 2: info = typeof(SoftFallback).GetMethod(nameof(SoftFallback.Tbl2)); break; + case 3: info = typeof(SoftFallback).GetMethod(nameof(SoftFallback.Tbl3)); break; + case 4: info = typeof(SoftFallback).GetMethod(nameof(SoftFallback.Tbl4)); break; + } + } + else + { + switch (op.Size) + { + case 1: info = typeof(SoftFallback).GetMethod(nameof(SoftFallback.Tbx1)); break; + case 2: info = typeof(SoftFallback).GetMethod(nameof(SoftFallback.Tbx2)); break; + case 3: info = typeof(SoftFallback).GetMethod(nameof(SoftFallback.Tbx3)); break; + case 4: info = typeof(SoftFallback).GetMethod(nameof(SoftFallback.Tbx4)); break; + } + } + + context.Copy(d, context.Call(info, args.ToArray())); + } + } + + private static void EmitVectorTranspose(ArmEmitterContext context, int part) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseSsse3) + { + Operand mask = default; + + if (op.Size < 3) + { + long maskE0 = EvenMasks[op.Size]; + long maskE1 = OddMasks [op.Size]; + + mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + } + + Operand n = GetVec(op.Rn); + + if (op.Size < 3) + { + n = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mask); + } + + Operand m = GetVec(op.Rm); + + if (op.Size < 3) + { + m = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mask); + } + + Intrinsic punpckInst = part == 0 + ? X86PunpcklInstruction[op.Size] + : X86PunpckhInstruction[op.Size]; + + Operand res = context.AddIntrinsic(punpckInst, n, m); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand res = context.VectorZero(); + + int pairs = op.GetPairsCount() >> op.Size; + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + + Operand ne = EmitVectorExtractZx(context, op.Rn, pairIndex + part, op.Size); + Operand me = EmitVectorExtractZx(context, op.Rm, pairIndex + part, op.Size); + + res = EmitVectorInsert(context, res, ne, pairIndex, op.Size); + res = EmitVectorInsert(context, res, me, pairIndex + 1, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + private static void EmitVectorUnzip(ArmEmitterContext context, int part) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseSsse3) + { + if (op.RegisterSize == RegisterSize.Simd128) + { + Operand mask = default; + + if (op.Size < 3) + { + long maskE0 = EvenMasks[op.Size]; + long maskE1 = OddMasks [op.Size]; + + mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + } + + Operand n = GetVec(op.Rn); + + if (op.Size < 3) + { + n = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mask); + } + + Operand m = GetVec(op.Rm); + + if (op.Size < 3) + { + m = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mask); + } + + Intrinsic punpckInst = part == 0 + ? 
Intrinsic.X86Punpcklqdq + : Intrinsic.X86Punpckhqdq; + + Operand res = context.AddIntrinsic(punpckInst, n, m); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Intrinsic punpcklInst = X86PunpcklInstruction[op.Size]; + + Operand res = context.AddIntrinsic(punpcklInst, n, m); + + if (op.Size < 2) + { + long maskE0 = _masksE0_Uzp[op.Size]; + long maskE1 = _masksE1_Uzp[op.Size]; + + Operand mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + + res = context.AddIntrinsic(Intrinsic.X86Pshufb, res, mask); + } + + Intrinsic punpckInst = part == 0 + ? Intrinsic.X86Punpcklqdq + : Intrinsic.X86Punpckhqdq; + + res = context.AddIntrinsic(punpckInst, res, context.VectorZero()); + + context.Copy(GetVec(op.Rd), res); + } + } + else + { + Operand res = context.VectorZero(); + + int pairs = op.GetPairsCount() >> op.Size; + + for (int index = 0; index < pairs; index++) + { + int idx = index << 1; + + Operand ne = EmitVectorExtractZx(context, op.Rn, idx + part, op.Size); + Operand me = EmitVectorExtractZx(context, op.Rm, idx + part, op.Size); + + res = EmitVectorInsert(context, res, ne, index, op.Size); + res = EmitVectorInsert(context, res, me, pairs + index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + private static void EmitVectorZip(ArmEmitterContext context, int part) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + if (Optimizations.UseSse2) + { + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + if (op.RegisterSize == RegisterSize.Simd128) + { + Intrinsic punpckInst = part == 0 + ? X86PunpcklInstruction[op.Size] + : X86PunpckhInstruction[op.Size]; + + Operand res = context.AddIntrinsic(punpckInst, n, m); + + context.Copy(GetVec(op.Rd), res); + } + else + { + Operand res = context.AddIntrinsic(X86PunpcklInstruction[op.Size], n, m); + + Intrinsic punpckInst = part == 0 + ? Intrinsic.X86Punpcklqdq + : Intrinsic.X86Punpckhqdq; + + res = context.AddIntrinsic(punpckInst, res, context.VectorZero()); + + context.Copy(GetVec(op.Rd), res); + } + } + else + { + Operand res = context.VectorZero(); + + int pairs = op.GetPairsCount() >> op.Size; + + int baseIndex = part != 0 ? pairs : 0; + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + + Operand ne = EmitVectorExtractZx(context, op.Rn, baseIndex + index, op.Size); + Operand me = EmitVectorExtractZx(context, op.Rm, baseIndex + index, op.Size); + + res = EmitVectorInsert(context, res, ne, pairIndex, op.Size); + res = EmitVectorInsert(context, res, me, pairIndex + 1, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdMove32.cs b/src/ARMeilleure/Instructions/InstEmitSimdMove32.cs new file mode 100644 index 00000000..b8b91b31 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdMove32.cs @@ -0,0 +1,656 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper32; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + #region "Masks" + // Same as InstEmitSimdMove, as the instructions do the same thing. 
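+ // Descriptive note on the masks below: they are PSHUFB byte-index masks for the SSSE3 VUZP path.
+ // After PUNPCKL interleaves the two inputs, shuffling with E0/E1 gathers the even-indexed
+ // elements into the low quadword and the odd-indexed elements into the high quadword, so the
+ // final PUNPCKLQDQ/PUNPCKHQDQ only has to pick the wanted half. For byte elements, for example,
+ // the E0 mask selects source bytes 0, 4, 8, 12, 1, 5, 9, 13 and the E1 mask selects
+ // 2, 6, 10, 14, 3, 7, 11, 15.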
+ private static readonly long[] _masksE0_Uzp = new long[] + { + 13L << 56 | 09L << 48 | 05L << 40 | 01L << 32 | 12L << 24 | 08L << 16 | 04L << 8 | 00L << 0, + 11L << 56 | 10L << 48 | 03L << 40 | 02L << 32 | 09L << 24 | 08L << 16 | 01L << 8 | 00L << 0 + }; + + private static readonly long[] _masksE1_Uzp = new long[] + { + 15L << 56 | 11L << 48 | 07L << 40 | 03L << 32 | 14L << 24 | 10L << 16 | 06L << 8 | 02L << 0, + 15L << 56 | 14L << 48 | 07L << 40 | 06L << 32 | 13L << 24 | 12L << 16 | 05L << 8 | 04L << 0 + }; + #endregion + + public static void Vmov_I(ArmEmitterContext context) + { + EmitVectorImmUnaryOp32(context, (op1) => op1); + } + + public static void Vmvn_I(ArmEmitterContext context) + { + if (Optimizations.UseAvx512Ortho) + { + EmitVectorUnaryOpSimd32(context, (op1) => + { + return context.AddIntrinsic(Intrinsic.X86Vpternlogd, op1, op1, Const(0b01010101)); + }); + } + else if (Optimizations.UseSse2) + { + EmitVectorUnaryOpSimd32(context, (op1) => + { + Operand mask = X86GetAllElements(context, -1L); + return context.AddIntrinsic(Intrinsic.X86Pandn, op1, mask); + }); + } + else + { + EmitVectorUnaryOpZx32(context, (op1) => context.BitwiseNot(op1)); + } + } + + public static void Vmvn_II(ArmEmitterContext context) + { + EmitVectorImmUnaryOp32(context, (op1) => context.BitwiseNot(op1)); + } + + public static void Vmov_GS(ArmEmitterContext context) + { + OpCode32SimdMovGp op = (OpCode32SimdMovGp)context.CurrOp; + + Operand vec = GetVecA32(op.Vn >> 2); + if (op.Op == 1) + { + // To general purpose. + Operand value = context.VectorExtract(OperandType.I32, vec, op.Vn & 0x3); + SetIntA32(context, op.Rt, value); + } + else + { + // From general purpose. + Operand value = GetIntA32(context, op.Rt); + context.Copy(vec, context.VectorInsert(vec, value, op.Vn & 0x3)); + } + } + + public static void Vmov_G1(ArmEmitterContext context) + { + OpCode32SimdMovGpElem op = (OpCode32SimdMovGpElem)context.CurrOp; + + int index = op.Index + ((op.Vd & 1) << (3 - op.Size)); + if (op.Op == 1) + { + // To general purpose. + Operand value = EmitVectorExtract32(context, op.Vd >> 1, index, op.Size, !op.U); + SetIntA32(context, op.Rt, value); + } + else + { + // From general purpose. + Operand vec = GetVecA32(op.Vd >> 1); + Operand value = GetIntA32(context, op.Rt); + context.Copy(vec, EmitVectorInsert(context, vec, value, index, op.Size)); + } + } + + public static void Vmov_G2(ArmEmitterContext context) + { + OpCode32SimdMovGpDouble op = (OpCode32SimdMovGpDouble)context.CurrOp; + + Operand vec = GetVecA32(op.Vm >> 2); + int vm1 = op.Vm + 1; + bool sameOwnerVec = (op.Vm >> 2) == (vm1 >> 2); + Operand vec2 = sameOwnerVec ? vec : GetVecA32(vm1 >> 2); + if (op.Op == 1) + { + // To general purpose. + Operand lowValue = context.VectorExtract(OperandType.I32, vec, op.Vm & 3); + SetIntA32(context, op.Rt, lowValue); + + Operand highValue = context.VectorExtract(OperandType.I32, vec2, vm1 & 3); + SetIntA32(context, op.Rt2, highValue); + } + else + { + // From general purpose. 
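+ // Rt is inserted into word (Vm & 3) and Rt2 into word ((Vm + 1) & 3). When both words live in
+ // the same quadword vector, the two inserts are chained on a single operand before the copy;
+ // otherwise each owning vector is updated separately.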
+ Operand lowValue = GetIntA32(context, op.Rt); + Operand resultVec = context.VectorInsert(vec, lowValue, op.Vm & 3); + + Operand highValue = GetIntA32(context, op.Rt2); + + if (sameOwnerVec) + { + context.Copy(vec, context.VectorInsert(resultVec, highValue, vm1 & 3)); + } + else + { + context.Copy(vec, resultVec); + context.Copy(vec2, context.VectorInsert(vec2, highValue, vm1 & 3)); + } + } + } + + public static void Vmov_GD(ArmEmitterContext context) + { + OpCode32SimdMovGpDouble op = (OpCode32SimdMovGpDouble)context.CurrOp; + + Operand vec = GetVecA32(op.Vm >> 1); + if (op.Op == 1) + { + // To general purpose. + Operand value = context.VectorExtract(OperandType.I64, vec, op.Vm & 1); + SetIntA32(context, op.Rt, context.ConvertI64ToI32(value)); + SetIntA32(context, op.Rt2, context.ConvertI64ToI32(context.ShiftRightUI(value, Const(32)))); + } + else + { + // From general purpose. + Operand lowValue = GetIntA32(context, op.Rt); + Operand highValue = GetIntA32(context, op.Rt2); + + Operand value = context.BitwiseOr( + context.ZeroExtend32(OperandType.I64, lowValue), + context.ShiftLeft(context.ZeroExtend32(OperandType.I64, highValue), Const(32))); + + context.Copy(vec, context.VectorInsert(vec, value, op.Vm & 1)); + } + } + + public static void Vmovl(ArmEmitterContext context) + { + OpCode32SimdLong op = (OpCode32SimdLong)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, !op.U); + + if (op.Size == 2) + { + if (op.U) + { + me = context.ZeroExtend32(OperandType.I64, me); + } + else + { + me = context.SignExtend32(OperandType.I64, me); + } + } + + res = EmitVectorInsert(context, res, me, index, op.Size + 1); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Vtbl(ArmEmitterContext context) + { + OpCode32SimdTbl op = (OpCode32SimdTbl)context.CurrOp; + + bool extension = op.Opc == 1; + int length = op.Length + 1; + + if (Optimizations.UseSsse3) + { + Operand d = GetVecA32(op.Qd); + Operand m = EmitMoveDoubleWordToSide(context, GetVecA32(op.Qm), op.Vm, 0); + + Operand res; + Operand mask = X86GetAllElements(context, 0x0707070707070707L); + + // Fast path for single register table. 
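+ // PSHUFB-based emulation: index bytes above 7 (one D register's worth of bytes) end up with
+ // bit 7 set, either from the PCMPGTB result or because the index byte itself is negative, and
+ // PSHUFB then writes zero for those lanes. Each additional table register subtracts a further
+ // 8 from the indices with PSUBB before the same trick is applied, and the partial results are
+ // ORed together.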
+ { + Operand n = EmitMoveDoubleWordToSide(context, GetVecA32(op.Qn), op.Vn, 0); + + Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, mask); + mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, m); + + res = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mMask); + } + + for (int index = 1; index < length; index++) + { + int newVn = (op.Vn + index) & 0x1F; + (int qn, int ind) = GetQuadwordAndSubindex(newVn, op.RegisterSize); + Operand ni = EmitMoveDoubleWordToSide(context, GetVecA32(qn), newVn, 0); + + Operand idxMask = X86GetAllElements(context, 0x0808080808080808L * index); + + Operand mSubMask = context.AddIntrinsic(Intrinsic.X86Psubb, m, idxMask); + + Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, mSubMask, mask); + mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, mSubMask); + + Operand res2 = context.AddIntrinsic(Intrinsic.X86Pshufb, ni, mMask); + + res = context.AddIntrinsic(Intrinsic.X86Por, res, res2); + } + + if (extension) + { + Operand idxMask = X86GetAllElements(context, (0x0808080808080808L * length) - 0x0101010101010101L); + Operand zeroMask = context.VectorZero(); + + Operand mPosMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, idxMask); + Operand mNegMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, zeroMask, m); + + Operand mMask = context.AddIntrinsic(Intrinsic.X86Por, mPosMask, mNegMask); + + Operand dMask = context.AddIntrinsic(Intrinsic.X86Pand, EmitMoveDoubleWordToSide(context, d, op.Vd, 0), mMask); + + res = context.AddIntrinsic(Intrinsic.X86Por, res, dMask); + } + + res = EmitMoveDoubleWordToSide(context, res, 0, op.Vd); + + context.Copy(d, EmitDoubleWordInsert(context, d, res, op.Vd)); + } + else + { + int elems = op.GetBytesCount() >> op.Size; + + (int Qx, int Ix)[] tableTuples = new (int, int)[length]; + for (int i = 0; i < length; i++) + { + tableTuples[i] = GetQuadwordAndSubindex(op.Vn + i, op.RegisterSize); + } + + int byteLength = length * 8; + + Operand res = GetVecA32(op.Qd); + Operand m = GetVecA32(op.Qm); + + for (int index = 0; index < elems; index++) + { + Operand selectedIndex = context.ZeroExtend8(OperandType.I32, context.VectorExtract8(m, index + op.Im)); + + Operand inRange = context.ICompareLess(selectedIndex, Const(byteLength)); + Operand elemRes = default; // Note: This is I64 for ease of calculation. + + // TODO: Branching rather than conditional select. + + // Get indexed byte. + // To simplify (ha) the il, we get bytes from every vector and use a nested conditional select to choose the right result. + // This does have to extract `length` times for every element but certainly not as bad as it could be. + + // Which vector number is the index on. + Operand vecIndex = context.ShiftRightUI(selectedIndex, Const(3)); + // What should we shift by to extract it. + Operand subVecIndexShift = context.ShiftLeft(context.BitwiseAnd(selectedIndex, Const(7)), Const(3)); + + for (int i = 0; i < length; i++) + { + (int qx, int ix) = tableTuples[i]; + // Get the whole vector, we'll get a byte out of it. + Operand lookupResult; + if (qx == op.Qd) + { + // Result contains the current state of the vector. + lookupResult = context.VectorExtract(OperandType.I64, res, ix); + } + else + { + lookupResult = EmitVectorExtract32(context, qx, ix, 3, false); // I64 + } + + lookupResult = context.ShiftRightUI(lookupResult, subVecIndexShift); // Get the relevant byte from this vector. + + if (i == 0) + { + elemRes = lookupResult; // First result is always default. 
+ } + else + { + Operand isThisElem = context.ICompareEqual(vecIndex, Const(i)); + elemRes = context.ConditionalSelect(isThisElem, lookupResult, elemRes); + } + } + + Operand fallback = (extension) ? context.ZeroExtend32(OperandType.I64, EmitVectorExtract32(context, op.Qd, index + op.Id, 0, false)) : Const(0L); + + res = EmitVectorInsert(context, res, context.ConditionalSelect(inRange, elemRes, fallback), index + op.Id, 0); + } + + context.Copy(GetVecA32(op.Qd), res); + } + } + + public static void Vtrn(ArmEmitterContext context) + { + OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp; + + if (Optimizations.UseSsse3) + { + EmitVectorShuffleOpSimd32(context, (m, d) => + { + Operand mask = default; + + if (op.Size < 3) + { + long maskE0 = EvenMasks[op.Size]; + long maskE1 = OddMasks[op.Size]; + + mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + } + + if (op.Size < 3) + { + d = context.AddIntrinsic(Intrinsic.X86Pshufb, d, mask); + m = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mask); + } + + Operand resD = context.AddIntrinsic(X86PunpcklInstruction[op.Size], d, m); + Operand resM = context.AddIntrinsic(X86PunpckhInstruction[op.Size], d, m); + + return (resM, resD); + }); + } + else + { + int elems = op.GetBytesCount() >> op.Size; + int pairs = elems >> 1; + + bool overlap = op.Qm == op.Qd; + + Operand resD = GetVecA32(op.Qd); + Operand resM = GetVecA32(op.Qm); + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + Operand d2 = EmitVectorExtract32(context, op.Qd, pairIndex + 1 + op.Id, op.Size, false); + Operand m1 = EmitVectorExtract32(context, op.Qm, pairIndex + op.Im, op.Size, false); + + resD = EmitVectorInsert(context, resD, m1, pairIndex + 1 + op.Id, op.Size); + + if (overlap) + { + resM = resD; + } + + resM = EmitVectorInsert(context, resM, d2, pairIndex + op.Im, op.Size); + + if (overlap) + { + resD = resM; + } + } + + context.Copy(GetVecA32(op.Qd), resD); + if (!overlap) + { + context.Copy(GetVecA32(op.Qm), resM); + } + } + } + + public static void Vzip(ArmEmitterContext context) + { + OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + EmitVectorZipUzpOpSimd32(context, Intrinsic.Arm64Zip1V, Intrinsic.Arm64Zip2V); + } + else if (Optimizations.UseSse2) + { + EmitVectorShuffleOpSimd32(context, (m, d) => + { + if (op.RegisterSize == RegisterSize.Simd128) + { + Operand resD = context.AddIntrinsic(X86PunpcklInstruction[op.Size], d, m); + Operand resM = context.AddIntrinsic(X86PunpckhInstruction[op.Size], d, m); + + return (resM, resD); + } + else + { + Operand res = context.AddIntrinsic(X86PunpcklInstruction[op.Size], d, m); + + Operand resD = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, res, context.VectorZero()); + Operand resM = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, res, context.VectorZero()); + return (resM, resD); + } + }); + } + else + { + int elems = op.GetBytesCount() >> op.Size; + int pairs = elems >> 1; + + bool overlap = op.Qm == op.Qd; + + Operand resD = GetVecA32(op.Qd); + Operand resM = GetVecA32(op.Qm); + + for (int index = 0; index < pairs; index++) + { + int pairIndex = index << 1; + Operand dRowD = EmitVectorExtract32(context, op.Qd, index + op.Id, op.Size, false); + Operand mRowD = EmitVectorExtract32(context, op.Qm, index + op.Im, op.Size, false); + + Operand dRowM = EmitVectorExtract32(context, op.Qd, index + op.Id + pairs, op.Size, false); + Operand mRowM = EmitVectorExtract32(context, op.Qm, index + op.Im + pairs, op.Size, 
false); + + resD = EmitVectorInsert(context, resD, dRowD, pairIndex + op.Id, op.Size); + resD = EmitVectorInsert(context, resD, mRowD, pairIndex + 1 + op.Id, op.Size); + + if (overlap) + { + resM = resD; + } + + resM = EmitVectorInsert(context, resM, dRowM, pairIndex + op.Im, op.Size); + resM = EmitVectorInsert(context, resM, mRowM, pairIndex + 1 + op.Im, op.Size); + + if (overlap) + { + resD = resM; + } + } + + context.Copy(GetVecA32(op.Qd), resD); + if (!overlap) + { + context.Copy(GetVecA32(op.Qm), resM); + } + } + } + + public static void Vuzp(ArmEmitterContext context) + { + OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + EmitVectorZipUzpOpSimd32(context, Intrinsic.Arm64Uzp1V, Intrinsic.Arm64Uzp2V); + } + else if (Optimizations.UseSsse3) + { + EmitVectorShuffleOpSimd32(context, (m, d) => + { + if (op.RegisterSize == RegisterSize.Simd128) + { + Operand mask = default; + + if (op.Size < 3) + { + long maskE0 = EvenMasks[op.Size]; + long maskE1 = OddMasks[op.Size]; + + mask = X86GetScalar(context, maskE0); + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + + d = context.AddIntrinsic(Intrinsic.X86Pshufb, d, mask); + m = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mask); + } + + Operand resD = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, d, m); + Operand resM = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, d, m); + + return (resM, resD); + } + else + { + Intrinsic punpcklInst = X86PunpcklInstruction[op.Size]; + + Operand res = context.AddIntrinsic(punpcklInst, d, m); + + if (op.Size < 2) + { + long maskE0 = _masksE0_Uzp[op.Size]; + long maskE1 = _masksE1_Uzp[op.Size]; + + Operand mask = X86GetScalar(context, maskE0); + + mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3); + + res = context.AddIntrinsic(Intrinsic.X86Pshufb, res, mask); + } + + Operand resD = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, res, context.VectorZero()); + Operand resM = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, res, context.VectorZero()); + + return (resM, resD); + } + }); + } + else + { + int elems = op.GetBytesCount() >> op.Size; + int pairs = elems >> 1; + + bool overlap = op.Qm == op.Qd; + + Operand resD = GetVecA32(op.Qd); + Operand resM = GetVecA32(op.Qm); + + for (int index = 0; index < elems; index++) + { + Operand dIns, mIns; + if (index >= pairs) + { + int pairIndex = index - pairs; + dIns = EmitVectorExtract32(context, op.Qm, (pairIndex << 1) + op.Im, op.Size, false); + mIns = EmitVectorExtract32(context, op.Qm, ((pairIndex << 1) | 1) + op.Im, op.Size, false); + } + else + { + dIns = EmitVectorExtract32(context, op.Qd, (index << 1) + op.Id, op.Size, false); + mIns = EmitVectorExtract32(context, op.Qd, ((index << 1) | 1) + op.Id, op.Size, false); + } + + resD = EmitVectorInsert(context, resD, dIns, index + op.Id, op.Size); + + if (overlap) + { + resM = resD; + } + + resM = EmitVectorInsert(context, resM, mIns, index + op.Im, op.Size); + + if (overlap) + { + resD = resM; + } + } + + context.Copy(GetVecA32(op.Qd), resD); + if (!overlap) + { + context.Copy(GetVecA32(op.Qm), resM); + } + } + } + + private static void EmitVectorZipUzpOpSimd32(ArmEmitterContext context, Intrinsic inst1, Intrinsic inst2) + { + OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp; + + bool overlap = op.Qm == op.Qd; + + Operand d = GetVecA32(op.Qd); + Operand m = GetVecA32(op.Qm); + + Operand dPart = d; + Operand mPart = m; + + if (!op.Q) // Register swap: move relevant doubleword to destination side. 
+ { + dPart = InstEmitSimdHelper32Arm64.EmitMoveDoubleWordToSide(context, d, op.Vd, 0); + mPart = InstEmitSimdHelper32Arm64.EmitMoveDoubleWordToSide(context, m, op.Vm, 0); + } + + Intrinsic vSize = op.Q ? Intrinsic.Arm64V128 : Intrinsic.Arm64V64; + + vSize |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift); + + Operand resD = context.AddIntrinsic(inst1 | vSize, dPart, mPart); + Operand resM = context.AddIntrinsic(inst2 | vSize, dPart, mPart); + + if (!op.Q) // Register insert. + { + resD = context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VDWord, d, Const(op.Vd & 1), resD, Const(0)); + + if (overlap) + { + resD = context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VDWord, resD, Const(op.Vm & 1), resM, Const(0)); + } + else + { + resM = context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VDWord, m, Const(op.Vm & 1), resM, Const(0)); + } + } + + context.Copy(d, resD); + if (!overlap) + { + context.Copy(m, resM); + } + } + + private static void EmitVectorShuffleOpSimd32(ArmEmitterContext context, Func<Operand, Operand, (Operand, Operand)> shuffleFunc) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Operand m = GetVecA32(op.Qm); + Operand d = GetVecA32(op.Qd); + Operand initialM = m; + Operand initialD = d; + + if (!op.Q) // Register swap: move relevant doubleword to side 0, for consistency. + { + m = EmitMoveDoubleWordToSide(context, m, op.Vm, 0); + d = EmitMoveDoubleWordToSide(context, d, op.Vd, 0); + } + + (Operand resM, Operand resD) = shuffleFunc(m, d); + + bool overlap = op.Qm == op.Qd; + + if (!op.Q) // Register insert. + { + resM = EmitDoubleWordInsert(context, initialM, EmitMoveDoubleWordToSide(context, resM, 0, op.Vm), op.Vm); + resD = EmitDoubleWordInsert(context, overlap ? resM : initialD, EmitMoveDoubleWordToSide(context, resD, 0, op.Vd), op.Vd); + } + + if (!overlap) + { + context.Copy(initialM, resM); + } + + context.Copy(initialD, resD); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdShift.cs b/src/ARMeilleure/Instructions/InstEmitSimdShift.cs new file mode 100644 index 00000000..19e41119 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdShift.cs @@ -0,0 +1,1827 @@ +// https://github.com/intel/ARM_NEON_2_x86_SSE/blob/master/NEON_2_SSE.h + +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + using Func2I = Func<Operand, Operand, Operand>; + + static partial class InstEmit + { +#region "Masks" + private static readonly long[] _masks_SliSri = new long[] // Replication masks. 
+ { + 0x0101010101010101L, 0x0001000100010001L, 0x0000000100000001L, 0x0000000000000001L + }; +#endregion + + public static void Rshrn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftTernaryOpRd(context, Intrinsic.Arm64RshrnV, shift); + } + else if (Optimizations.UseSsse3) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + long roundConst = 1L << (shift - 1); + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Operand dLow = context.VectorZeroUpper64(d); + + Operand mask = default; + + switch (op.Size + 1) + { + case 1: mask = X86GetAllElements(context, (int)roundConst * 0x00010001); break; + case 2: mask = X86GetAllElements(context, (int)roundConst); break; + case 3: mask = X86GetAllElements(context, roundConst); break; + } + + Intrinsic addInst = X86PaddInstruction[op.Size + 1]; + + Operand res = context.AddIntrinsic(addInst, n, mask); + + Intrinsic srlInst = X86PsrlInstruction[op.Size + 1]; + + res = context.AddIntrinsic(srlInst, res, Const(shift)); + + Operand mask2 = X86GetAllElements(context, EvenMasks[op.Size]); + + res = context.AddIntrinsic(Intrinsic.X86Pshufb, res, mask2); + + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 + ? Intrinsic.X86Movlhps + : Intrinsic.X86Movhlps; + + res = context.AddIntrinsic(movInst, dLow, res); + + context.Copy(d, res); + } + else + { + EmitVectorShrImmNarrowOpZx(context, round: true); + } + } + + public static void Shl_S(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShl(op); + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarShiftBinaryOp(context, Intrinsic.Arm64ShlS, shift); + } + else + { + EmitScalarUnaryOpZx(context, (op1) => context.ShiftLeft(op1, Const(shift))); + } + } + + public static void Shl_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShl(op); + int eSize = 8 << op.Size; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorShiftBinaryOp(context, Intrinsic.Arm64ShlV, shift); + } + else if (shift >= eSize) + { + if ((op.RegisterSize == RegisterSize.Simd64)) + { + Operand res = context.VectorZeroUpper64(GetVec(op.Rd)); + + context.Copy(GetVec(op.Rd), res); + } + } + else if (Optimizations.UseGfni && op.Size == 0) + { + Operand n = GetVec(op.Rn); + + ulong bitMatrix = X86GetGf2p8LogicalShiftLeft(shift); + + Operand vBitMatrix = X86GetElements(context, bitMatrix, bitMatrix); + + Operand res = context.AddIntrinsic(Intrinsic.X86Gf2p8affineqb, n, vBitMatrix, Const(0)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + Operand n = GetVec(op.Rn); + + Intrinsic sllInst = X86PsllInstruction[op.Size]; + + Operand res = context.AddIntrinsic(sllInst, n, Const(shift)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorUnaryOpZx(context, (op1) => context.ShiftLeft(op1, Const(shift))); + } + } + + public static void Shll_V(ArmEmitterContext context) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + int shift = 8 << op.Size; + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, 
Intrinsic.Arm64ShllV); + } + else if (Optimizations.UseSse41) + { + Operand n = GetVec(op.Rn); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + } + + Intrinsic movsxInst = X86PmovsxInstruction[op.Size]; + + Operand res = context.AddIntrinsic(movsxInst, n); + + Intrinsic sllInst = X86PsllInstruction[op.Size + 1]; + + res = context.AddIntrinsic(sllInst, res, Const(shift)); + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorShImmWidenBinaryZx(context, (op1, op2) => context.ShiftLeft(op1, op2), shift); + } + } + + public static void Shrn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftTernaryOpRd(context, Intrinsic.Arm64ShrnV, shift); + } + else if (Optimizations.UseSsse3) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Operand dLow = context.VectorZeroUpper64(d); + + Intrinsic srlInst = X86PsrlInstruction[op.Size + 1]; + + Operand nShifted = context.AddIntrinsic(srlInst, n, Const(shift)); + + Operand mask = X86GetAllElements(context, EvenMasks[op.Size]); + + Operand res = context.AddIntrinsic(Intrinsic.X86Pshufb, nShifted, mask); + + Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 + ? Intrinsic.X86Movlhps + : Intrinsic.X86Movhlps; + + res = context.AddIntrinsic(movInst, dLow, res); + + context.Copy(d, res); + } + else + { + EmitVectorShrImmNarrowOpZx(context, round: false); + } + } + + public static void Sli_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShl(op); + + InstEmitSimdHelperArm64.EmitScalarShiftTernaryOpRd(context, Intrinsic.Arm64SliS, shift); + } + else + { + EmitSli(context, scalar: true); + } + } + + public static void Sli_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShl(op); + + InstEmitSimdHelperArm64.EmitVectorShiftTernaryOpRd(context, Intrinsic.Arm64SliV, shift); + } + else + { + EmitSli(context, scalar: false); + } + } + + public static void Sqrshl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqrshlV); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Signed | ShlRegFlags.Round | ShlRegFlags.Saturating); + } + } + + public static void Sqrshrn_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64SqrshrnS, shift); + } + else + { + EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx); + } + } + + public static void Sqrshrn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64SqrshrnV, shift); + } + else + { + EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxSx); + } + } + + public static void Sqrshrun_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = 
(OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64SqrshrunS, shift); + } + else + { + EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxZx); + } + } + + public static void Sqrshrun_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64SqrshrunV, shift); + } + else + { + EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx); + } + } + + public static void Sqshl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqshlV); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Signed | ShlRegFlags.Saturating); + } + } + + public static void Sqshrn_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64SqshrnS, shift); + } + else + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx); + } + } + + public static void Sqshrn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64SqshrnV, shift); + } + else + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxSx); + } + } + + public static void Sqshrun_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64SqshrunS, shift); + } + else + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxZx); + } + } + + public static void Sqshrun_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64SqshrunV, shift); + } + else + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx); + } + } + + public static void Sri_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftTernaryOpRd(context, Intrinsic.Arm64SriS, shift); + } + else + { + EmitSri(context, scalar: true); + } + } + + public static void Sri_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftTernaryOpRd(context, Intrinsic.Arm64SriV, shift); + } + else + { + EmitSri(context, scalar: false); + } + } + + public static void Srshl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SrshlV); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Signed | ShlRegFlags.Round); + } + } + + public static void Srshr_S(ArmEmitterContext context) + { + if 
(Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftBinaryOp(context, Intrinsic.Arm64SrshrS, shift); + } + else + { + EmitScalarShrImmOpSx(context, ShrImmFlags.Round); + } + } + + public static void Srshr_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftBinaryOp(context, Intrinsic.Arm64SrshrV, shift); + } + else if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3) + { + int shift = GetImmShr(op); + int eSize = 8 << op.Size; + + Operand n = GetVec(op.Rn); + + Intrinsic sllInst = X86PsllInstruction[op.Size]; + + Operand res = context.AddIntrinsic(sllInst, n, Const(eSize - shift)); + + Intrinsic srlInst = X86PsrlInstruction[op.Size]; + + res = context.AddIntrinsic(srlInst, res, Const(eSize - 1)); + + Intrinsic sraInst = X86PsraInstruction[op.Size]; + + Operand nSra = context.AddIntrinsic(sraInst, n, Const(shift)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, res, nSra); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorShrImmOpSx(context, ShrImmFlags.Round); + } + } + + public static void Srsra_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftTernaryOpRd(context, Intrinsic.Arm64SrsraS, shift); + } + else + { + EmitScalarShrImmOpSx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate); + } + } + + public static void Srsra_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftTernaryOpRd(context, Intrinsic.Arm64SrsraV, shift); + } + else if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3) + { + int shift = GetImmShr(op); + int eSize = 8 << op.Size; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic sllInst = X86PsllInstruction[op.Size]; + + Operand res = context.AddIntrinsic(sllInst, n, Const(eSize - shift)); + + Intrinsic srlInst = X86PsrlInstruction[op.Size]; + + res = context.AddIntrinsic(srlInst, res, Const(eSize - 1)); + + Intrinsic sraInst = X86PsraInstruction[op.Size]; + + Operand nSra = context.AddIntrinsic(sraInst, n, Const(shift)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, res, nSra); + res = context.AddIntrinsic(addInst, res, d); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + EmitVectorShrImmOpSx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate); + } + } + + public static void Sshl_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOp(context, Intrinsic.Arm64SshlS); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Scalar | ShlRegFlags.Signed); + } + } + + public static void Sshl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SshlV); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Signed); + } + } + + public static void Sshll_V(ArmEmitterContext context) + { 
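+ // SSHLL/SSHLL2: sign-extend each element of the low half (or, for the "2" form with a 128-bit
+ // source, the high half) to twice its width, then shift left by the immediate. The SSE4.1 path
+ // below moves the high half down with PSRLDQ, widens with PMOVSX, and shifts with PSLL when the
+ // immediate is non-zero.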
+ OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShl(op); + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorShiftBinaryOp(context, Intrinsic.Arm64SshllV, shift); + } + else if (Optimizations.UseSse41) + { + Operand n = GetVec(op.Rn); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + } + + Intrinsic movsxInst = X86PmovsxInstruction[op.Size]; + + Operand res = context.AddIntrinsic(movsxInst, n); + + if (shift != 0) + { + Intrinsic sllInst = X86PsllInstruction[op.Size + 1]; + + res = context.AddIntrinsic(sllInst, res, Const(shift)); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorShImmWidenBinarySx(context, (op1, op2) => context.ShiftLeft(op1, op2), shift); + } + } + + public static void Sshr_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftBinaryOp(context, Intrinsic.Arm64SshrS, shift); + } + else + { + EmitShrImmOp(context, ShrImmFlags.ScalarSx); + } + } + + public static void Sshr_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorShiftBinaryOp(context, Intrinsic.Arm64SshrV, shift); + } + else if (Optimizations.UseGfni && op.Size == 0) + { + Operand n = GetVec(op.Rn); + + ulong bitMatrix; + + if (shift < 8) + { + bitMatrix = X86GetGf2p8LogicalShiftLeft(-shift); + + // Extend sign-bit + bitMatrix |= 0x8080808080808080UL >> (64 - shift * 8); + } + else + { + // Replicate sign-bit into all bits + bitMatrix = 0x8080808080808080UL; + } + + Operand vBitMatrix = X86GetElements(context, bitMatrix, bitMatrix); + + Operand res = context.AddIntrinsic(Intrinsic.X86Gf2p8affineqb, n, vBitMatrix, Const(0)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3) + { + Operand n = GetVec(op.Rn); + + Intrinsic sraInst = X86PsraInstruction[op.Size]; + + Operand res = context.AddIntrinsic(sraInst, n, Const(shift)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitShrImmOp(context, ShrImmFlags.VectorSx); + } + } + + public static void Ssra_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftTernaryOpRd(context, Intrinsic.Arm64SsraS, shift); + } + else + { + EmitScalarShrImmOpSx(context, ShrImmFlags.Accumulate); + } + } + + public static void Ssra_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftTernaryOpRd(context, Intrinsic.Arm64SsraV, shift); + } + else if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3) + { + int shift = GetImmShr(op); + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic sraInst = X86PsraInstruction[op.Size]; + + Operand res = context.AddIntrinsic(sraInst, n, Const(shift)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, res, d); + + if (op.RegisterSize == 
RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + EmitVectorShrImmOpSx(context, ShrImmFlags.Accumulate); + } + } + + public static void Uqrshl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64UqrshlV); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Round | ShlRegFlags.Saturating); + } + } + + public static void Uqrshrn_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64UqrshrnS, shift); + } + else + { + EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx); + } + } + + public static void Uqrshrn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64UqrshrnV, shift); + } + else + { + EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx); + } + } + + public static void Uqshl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64UqshlV); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Saturating); + } + } + + public static void Uqshrn_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64UqshrnS, shift); + } + else + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx); + } + } + + public static void Uqshrn_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorSaturatingShiftTernaryOpRd(context, Intrinsic.Arm64UqshrnV, shift); + } + else + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx); + } + } + + public static void Urshl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UrshlV); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Round); + } + } + + public static void Urshr_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftBinaryOp(context, Intrinsic.Arm64UrshrS, shift); + } + else + { + EmitScalarShrImmOpZx(context, ShrImmFlags.Round); + } + } + + public static void Urshr_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftBinaryOp(context, Intrinsic.Arm64UrshrV, shift); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + int shift = GetImmShr(op); + int eSize = 8 << op.Size; + + Operand n = GetVec(op.Rn); + + Intrinsic sllInst = X86PsllInstruction[op.Size]; + + Operand res = context.AddIntrinsic(sllInst, n, Const(eSize - shift)); + + Intrinsic srlInst = X86PsrlInstruction[op.Size]; + + res = context.AddIntrinsic(srlInst, res, 
Const(eSize - 1)); + + Operand nSrl = context.AddIntrinsic(srlInst, n, Const(shift)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, res, nSrl); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorShrImmOpZx(context, ShrImmFlags.Round); + } + } + + public static void Ursra_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftTernaryOpRd(context, Intrinsic.Arm64UrsraS, shift); + } + else + { + EmitScalarShrImmOpZx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate); + } + } + + public static void Ursra_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftTernaryOpRd(context, Intrinsic.Arm64UrsraV, shift); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + int shift = GetImmShr(op); + int eSize = 8 << op.Size; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic sllInst = X86PsllInstruction[op.Size]; + + Operand res = context.AddIntrinsic(sllInst, n, Const(eSize - shift)); + + Intrinsic srlInst = X86PsrlInstruction[op.Size]; + + res = context.AddIntrinsic(srlInst, res, Const(eSize - 1)); + + Operand nSrl = context.AddIntrinsic(srlInst, n, Const(shift)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, res, nSrl); + res = context.AddIntrinsic(addInst, res, d); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + EmitVectorShrImmOpZx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate); + } + } + + public static void Ushl_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitScalarBinaryOp(context, Intrinsic.Arm64UshlS); + } + else + { + EmitShlRegOp(context, ShlRegFlags.Scalar); + } + } + + public static void Ushl_V(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UshlV); + } + else + { + EmitShlRegOp(context, ShlRegFlags.None); + } + } + + public static void Ushll_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShl(op); + + if (Optimizations.UseAdvSimd) + { + InstEmitSimdHelperArm64.EmitVectorShiftBinaryOp(context, Intrinsic.Arm64UshllV, shift); + } + else if (Optimizations.UseSse41) + { + Operand n = GetVec(op.Rn); + + if (op.RegisterSize == RegisterSize.Simd128) + { + n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8)); + } + + Intrinsic movzxInst = X86PmovzxInstruction[op.Size]; + + Operand res = context.AddIntrinsic(movzxInst, n); + + if (shift != 0) + { + Intrinsic sllInst = X86PsllInstruction[op.Size + 1]; + + res = context.AddIntrinsic(sllInst, res, Const(shift)); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitVectorShImmWidenBinaryZx(context, (op1, op2) => context.ShiftLeft(op1, op2), shift); + } + } + + public static void Ushr_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftBinaryOp(context, Intrinsic.Arm64UshrS, shift); + } 
+ else + { + EmitShrImmOp(context, ShrImmFlags.ScalarZx); + } + } + + public static void Ushr_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftBinaryOp(context, Intrinsic.Arm64UshrV, shift); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + int shift = GetImmShr(op); + + Operand n = GetVec(op.Rn); + + Intrinsic srlInst = X86PsrlInstruction[op.Size]; + + Operand res = context.AddIntrinsic(srlInst, n, Const(shift)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else + { + EmitShrImmOp(context, ShrImmFlags.VectorZx); + } + } + + public static void Usra_S(ArmEmitterContext context) + { + if (Optimizations.UseAdvSimd) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitScalarShiftTernaryOpRd(context, Intrinsic.Arm64UsraS, shift); + } + else + { + EmitScalarShrImmOpZx(context, ShrImmFlags.Accumulate); + } + } + + public static void Usra_V(ArmEmitterContext context) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + if (Optimizations.UseAdvSimd) + { + int shift = GetImmShr(op); + + InstEmitSimdHelperArm64.EmitVectorShiftTernaryOpRd(context, Intrinsic.Arm64UsraV, shift); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + int shift = GetImmShr(op); + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic srlInst = X86PsrlInstruction[op.Size]; + + Operand res = context.AddIntrinsic(srlInst, n, Const(shift)); + + Intrinsic addInst = X86PaddInstruction[op.Size]; + + res = context.AddIntrinsic(addInst, res, d); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + EmitVectorShrImmOpZx(context, ShrImmFlags.Accumulate); + } + } + + [Flags] + private enum ShrImmFlags + { + Scalar = 1 << 0, + Signed = 1 << 1, + + Round = 1 << 2, + Accumulate = 1 << 3, + + ScalarSx = Scalar | Signed, + ScalarZx = Scalar, + + VectorSx = Signed, + VectorZx = 0 + } + + private static void EmitScalarShrImmOpSx(ArmEmitterContext context, ShrImmFlags flags) + { + EmitShrImmOp(context, ShrImmFlags.ScalarSx | flags); + } + + private static void EmitScalarShrImmOpZx(ArmEmitterContext context, ShrImmFlags flags) + { + EmitShrImmOp(context, ShrImmFlags.ScalarZx | flags); + } + + private static void EmitVectorShrImmOpSx(ArmEmitterContext context, ShrImmFlags flags) + { + EmitShrImmOp(context, ShrImmFlags.VectorSx | flags); + } + + private static void EmitVectorShrImmOpZx(ArmEmitterContext context, ShrImmFlags flags) + { + EmitShrImmOp(context, ShrImmFlags.VectorZx | flags); + } + + private static void EmitShrImmOp(ArmEmitterContext context, ShrImmFlags flags) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + Operand res = context.VectorZero(); + + bool scalar = (flags & ShrImmFlags.Scalar) != 0; + bool signed = (flags & ShrImmFlags.Signed) != 0; + bool round = (flags & ShrImmFlags.Round) != 0; + bool accumulate = (flags & ShrImmFlags.Accumulate) != 0; + + int shift = GetImmShr(op); + + long roundConst = 1L << (shift - 1); + + int elems = !scalar ? 
op.GetBytesCount() >> op.Size : 1; + + for (int index = 0; index < elems; index++) + { + Operand e = EmitVectorExtract(context, op.Rn, index, op.Size, signed); + + if (op.Size <= 2) + { + if (round) + { + e = context.Add(e, Const(roundConst)); + } + + e = signed ? context.ShiftRightSI(e, Const(shift)) : context.ShiftRightUI(e, Const(shift)); + } + else /* if (op.Size == 3) */ + { + e = EmitShrImm64(context, e, signed, round ? roundConst : 0L, shift); + } + + if (accumulate) + { + Operand de = EmitVectorExtract(context, op.Rd, index, op.Size, signed); + + e = context.Add(e, de); + } + + res = EmitVectorInsert(context, res, e, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitVectorShrImmNarrowOpZx(ArmEmitterContext context, bool round) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + + long roundConst = 1L << (shift - 1); + + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + Operand d = GetVec(op.Rd); + + Operand res = part == 0 ? context.VectorZero() : context.Copy(d); + + for (int index = 0; index < elems; index++) + { + Operand e = EmitVectorExtractZx(context, op.Rn, index, op.Size + 1); + + if (round) + { + e = context.Add(e, Const(roundConst)); + } + + e = context.ShiftRightUI(e, Const(shift)); + + res = EmitVectorInsert(context, res, e, part + index, op.Size); + } + + context.Copy(d, res); + } + + [Flags] + private enum ShrImmSaturatingNarrowFlags + { + Scalar = 1 << 0, + SignedSrc = 1 << 1, + SignedDst = 1 << 2, + + Round = 1 << 3, + + ScalarSxSx = Scalar | SignedSrc | SignedDst, + ScalarSxZx = Scalar | SignedSrc, + ScalarZxZx = Scalar, + + VectorSxSx = SignedSrc | SignedDst, + VectorSxZx = SignedSrc, + VectorZxZx = 0 + } + + private static void EmitRoundShrImmSaturatingNarrowOp(ArmEmitterContext context, ShrImmSaturatingNarrowFlags flags) + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.Round | flags); + } + + private static void EmitShrImmSaturatingNarrowOp(ArmEmitterContext context, ShrImmSaturatingNarrowFlags flags) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + bool scalar = (flags & ShrImmSaturatingNarrowFlags.Scalar) != 0; + bool signedSrc = (flags & ShrImmSaturatingNarrowFlags.SignedSrc) != 0; + bool signedDst = (flags & ShrImmSaturatingNarrowFlags.SignedDst) != 0; + bool round = (flags & ShrImmSaturatingNarrowFlags.Round) != 0; + + int shift = GetImmShr(op); + + long roundConst = 1L << (shift - 1); + + int elems = !scalar ? 8 >> op.Size : 1; + + int part = !scalar && (op.RegisterSize == RegisterSize.Simd128) ? elems : 0; + + Operand d = GetVec(op.Rd); + + Operand res = part == 0 ? context.VectorZero() : context.Copy(d); + + for (int index = 0; index < elems; index++) + { + Operand e = EmitVectorExtract(context, op.Rn, index, op.Size + 1, signedSrc); + + if (op.Size <= 1 || !round) + { + if (round) + { + e = context.Add(e, Const(roundConst)); + } + + e = signedSrc ? context.ShiftRightSI(e, Const(shift)) : context.ShiftRightUI(e, Const(shift)); + } + else /* if (op.Size == 2 && round) */ + { + e = EmitShrImm64(context, e, signedSrc, roundConst, shift); // shift <= 32 + } + + e = signedSrc ? 
EmitSignedSrcSatQ(context, e, op.Size, signedDst) : EmitUnsignedSrcSatQ(context, e, op.Size, signedDst); + + res = EmitVectorInsert(context, res, e, part + index, op.Size); + } + + context.Copy(d, res); + } + + // dst64 = (Int(src64, signed) + roundConst) >> shift; + private static Operand EmitShrImm64( + ArmEmitterContext context, + Operand value, + bool signed, + long roundConst, + int shift) + { + MethodInfo info = signed + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShrImm64)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.UnsignedShrImm64)); + + return context.Call(info, value, Const(roundConst), Const(shift)); + } + + private static void EmitVectorShImmWidenBinarySx(ArmEmitterContext context, Func2I emit, int imm) + { + EmitVectorShImmWidenBinaryOp(context, emit, imm, signed: true); + } + + private static void EmitVectorShImmWidenBinaryZx(ArmEmitterContext context, Func2I emit, int imm) + { + EmitVectorShImmWidenBinaryOp(context, emit, imm, signed: false); + } + + private static void EmitVectorShImmWidenBinaryOp(ArmEmitterContext context, Func2I emit, int imm, bool signed) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = 8 >> op.Size; + + int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size, signed); + + res = EmitVectorInsert(context, res, emit(ne, Const(imm)), index, op.Size + 1); + } + + context.Copy(GetVec(op.Rd), res); + } + + private static void EmitSli(ArmEmitterContext context, bool scalar) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShl(op); + int eSize = 8 << op.Size; + + ulong mask = shift != 0 ? ulong.MaxValue >> (64 - shift) : 0UL; + + if (shift >= eSize) + { + if ((op.RegisterSize == RegisterSize.Simd64) || scalar) + { + Operand res = context.VectorZeroUpper64(GetVec(op.Rd)); + + context.Copy(GetVec(op.Rd), res); + } + } + else if (Optimizations.UseGfni && op.Size == 0) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + ulong bitMatrix = X86GetGf2p8LogicalShiftLeft(shift); + + Operand vBitMatrix = X86GetElements(context, bitMatrix, bitMatrix); + + Operand nShifted = context.AddIntrinsic(Intrinsic.X86Gf2p8affineqb, n, vBitMatrix, Const(0)); + + Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]); + + Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask); + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked); + + if ((op.RegisterSize == RegisterSize.Simd64) || scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic sllInst = X86PsllInstruction[op.Size]; + + Operand nShifted = context.AddIntrinsic(sllInst, n, Const(shift)); + + Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]); + + Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask); + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked); + + if ((op.RegisterSize == RegisterSize.Simd64) || scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + Operand res = context.VectorZero(); + + int elems = !scalar ? 
op.GetBytesCount() >> op.Size : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + Operand neShifted = context.ShiftLeft(ne, Const(shift)); + + Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size); + + Operand deMasked = context.BitwiseAnd(de, Const(mask)); + + Operand e = context.BitwiseOr(neShifted, deMasked); + + res = EmitVectorInsert(context, res, e, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + private static void EmitSri(ArmEmitterContext context, bool scalar) + { + OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + int eSize = 8 << op.Size; + + ulong mask = (ulong.MaxValue << (eSize - shift)) & (ulong.MaxValue >> (64 - eSize)); + + if (shift >= eSize) + { + if ((op.RegisterSize == RegisterSize.Simd64) || scalar) + { + Operand res = context.VectorZeroUpper64(GetVec(op.Rd)); + + context.Copy(GetVec(op.Rd), res); + } + } + else if (Optimizations.UseGfni && op.Size == 0) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + ulong bitMatrix = X86GetGf2p8LogicalShiftLeft(-shift); + + Operand vBitMatrix = X86GetElements(context, bitMatrix, bitMatrix); + + Operand nShifted = context.AddIntrinsic(Intrinsic.X86Gf2p8affineqb, n, vBitMatrix, Const(0)); + + Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]); + + Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask); + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked); + + if ((op.RegisterSize == RegisterSize.Simd64) || scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else if (Optimizations.UseSse2 && op.Size > 0) + { + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + + Intrinsic srlInst = X86PsrlInstruction[op.Size]; + + Operand nShifted = context.AddIntrinsic(srlInst, n, Const(shift)); + + Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]); + + Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask); + + Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked); + + if ((op.RegisterSize == RegisterSize.Simd64) || scalar) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(d, res); + } + else + { + Operand res = context.VectorZero(); + + int elems = !scalar ? op.GetBytesCount() >> op.Size : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size); + + Operand neShifted = shift != 64 ? context.ShiftRightUI(ne, Const(shift)) : Const(0UL); + + Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size); + + Operand deMasked = context.BitwiseAnd(de, Const(mask)); + + Operand e = context.BitwiseOr(neShifted, deMasked); + + res = EmitVectorInsert(context, res, e, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + } + + [Flags] + private enum ShlRegFlags + { + None = 0, + Scalar = 1 << 0, + Signed = 1 << 1, + Round = 1 << 2, + Saturating = 1 << 3 + } + + private static void EmitShlRegOp(ArmEmitterContext context, ShlRegFlags flags = ShlRegFlags.None) + { + bool scalar = flags.HasFlag(ShlRegFlags.Scalar); + bool signed = flags.HasFlag(ShlRegFlags.Signed); + bool round = flags.HasFlag(ShlRegFlags.Round); + bool saturating = flags.HasFlag(ShlRegFlags.Saturating); + + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = !scalar ? 
op.GetBytesCount() >> op.Size : 1; + + for (int index = 0; index < elems; index++) + { + Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size, signed); + Operand me = EmitVectorExtractSx(context, op.Rm, index << op.Size, size: 0); + + Operand e = !saturating + ? EmitShlReg(context, ne, context.ConvertI64ToI32(me), round, op.Size, signed) + : EmitShlRegSatQ(context, ne, context.ConvertI64ToI32(me), round, op.Size, signed); + + res = EmitVectorInsert(context, res, e, index, op.Size); + } + + context.Copy(GetVec(op.Rd), res); + } + + // long SignedShlReg(long op, int shiftLsB, bool round, int size); + // ulong UnsignedShlReg(ulong op, int shiftLsB, bool round, int size); + private static Operand EmitShlReg(ArmEmitterContext context, Operand op, Operand shiftLsB, bool round, int size, bool signed) + { + int eSize = 8 << size; + + Debug.Assert(op.Type == OperandType.I64); + Debug.Assert(shiftLsB.Type == OperandType.I32); + Debug.Assert(eSize == 8 || eSize == 16 || eSize == 32 || eSize == 64); + + Operand lbl1 = Label(); + Operand lblEnd = Label(); + + Operand eSizeOp = Const(eSize); + Operand zero = Const(0); + Operand zeroL = Const(0L); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), op); + + context.BranchIf(lbl1, shiftLsB, zero, Comparison.GreaterOrEqual); + context.Copy(res, signed + ? EmitSignedShrReg(context, op, context.Negate(shiftLsB), round, eSize) + : EmitUnsignedShrReg(context, op, context.Negate(shiftLsB), round, eSize)); + context.Branch(lblEnd); + + context.MarkLabel(lbl1); + context.BranchIf(lblEnd, shiftLsB, zero, Comparison.LessOrEqual); + Operand shl = context.ShiftLeft(op, shiftLsB); + Operand isGreaterOrEqual = context.ICompareGreaterOrEqual(shiftLsB, eSizeOp); + context.Copy(res, context.ConditionalSelect(isGreaterOrEqual, zeroL, shl)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // long SignedShlRegSatQ(long op, int shiftLsB, bool round, int size); + // ulong UnsignedShlRegSatQ(ulong op, int shiftLsB, bool round, int size); + private static Operand EmitShlRegSatQ(ArmEmitterContext context, Operand op, Operand shiftLsB, bool round, int size, bool signed) + { + int eSize = 8 << size; + + Debug.Assert(op.Type == OperandType.I64); + Debug.Assert(shiftLsB.Type == OperandType.I32); + Debug.Assert(eSize == 8 || eSize == 16 || eSize == 32 || eSize == 64); + + Operand lbl1 = Label(); + Operand lbl2 = Label(); + Operand lblEnd = Label(); + + Operand eSizeOp = Const(eSize); + Operand zero = Const(0); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), op); + + context.BranchIf(lbl1, shiftLsB, zero, Comparison.GreaterOrEqual); + context.Copy(res, signed + ? EmitSignedShrReg(context, op, context.Negate(shiftLsB), round, eSize) + : EmitUnsignedShrReg(context, op, context.Negate(shiftLsB), round, eSize)); + context.Branch(lblEnd); + + context.MarkLabel(lbl1); + context.BranchIf(lblEnd, shiftLsB, zero, Comparison.LessOrEqual); + context.BranchIf(lbl2, shiftLsB, eSizeOp, Comparison.Less); + context.Copy(res, signed + ? EmitSignedSignSatQ(context, op, size) + : EmitUnsignedSignSatQ(context, op, size)); + context.Branch(lblEnd); + + context.MarkLabel(lbl2); + Operand shl = context.ShiftLeft(op, shiftLsB); + if (eSize == 64) + { + Operand sarOrShr = signed + ? context.ShiftRightSI(shl, shiftLsB) + : context.ShiftRightUI(shl, shiftLsB); + context.Copy(res, shl); + context.BranchIf(lblEnd, sarOrShr, op, Comparison.Equal); + context.Copy(res, signed + ? 
EmitSignedSignSatQ(context, op, size) + : EmitUnsignedSignSatQ(context, op, size)); + } + else + { + context.Copy(res, signed + ? EmitSignedSrcSatQ(context, shl, size, signedDst: true) + : EmitUnsignedSrcSatQ(context, shl, size, signedDst: false)); + } + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + + // shift := [1, 128]; eSize := {8, 16, 32, 64}. + // long SignedShrReg(long op, int shift, bool round, int eSize); + private static Operand EmitSignedShrReg(ArmEmitterContext context, Operand op, Operand shift, bool round, int eSize) + { + if (round) + { + Operand lblEnd = Label(); + + Operand eSizeOp = Const(eSize); + Operand zeroL = Const(0L); + Operand one = Const(1); + Operand oneL = Const(1L); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), zeroL); + + context.BranchIf(lblEnd, shift, eSizeOp, Comparison.GreaterOrEqual); + Operand roundConst = context.ShiftLeft(oneL, context.Subtract(shift, one)); + Operand add = context.Add(op, roundConst); + Operand sar = context.ShiftRightSI(add, shift); + if (eSize == 64) + { + Operand shr = context.ShiftRightUI(add, shift); + Operand left = context.BitwiseAnd(context.Negate(op), context.BitwiseExclusiveOr(op, add)); + Operand isLess = context.ICompareLess(left, zeroL); + context.Copy(res, context.ConditionalSelect(isLess, shr, sar)); + } + else + { + context.Copy(res, sar); + } + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + else + { + Operand lblEnd = Label(); + + Operand eSizeOp = Const(eSize); + Operand zeroL = Const(0L); + Operand negOneL = Const(-1L); + + Operand sar = context.ShiftRightSI(op, shift); + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), sar); + + context.BranchIf(lblEnd, shift, eSizeOp, Comparison.Less); + Operand isLess = context.ICompareLess(op, zeroL); + context.Copy(res, context.ConditionalSelect(isLess, negOneL, zeroL)); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + } + + // shift := [1, 128]; eSize := {8, 16, 32, 64}. 
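// For the rounded 64-bit case below, op + roundConst can wrap past 2^64; when it does, the
// true result regains the carried-out bit as 1 << (64 - shift) (the oneShl63UL >> (shift - 1)
// term), and a shift of exactly 64 leaves just that carry, i.e. 1.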
+ // ulong UnsignedShrReg(ulong op, int shift, bool round, int eSize); + private static Operand EmitUnsignedShrReg(ArmEmitterContext context, Operand op, Operand shift, bool round, int eSize) + { + if (round) + { + Operand lblEnd = Label(); + + Operand zeroUL = Const(0UL); + Operand one = Const(1); + Operand oneUL = Const(1UL); + Operand eSizeMaxOp = Const(64); + Operand oneShl63UL = Const(1UL << 63); + + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), zeroUL); + + context.BranchIf(lblEnd, shift, eSizeMaxOp, Comparison.Greater); + Operand roundConst = context.ShiftLeft(oneUL, context.Subtract(shift, one)); + Operand add = context.Add(op, roundConst); + Operand shr = context.ShiftRightUI(add, shift); + Operand isEqual = context.ICompareEqual(shift, eSizeMaxOp); + context.Copy(res, context.ConditionalSelect(isEqual, zeroUL, shr)); + if (eSize == 64) + { + context.BranchIf(lblEnd, add, op, Comparison.GreaterOrEqualUI); + Operand right = context.BitwiseOr(shr, context.ShiftRightUI(oneShl63UL, context.Subtract(shift, one))); + context.Copy(res, context.ConditionalSelect(isEqual, oneUL, right)); + } + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + else + { + Operand lblEnd = Label(); + + Operand eSizeOp = Const(eSize); + Operand zeroUL = Const(0UL); + + Operand shr = context.ShiftRightUI(op, shift); + Operand res = context.Copy(context.AllocateLocal(OperandType.I64), shr); + + context.BranchIf(lblEnd, shift, eSizeOp, Comparison.Less); + context.Copy(res, zeroUL); + context.Branch(lblEnd); + + context.MarkLabel(lblEnd); + + return res; + } + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSimdShift32.cs b/src/ARMeilleure/Instructions/InstEmitSimdShift32.cs new file mode 100644 index 00000000..9ac68088 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSimdShift32.cs @@ -0,0 +1,389 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Diagnostics; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper32; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Vqrshrn(ArmEmitterContext context) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + + EmitRoundShrImmSaturatingNarrowOp(context, op.U ? ShrImmSaturatingNarrowFlags.VectorZxZx : ShrImmSaturatingNarrowFlags.VectorSxSx); + } + + public static void Vqrshrun(ArmEmitterContext context) + { + EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx); + } + + public static void Vqshrn(ArmEmitterContext context) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + + EmitShrImmSaturatingNarrowOp(context, op.U ? 
ShrImmSaturatingNarrowFlags.VectorZxZx : ShrImmSaturatingNarrowFlags.VectorSxSx); + } + + public static void Vqshrun(ArmEmitterContext context) + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx); + } + + public static void Vrshr(ArmEmitterContext context) + { + EmitRoundShrImmOp(context, accumulate: false); + } + + public static void Vrshrn(ArmEmitterContext context) + { + EmitRoundShrImmNarrowOp(context, signed: false); + } + + public static void Vrsra(ArmEmitterContext context) + { + EmitRoundShrImmOp(context, accumulate: true); + } + + public static void Vshl(ArmEmitterContext context) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + + EmitVectorUnaryOpZx32(context, (op1) => context.ShiftLeft(op1, Const(op.Shift))); + } + + public static void Vshl_I(ArmEmitterContext context) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + if (op.U) + { + EmitVectorBinaryOpZx32(context, (op1, op2) => EmitShlRegOp(context, op2, op1, op.Size, true)); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => EmitShlRegOp(context, op2, op1, op.Size, false)); + } + } + + public static void Vshll(ArmEmitterContext context) + { + OpCode32SimdShImmLong op = (OpCode32SimdShImmLong)context.CurrOp; + + Operand res = context.VectorZero(); + + int elems = op.GetBytesCount() >> op.Size; + + for (int index = 0; index < elems; index++) + { + Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, !op.U); + + if (op.Size == 2) + { + if (op.U) + { + me = context.ZeroExtend32(OperandType.I64, me); + } + else + { + me = context.SignExtend32(OperandType.I64, me); + } + } + + me = context.ShiftLeft(me, Const(op.Shift)); + + res = EmitVectorInsert(context, res, me, index, op.Size + 1); + } + + context.Copy(GetVecA32(op.Qd), res); + } + + public static void Vshr(ArmEmitterContext context) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + int shift = GetImmShr(op); + int maxShift = (8 << op.Size) - 1; + + if (op.U) + { + EmitVectorUnaryOpZx32(context, (op1) => (shift > maxShift) ? Const(op1.Type, 0) : context.ShiftRightUI(op1, Const(shift))); + } + else + { + EmitVectorUnaryOpSx32(context, (op1) => context.ShiftRightSI(op1, Const(Math.Min(maxShift, shift)))); + } + } + + public static void Vshrn(ArmEmitterContext context) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + int shift = GetImmShr(op); + + EmitVectorUnaryNarrowOp32(context, (op1) => context.ShiftRightUI(op1, Const(shift))); + } + + public static void Vsra(ArmEmitterContext context) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + int shift = GetImmShr(op); + int maxShift = (8 << op.Size) - 1; + + if (op.U) + { + EmitVectorImmBinaryQdQmOpZx32(context, (op1, op2) => + { + Operand shiftRes = shift > maxShift ? 
Const(op2.Type, 0) : context.ShiftRightUI(op2, Const(shift)); + + return context.Add(op1, shiftRes); + }); + } + else + { + EmitVectorImmBinaryQdQmOpSx32(context, (op1, op2) => context.Add(op1, context.ShiftRightSI(op2, Const(Math.Min(maxShift, shift))))); + } + } + + public static void EmitRoundShrImmOp(ArmEmitterContext context, bool accumulate) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + int shift = GetImmShr(op); + long roundConst = 1L << (shift - 1); + + if (op.U) + { + if (op.Size < 2) + { + EmitVectorUnaryOpZx32(context, (op1) => + { + op1 = context.Add(op1, Const(op1.Type, roundConst)); + + return context.ShiftRightUI(op1, Const(shift)); + }, accumulate); + } + else if (op.Size == 2) + { + EmitVectorUnaryOpZx32(context, (op1) => + { + op1 = context.ZeroExtend32(OperandType.I64, op1); + op1 = context.Add(op1, Const(op1.Type, roundConst)); + + return context.ConvertI64ToI32(context.ShiftRightUI(op1, Const(shift))); + }, accumulate); + } + else /* if (op.Size == 3) */ + { + EmitVectorUnaryOpZx32(context, (op1) => EmitShrImm64(context, op1, signed: false, roundConst, shift), accumulate); + } + } + else + { + if (op.Size < 2) + { + EmitVectorUnaryOpSx32(context, (op1) => + { + op1 = context.Add(op1, Const(op1.Type, roundConst)); + + return context.ShiftRightSI(op1, Const(shift)); + }, accumulate); + } + else if (op.Size == 2) + { + EmitVectorUnaryOpSx32(context, (op1) => + { + op1 = context.SignExtend32(OperandType.I64, op1); + op1 = context.Add(op1, Const(op1.Type, roundConst)); + + return context.ConvertI64ToI32(context.ShiftRightSI(op1, Const(shift))); + }, accumulate); + } + else /* if (op.Size == 3) */ + { + EmitVectorUnaryOpZx32(context, (op1) => EmitShrImm64(context, op1, signed: true, roundConst, shift), accumulate); + } + } + } + + private static void EmitRoundShrImmNarrowOp(ArmEmitterContext context, bool signed) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + + int shift = GetImmShr(op); + long roundConst = 1L << (shift - 1); + + EmitVectorUnaryNarrowOp32(context, (op1) => + { + if (op.Size <= 1) + { + op1 = context.Add(op1, Const(op1.Type, roundConst)); + op1 = signed ? context.ShiftRightSI(op1, Const(shift)) : context.ShiftRightUI(op1, Const(shift)); + } + else /* if (op.Size == 2 && round) */ + { + op1 = EmitShrImm64(context, op1, signed, roundConst, shift); // shift <= 32 + } + + return op1; + }, signed); + } + + private static Operand EmitShlRegOp(ArmEmitterContext context, Operand op, Operand shiftLsB, int size, bool unsigned) + { + if (shiftLsB.Type == OperandType.I64) + { + shiftLsB = context.ConvertI64ToI32(shiftLsB); + } + + shiftLsB = context.SignExtend8(OperandType.I32, shiftLsB); + Debug.Assert((uint)size < 4u); + + Operand negShiftLsB = context.Negate(shiftLsB); + + Operand isPositive = context.ICompareGreaterOrEqual(shiftLsB, Const(0)); + + Operand shl = context.ShiftLeft(op, shiftLsB); + Operand shr = unsigned ? 
context.ShiftRightUI(op, negShiftLsB) : context.ShiftRightSI(op, negShiftLsB); + + Operand res = context.ConditionalSelect(isPositive, shl, shr); + + if (unsigned) + { + Operand isOutOfRange = context.BitwiseOr( + context.ICompareGreaterOrEqual(shiftLsB, Const(8 << size)), + context.ICompareGreaterOrEqual(negShiftLsB, Const(8 << size))); + + return context.ConditionalSelect(isOutOfRange, Const(op.Type, 0), res); + } + else + { + Operand isOutOfRange0 = context.ICompareGreaterOrEqual(shiftLsB, Const(8 << size)); + Operand isOutOfRangeN = context.ICompareGreaterOrEqual(negShiftLsB, Const(8 << size)); + + // Also zero if shift is too negative, but value was positive. + isOutOfRange0 = context.BitwiseOr(isOutOfRange0, context.BitwiseAnd(isOutOfRangeN, context.ICompareGreaterOrEqual(op, Const(op.Type, 0)))); + + Operand min = (op.Type == OperandType.I64) ? Const(-1L) : Const(-1); + + return context.ConditionalSelect(isOutOfRange0, Const(op.Type, 0), context.ConditionalSelect(isOutOfRangeN, min, res)); + } + } + + [Flags] + private enum ShrImmSaturatingNarrowFlags + { + Scalar = 1 << 0, + SignedSrc = 1 << 1, + SignedDst = 1 << 2, + + Round = 1 << 3, + + ScalarSxSx = Scalar | SignedSrc | SignedDst, + ScalarSxZx = Scalar | SignedSrc, + ScalarZxZx = Scalar, + + VectorSxSx = SignedSrc | SignedDst, + VectorSxZx = SignedSrc, + VectorZxZx = 0 + } + + private static void EmitRoundShrImmSaturatingNarrowOp(ArmEmitterContext context, ShrImmSaturatingNarrowFlags flags) + { + EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.Round | flags); + } + + private static void EmitShrImmSaturatingNarrowOp(ArmEmitterContext context, ShrImmSaturatingNarrowFlags flags) + { + OpCode32SimdShImm op = (OpCode32SimdShImm)context.CurrOp; + + bool scalar = (flags & ShrImmSaturatingNarrowFlags.Scalar) != 0; + bool signedSrc = (flags & ShrImmSaturatingNarrowFlags.SignedSrc) != 0; + bool signedDst = (flags & ShrImmSaturatingNarrowFlags.SignedDst) != 0; + bool round = (flags & ShrImmSaturatingNarrowFlags.Round) != 0; + + if (scalar) + { + // TODO: Support scalar operation. + throw new NotImplementedException(); + } + + int shift = GetImmShr(op); + long roundConst = 1L << (shift - 1); + + EmitVectorUnaryNarrowOp32(context, (op1) => + { + if (op.Size <= 1 || !round) + { + if (round) + { + op1 = context.Add(op1, Const(op1.Type, roundConst)); + } + + op1 = signedSrc ? context.ShiftRightSI(op1, Const(shift)) : context.ShiftRightUI(op1, Const(shift)); + } + else /* if (op.Size == 2 && round) */ + { + op1 = EmitShrImm64(context, op1, signedSrc, roundConst, shift); // shift <= 32 + } + + return EmitSatQ(context, op1, 8 << op.Size, signedSrc, signedDst); + }, signedSrc); + } + + private static int GetImmShr(OpCode32SimdShImm op) + { + return (8 << op.Size) - op.Shift; // Shr amount is flipped. + } + + // dst64 = (Int(src64, signed) + roundConst) >> shift; + private static Operand EmitShrImm64( + ArmEmitterContext context, + Operand value, + bool signed, + long roundConst, + int shift) + { + MethodInfo info = signed + ? typeof(SoftFallback).GetMethod(nameof(SoftFallback.SignedShrImm64)) + : typeof(SoftFallback).GetMethod(nameof(SoftFallback.UnsignedShrImm64)); + + return context.Call(info, value, Const(roundConst), Const(shift)); + } + + private static Operand EmitSatQ(ArmEmitterContext context, Operand value, int eSize, bool signedSrc, bool signedDst) + { + Debug.Assert(eSize <= 32); + + long intMin = signedDst ? -(1L << (eSize - 1)) : 0; + long intMax = signedDst ? 
(1L << (eSize - 1)) - 1 : (1L << eSize) - 1; + + Operand gt = signedSrc + ? context.ICompareGreater(value, Const(value.Type, intMax)) + : context.ICompareGreaterUI(value, Const(value.Type, intMax)); + + Operand lt = signedSrc + ? context.ICompareLess(value, Const(value.Type, intMin)) + : context.ICompareLessUI(value, Const(value.Type, intMin)); + + value = context.ConditionalSelect(gt, Const(value.Type, intMax), value); + value = context.ConditionalSelect(lt, Const(value.Type, intMin), value); + + Operand lblNoSat = Label(); + + context.BranchIfFalse(lblNoSat, context.BitwiseOr(gt, lt)); + + SetFpFlag(context, FPState.QcFlag, Const(1)); + + context.MarkLabel(lblNoSat); + + return value; + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSystem.cs b/src/ARMeilleure/Instructions/InstEmitSystem.cs new file mode 100644 index 00000000..f84829aa --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSystem.cs @@ -0,0 +1,248 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit + { + private const int DczSizeLog2 = 4; // Log2 size in words + public const int DczSizeInBytes = 4 << DczSizeLog2; + + public static void Isb(ArmEmitterContext context) + { + // Execute as no-op. + } + + public static void Mrs(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + MethodInfo info; + + switch (GetPackedId(op)) + { + case 0b11_011_0000_0000_001: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.GetCtrEl0)); break; + case 0b11_011_0000_0000_111: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.GetDczidEl0)); break; + case 0b11_011_0100_0010_000: EmitGetNzcv(context); return; + case 0b11_011_0100_0100_000: EmitGetFpcr(context); return; + case 0b11_011_0100_0100_001: EmitGetFpsr(context); return; + case 0b11_011_1101_0000_010: EmitGetTpidrEl0(context); return; + case 0b11_011_1101_0000_011: EmitGetTpidrroEl0(context); return; + case 0b11_011_1110_0000_000: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.GetCntfrqEl0)); break; + case 0b11_011_1110_0000_001: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.GetCntpctEl0)); break; + case 0b11_011_1110_0000_010: info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.GetCntvctEl0)); break; + + default: throw new NotImplementedException($"Unknown MRS 0x{op.RawOpCode:X8} at 0x{op.Address:X16}."); + } + + SetIntOrZR(context, op.Rt, context.Call(info)); + } + + public static void Msr(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + switch (GetPackedId(op)) + { + case 0b11_011_0100_0010_000: EmitSetNzcv(context); return; + case 0b11_011_0100_0100_000: EmitSetFpcr(context); return; + case 0b11_011_0100_0100_001: EmitSetFpsr(context); return; + case 0b11_011_1101_0000_010: EmitSetTpidrEl0(context); return; + + default: throw new NotImplementedException($"Unknown MSR 0x{op.RawOpCode:X8} at 0x{op.Address:X16}."); + } + } + + public static void Nop(ArmEmitterContext context) + { + // Do nothing. + } + + public static void Sys(ArmEmitterContext context) + { + // This instruction is used to do some operations on the CPU like cache invalidation, + // address translation and the like. 
+ // We treat it as no-op here since we don't have any cache being emulated anyway. + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + switch (GetPackedId(op)) + { + case 0b11_011_0111_0100_001: + { + // DC ZVA + Operand t = GetIntOrZR(context, op.Rt); + + for (long offset = 0; offset < DczSizeInBytes; offset += 8) + { + Operand address = context.Add(t, Const(offset)); + + InstEmitMemoryHelper.EmitStore(context, address, RegisterConsts.ZeroIndex, 3); + } + + break; + } + + // No-op + case 0b11_011_0111_1110_001: // DC CIVAC + break; + + case 0b11_011_0111_0101_001: // IC IVAU + Operand target = Register(op.Rt, RegisterType.Integer, OperandType.I64); + context.Call(typeof(NativeInterface).GetMethod(nameof(NativeInterface.InvalidateCacheLine)), target); + break; + } + } + + private static int GetPackedId(OpCodeSystem op) + { + int id; + + id = op.Op2 << 0; + id |= op.CRm << 3; + id |= op.CRn << 7; + id |= op.Op1 << 11; + id |= op.Op0 << 14; + + return id; + } + + private static void EmitGetNzcv(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + Operand nzcv = context.ShiftLeft(GetFlag(PState.VFlag), Const((int)PState.VFlag)); + nzcv = context.BitwiseOr(nzcv, context.ShiftLeft(GetFlag(PState.CFlag), Const((int)PState.CFlag))); + nzcv = context.BitwiseOr(nzcv, context.ShiftLeft(GetFlag(PState.ZFlag), Const((int)PState.ZFlag))); + nzcv = context.BitwiseOr(nzcv, context.ShiftLeft(GetFlag(PState.NFlag), Const((int)PState.NFlag))); + + SetIntOrZR(context, op.Rt, nzcv); + } + + private static void EmitGetFpcr(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + Operand fpcr = Const(0); + + for (int flag = 0; flag < RegisterConsts.FpFlagsCount; flag++) + { + if (FPCR.Mask.HasFlag((FPCR)(1u << flag))) + { + fpcr = context.BitwiseOr(fpcr, context.ShiftLeft(GetFpFlag((FPState)flag), Const(flag))); + } + } + + SetIntOrZR(context, op.Rt, fpcr); + } + + private static void EmitGetFpsr(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + context.SyncQcFlag(); + + Operand fpsr = Const(0); + + for (int flag = 0; flag < RegisterConsts.FpFlagsCount; flag++) + { + if (FPSR.Mask.HasFlag((FPSR)(1u << flag))) + { + fpsr = context.BitwiseOr(fpsr, context.ShiftLeft(GetFpFlag((FPState)flag), Const(flag))); + } + } + + SetIntOrZR(context, op.Rt, fpsr); + } + + private static void EmitGetTpidrEl0(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + Operand nativeContext = context.LoadArgument(OperandType.I64, 0); + + Operand result = context.Load(OperandType.I64, context.Add(nativeContext, Const((ulong)NativeContext.GetTpidrEl0Offset()))); + + SetIntOrZR(context, op.Rt, result); + } + + private static void EmitGetTpidrroEl0(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + Operand nativeContext = context.LoadArgument(OperandType.I64, 0); + + Operand result = context.Load(OperandType.I64, context.Add(nativeContext, Const((ulong)NativeContext.GetTpidrroEl0Offset()))); + + SetIntOrZR(context, op.Rt, result); + } + + private static void EmitSetNzcv(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + Operand nzcv = GetIntOrZR(context, op.Rt); + nzcv = context.ConvertI64ToI32(nzcv); + + SetFlag(context, PState.VFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const((int)PState.VFlag)), Const(1))); + SetFlag(context, PState.CFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const((int)PState.CFlag)), Const(1))); + 
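// PState.NFlag/ZFlag/CFlag/VFlag are the architectural NZCV bit positions (31, 30, 29, 28),
// so shifting the packed value in Rt right by the enum value isolates each flag.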
SetFlag(context, PState.ZFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const((int)PState.ZFlag)), Const(1))); + SetFlag(context, PState.NFlag, context.BitwiseAnd(context.ShiftRightUI(nzcv, Const((int)PState.NFlag)), Const(1))); + } + + private static void EmitSetFpcr(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + Operand fpcr = GetIntOrZR(context, op.Rt); + fpcr = context.ConvertI64ToI32(fpcr); + + for (int flag = 0; flag < RegisterConsts.FpFlagsCount; flag++) + { + if (FPCR.Mask.HasFlag((FPCR)(1u << flag))) + { + SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpcr, Const(flag)), Const(1))); + } + } + + context.UpdateArmFpMode(); + } + + private static void EmitSetFpsr(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + context.ClearQcFlagIfModified(); + + Operand fpsr = GetIntOrZR(context, op.Rt); + fpsr = context.ConvertI64ToI32(fpsr); + + for (int flag = 0; flag < RegisterConsts.FpFlagsCount; flag++) + { + if (FPSR.Mask.HasFlag((FPSR)(1u << flag))) + { + SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpsr, Const(flag)), Const(1))); + } + } + + context.UpdateArmFpMode(); + } + + private static void EmitSetTpidrEl0(ArmEmitterContext context) + { + OpCodeSystem op = (OpCodeSystem)context.CurrOp; + + Operand value = GetIntOrZR(context, op.Rt); + + Operand nativeContext = context.LoadArgument(OperandType.I64, 0); + + context.Store(context.Add(nativeContext, Const((ulong)NativeContext.GetTpidrEl0Offset())), value); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstEmitSystem32.cs b/src/ARMeilleure/Instructions/InstEmitSystem32.cs new file mode 100644 index 00000000..f2732c99 --- /dev/null +++ b/src/ARMeilleure/Instructions/InstEmitSystem32.cs @@ -0,0 +1,351 @@ +using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Reflection; + +using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Instructions +{ + static partial class InstEmit32 + { + public static void Mcr(ArmEmitterContext context) + { + OpCode32System op = (OpCode32System)context.CurrOp; + + if (op.Coproc != 15 || op.Opc1 != 0) + { + InstEmit.Und(context); + + return; + } + + switch (op.CRn) + { + case 13: // Process and Thread Info. + if (op.CRm != 0) + { + throw new NotImplementedException($"Unknown MRC CRm 0x{op.CRm:X} at 0x{op.Address:X} (0x{op.RawOpCode:X})."); + } + + switch (op.Opc2) + { + case 2: + EmitSetTpidrEl0(context); return; + + default: + throw new NotImplementedException($"Unknown MRC Opc2 0x{op.Opc2:X} at 0x{op.Address:X} (0x{op.RawOpCode:X})."); + } + + case 7: + switch (op.CRm) // Cache and Memory barrier. + { + case 10: + switch (op.Opc2) + { + case 5: // Data Memory Barrier Register. + return; // No-op. 
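// MCR p15, 0, Rt, c7, c10, 5 is the legacy AArch32 encoding of the data memory barrier;
// as with Isb on the AArch64 side, there is nothing for the emitter to do here.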
+ + default: + throw new NotImplementedException($"Unknown MRC Opc2 0x{op.Opc2:X16} at 0x{op.Address:X16} (0x{op.RawOpCode:X})."); + } + + default: + throw new NotImplementedException($"Unknown MRC CRm 0x{op.CRm:X16} at 0x{op.Address:X16} (0x{op.RawOpCode:X})."); + } + + default: + throw new NotImplementedException($"Unknown MRC 0x{op.RawOpCode:X8} at 0x{op.Address:X16}."); + } + } + + public static void Mrc(ArmEmitterContext context) + { + OpCode32System op = (OpCode32System)context.CurrOp; + + if (op.Coproc != 15 || op.Opc1 != 0) + { + InstEmit.Und(context); + + return; + } + + Operand result; + + switch (op.CRn) + { + case 13: // Process and Thread Info. + if (op.CRm != 0) + { + throw new NotImplementedException($"Unknown MRC CRm 0x{op.CRm:X} at 0x{op.Address:X} (0x{op.RawOpCode:X})."); + } + + switch (op.Opc2) + { + case 2: + result = EmitGetTpidrEl0(context); break; + + case 3: + result = EmitGetTpidrroEl0(context); break; + + default: + throw new NotImplementedException($"Unknown MRC Opc2 0x{op.Opc2:X} at 0x{op.Address:X} (0x{op.RawOpCode:X})."); + } + + break; + + default: + throw new NotImplementedException($"Unknown MRC 0x{op.RawOpCode:X} at 0x{op.Address:X}."); + } + + if (op.Rt == RegisterAlias.Aarch32Pc) + { + // Special behavior: copy NZCV flags into APSR. + EmitSetNzcv(context, result); + + return; + } + else + { + SetIntA32(context, op.Rt, result); + } + } + + public static void Mrrc(ArmEmitterContext context) + { + OpCode32System op = (OpCode32System)context.CurrOp; + + if (op.Coproc != 15) + { + InstEmit.Und(context); + + return; + } + + int opc = op.MrrcOp; + + MethodInfo info; + + switch (op.CRm) + { + case 14: // Timer. + switch (opc) + { + case 0: + info = typeof(NativeInterface).GetMethod(nameof(NativeInterface.GetCntpctEl0)); break; + + default: + throw new NotImplementedException($"Unknown MRRC Opc1 0x{opc:X} at 0x{op.Address:X} (0x{op.RawOpCode:X})."); + } + + break; + + default: + throw new NotImplementedException($"Unknown MRRC 0x{op.RawOpCode:X} at 0x{op.Address:X}."); + } + + Operand result = context.Call(info); + + SetIntA32(context, op.Rt, context.ConvertI64ToI32(result)); + SetIntA32(context, op.CRn, context.ConvertI64ToI32(context.ShiftRightUI(result, Const(32)))); + } + + public static void Mrs(ArmEmitterContext context) + { + OpCode32Mrs op = (OpCode32Mrs)context.CurrOp; + + if (op.R) + { + throw new NotImplementedException("SPSR"); + } + else + { + Operand spsr = context.ShiftLeft(GetFlag(PState.VFlag), Const((int)PState.VFlag)); + spsr = context.BitwiseOr(spsr, context.ShiftLeft(GetFlag(PState.CFlag), Const((int)PState.CFlag))); + spsr = context.BitwiseOr(spsr, context.ShiftLeft(GetFlag(PState.ZFlag), Const((int)PState.ZFlag))); + spsr = context.BitwiseOr(spsr, context.ShiftLeft(GetFlag(PState.NFlag), Const((int)PState.NFlag))); + spsr = context.BitwiseOr(spsr, context.ShiftLeft(GetFlag(PState.QFlag), Const((int)PState.QFlag))); + + // TODO: Remaining flags. 
+ + SetIntA32(context, op.Rd, spsr); + } + } + + public static void Msr(ArmEmitterContext context) + { + OpCode32MsrReg op = (OpCode32MsrReg)context.CurrOp; + + if (op.R) + { + throw new NotImplementedException("SPSR"); + } + else + { + if ((op.Mask & 8) != 0) + { + Operand value = GetIntA32(context, op.Rn); + + EmitSetNzcv(context, value); + + Operand q = context.BitwiseAnd(context.ShiftRightUI(value, Const((int)PState.QFlag)), Const(1)); + + SetFlag(context, PState.QFlag, q); + } + + if ((op.Mask & 4) != 0) + { + throw new NotImplementedException("APSR_g"); + } + + if ((op.Mask & 2) != 0) + { + throw new NotImplementedException("CPSR_x"); + } + + if ((op.Mask & 1) != 0) + { + throw new NotImplementedException("CPSR_c"); + } + } + } + + public static void Nop(ArmEmitterContext context) { } + + public static void Vmrs(ArmEmitterContext context) + { + OpCode32SimdSpecial op = (OpCode32SimdSpecial)context.CurrOp; + + if (op.Rt == RegisterAlias.Aarch32Pc && op.Sreg == 0b0001) + { + // Special behavior: copy NZCV flags into APSR. + SetFlag(context, PState.VFlag, GetFpFlag(FPState.VFlag)); + SetFlag(context, PState.CFlag, GetFpFlag(FPState.CFlag)); + SetFlag(context, PState.ZFlag, GetFpFlag(FPState.ZFlag)); + SetFlag(context, PState.NFlag, GetFpFlag(FPState.NFlag)); + + return; + } + + switch (op.Sreg) + { + case 0b0000: // FPSID + throw new NotImplementedException("Supervisor Only"); + case 0b0001: // FPSCR + EmitGetFpscr(context); return; + case 0b0101: // MVFR2 + throw new NotImplementedException("MVFR2"); + case 0b0110: // MVFR1 + throw new NotImplementedException("MVFR1"); + case 0b0111: // MVFR0 + throw new NotImplementedException("MVFR0"); + case 0b1000: // FPEXC + throw new NotImplementedException("Supervisor Only"); + default: + throw new NotImplementedException($"Unknown VMRS 0x{op.RawOpCode:X} at 0x{op.Address:X}."); + } + } + + public static void Vmsr(ArmEmitterContext context) + { + OpCode32SimdSpecial op = (OpCode32SimdSpecial)context.CurrOp; + + switch (op.Sreg) + { + case 0b0000: // FPSID + throw new NotImplementedException("Supervisor Only"); + case 0b0001: // FPSCR + EmitSetFpscr(context); return; + case 0b0101: // MVFR2 + throw new NotImplementedException("MVFR2"); + case 0b0110: // MVFR1 + throw new NotImplementedException("MVFR1"); + case 0b0111: // MVFR0 + throw new NotImplementedException("MVFR0"); + case 0b1000: // FPEXC + throw new NotImplementedException("Supervisor Only"); + default: + throw new NotImplementedException($"Unknown VMSR 0x{op.RawOpCode:X} at 0x{op.Address:X}."); + } + } + + private static void EmitSetNzcv(ArmEmitterContext context, Operand t) + { + Operand v = context.BitwiseAnd(context.ShiftRightUI(t, Const((int)PState.VFlag)), Const(1)); + Operand c = context.BitwiseAnd(context.ShiftRightUI(t, Const((int)PState.CFlag)), Const(1)); + Operand z = context.BitwiseAnd(context.ShiftRightUI(t, Const((int)PState.ZFlag)), Const(1)); + Operand n = context.BitwiseAnd(context.ShiftRightUI(t, Const((int)PState.NFlag)), Const(1)); + + SetFlag(context, PState.VFlag, v); + SetFlag(context, PState.CFlag, c); + SetFlag(context, PState.ZFlag, z); + SetFlag(context, PState.NFlag, n); + } + + private static void EmitGetFpscr(ArmEmitterContext context) + { + OpCode32SimdSpecial op = (OpCode32SimdSpecial)context.CurrOp; + + Operand fpscr = Const(0); + + for (int flag = 0; flag < RegisterConsts.FpFlagsCount; flag++) + { + if (FPSCR.Mask.HasFlag((FPSCR)(1u << flag))) + { + fpscr = context.BitwiseOr(fpscr, context.ShiftLeft(GetFpFlag((FPState)flag), Const(flag))); + } + } + + 
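// At this point fpscr holds every FPSCR bit selected by FPSCR.Mask; architecturally the
// comparison flags N, Z, C, V sit at bits 31:28 and the cumulative saturation flag QC at
// bit 27, which is what the Vmrs APSR_nzcv fast path above copies directly.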
SetIntA32(context, op.Rt, fpscr); + } + + private static void EmitSetFpscr(ArmEmitterContext context) + { + OpCode32SimdSpecial op = (OpCode32SimdSpecial)context.CurrOp; + + Operand fpscr = GetIntA32(context, op.Rt); + + for (int flag = 0; flag < RegisterConsts.FpFlagsCount; flag++) + { + if (FPSCR.Mask.HasFlag((FPSCR)(1u << flag))) + { + SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpscr, Const(flag)), Const(1))); + } + } + + context.UpdateArmFpMode(); + } + + private static Operand EmitGetTpidrEl0(ArmEmitterContext context) + { + OpCode32System op = (OpCode32System)context.CurrOp; + + Operand nativeContext = context.LoadArgument(OperandType.I64, 0); + + return context.Load(OperandType.I64, context.Add(nativeContext, Const((ulong)NativeContext.GetTpidrEl0Offset()))); + } + + private static Operand EmitGetTpidrroEl0(ArmEmitterContext context) + { + OpCode32System op = (OpCode32System)context.CurrOp; + + Operand nativeContext = context.LoadArgument(OperandType.I64, 0); + + return context.Load(OperandType.I64, context.Add(nativeContext, Const((ulong)NativeContext.GetTpidrroEl0Offset()))); + } + + private static void EmitSetTpidrEl0(ArmEmitterContext context) + { + OpCode32System op = (OpCode32System)context.CurrOp; + + Operand value = GetIntA32(context, op.Rt); + + Operand nativeContext = context.LoadArgument(OperandType.I64, 0); + + context.Store(context.Add(nativeContext, Const((ulong)NativeContext.GetTpidrEl0Offset())), context.ZeroExtend32(OperandType.I64, value)); + } + } +} diff --git a/src/ARMeilleure/Instructions/InstName.cs b/src/ARMeilleure/Instructions/InstName.cs new file mode 100644 index 00000000..fd71d92e --- /dev/null +++ b/src/ARMeilleure/Instructions/InstName.cs @@ -0,0 +1,685 @@ +namespace ARMeilleure.Instructions +{ + enum InstName + { + // Base (AArch64) + Adc, + Adcs, + Add, + Adds, + Adr, + Adrp, + And, + Ands, + Asrv, + B, + B_Cond, + Bfm, + Bic, + Bics, + Bl, + Blr, + Br, + Brk, + Cbnz, + Cbz, + Ccmn, + Ccmp, + Clrex, + Cls, + Clz, + Crc32b, + Crc32h, + Crc32w, + Crc32x, + Crc32cb, + Crc32ch, + Crc32cw, + Crc32cx, + Csdb, + Csel, + Csinc, + Csinv, + Csneg, + Dmb, + Dsb, + Eon, + Eor, + Esb, + Extr, + Hint, + Isb, + It, + Ldar, + Ldaxp, + Ldaxr, + Ldp, + Ldr, + Ldr_Literal, + Ldrs, + Ldxr, + Ldxp, + Lslv, + Lsrv, + Madd, + Movk, + Movn, + Movz, + Mrs, + Msr, + Msub, + Nop, + Orn, + Orr, + Prfm, + Rbit, + Ret, + Rev16, + Rev32, + Rev64, + Rorv, + Sbc, + Sbcs, + Sbfm, + Sdiv, + Sel, + Sev, + Sevl, + Shsub8, + Smaddl, + Smsubl, + Smulh, + Smull, + Smulw_, + Ssat, + Ssat16, + Stlr, + Stlxp, + Stlxr, + Stp, + Str, + Stxp, + Stxr, + Sub, + Subs, + Svc, + Sxtb, + Sxth, + Sys, + Tbnz, + Tbz, + Tsb, + Ubfm, + Udiv, + Umaddl, + Umsubl, + Umulh, + Und, + Wfe, + Wfi, + Yield, + + // FP & SIMD (AArch64) + Abs_S, + Abs_V, + Add_S, + Add_V, + Addhn_V, + Addp_S, + Addp_V, + Addv_V, + Aesd_V, + Aese_V, + Aesimc_V, + Aesmc_V, + And_V, + Bic_V, + Bic_Vi, + Bif_V, + Bit_V, + Bsl_V, + Cls_V, + Clz_V, + Cmeq_S, + Cmeq_V, + Cmge_S, + Cmge_V, + Cmgt_S, + Cmgt_V, + Cmhi_S, + Cmhi_V, + Cmhs_S, + Cmhs_V, + Cmle_S, + Cmle_V, + Cmlt_S, + Cmlt_V, + Cmtst_S, + Cmtst_V, + Cnt_V, + Dup_Gp, + Dup_S, + Dup_V, + Eor_V, + Ext_V, + Fabd_S, + Fabd_V, + Fabs_S, + Fabs_V, + Facge_S, + Facge_V, + Facgt_S, + Facgt_V, + Fadd_S, + Fadd_V, + Faddp_S, + Faddp_V, + Fccmp_S, + Fccmpe_S, + Fcmeq_S, + Fcmeq_V, + Fcmge_S, + Fcmge_V, + Fcmgt_S, + Fcmgt_V, + Fcmle_S, + Fcmle_V, + Fcmlt_S, + Fcmlt_V, + Fcmp_S, + Fcmpe_S, + Fcsel_S, + Fcvt_S, + Fcvtas_Gp, + Fcvtas_S, + Fcvtas_V, + Fcvtau_Gp, + 
Fcvtau_S, + Fcvtau_V, + Fcvtl_V, + Fcvtms_Gp, + Fcvtms_V, + Fcvtmu_Gp, + Fcvtn_V, + Fcvtns_Gp, + Fcvtns_S, + Fcvtns_V, + Fcvtnu_S, + Fcvtnu_V, + Fcvtps_Gp, + Fcvtpu_Gp, + Fcvtzs_Gp, + Fcvtzs_Gp_Fixed, + Fcvtzs_S, + Fcvtzs_V, + Fcvtzs_V_Fixed, + Fcvtzu_Gp, + Fcvtzu_Gp_Fixed, + Fcvtzu_S, + Fcvtzu_V, + Fcvtzu_V_Fixed, + Fdiv_S, + Fdiv_V, + Fmadd_S, + Fmax_S, + Fmax_V, + Fmaxnm_S, + Fmaxnm_V, + Fmaxnmp_S, + Fmaxnmp_V, + Fmaxnmv_V, + Fmaxp_V, + Fmaxv_V, + Fmin_S, + Fmin_V, + Fminnm_S, + Fminnm_V, + Fminnmp_S, + Fminnmp_V, + Fminnmv_V, + Fminp_V, + Fminv_V, + Fmla_Se, + Fmla_V, + Fmla_Ve, + Fmls_Se, + Fmls_V, + Fmls_Ve, + Fmov_S, + Fmov_Si, + Fmov_Vi, + Fmov_Ftoi, + Fmov_Itof, + Fmov_Ftoi1, + Fmov_Itof1, + Fmsub_S, + Fmul_S, + Fmul_Se, + Fmul_V, + Fmul_Ve, + Fmulx_S, + Fmulx_Se, + Fmulx_V, + Fmulx_Ve, + Fneg_S, + Fneg_V, + Fnmadd_S, + Fnmsub_S, + Fnmul_S, + Frecpe_S, + Frecpe_V, + Frecps_S, + Frecps_V, + Frecpx_S, + Frinta_S, + Frinta_V, + Frinti_S, + Frinti_V, + Frintm_S, + Frintm_V, + Frintn_S, + Frintn_V, + Frintp_S, + Frintp_V, + Frintx_S, + Frintx_V, + Frintz_S, + Frintz_V, + Frsqrte_S, + Frsqrte_V, + Frsqrts_S, + Frsqrts_V, + Fsqrt_S, + Fsqrt_V, + Fsub_S, + Fsub_V, + Ins_Gp, + Ins_V, + Ld__Vms, + Ld__Vss, + Mla_V, + Mla_Ve, + Mls_V, + Mls_Ve, + Movi_V, + Mul_V, + Mul_Ve, + Mvni_V, + Neg_S, + Neg_V, + Not_V, + Orn_V, + Orr_V, + Orr_Vi, + Pmull_V, + Raddhn_V, + Rbit_V, + Rev16_V, + Rev32_V, + Rev64_V, + Rshrn_V, + Rsubhn_V, + Saba_V, + Sabal_V, + Sabd_V, + Sabdl_V, + Sadalp_V, + Saddl_V, + Saddlp_V, + Saddlv_V, + Saddw_V, + Scvtf_Gp, + Scvtf_Gp_Fixed, + Scvtf_S, + Scvtf_S_Fixed, + Scvtf_V, + Scvtf_V_Fixed, + Sha1c_V, + Sha1h_V, + Sha1m_V, + Sha1p_V, + Sha1su0_V, + Sha1su1_V, + Sha256h_V, + Sha256h2_V, + Sha256su0_V, + Sha256su1_V, + Shadd_V, + Shl_S, + Shl_V, + Shll_V, + Shrn_V, + Shsub_V, + Sli_S, + Sli_V, + Smax_V, + Smaxp_V, + Smaxv_V, + Smin_V, + Sminp_V, + Sminv_V, + Smlal_V, + Smlal_Ve, + Smlsl_V, + Smlsl_Ve, + Smov_S, + Smull_V, + Smull_Ve, + Sqabs_S, + Sqabs_V, + Sqadd_S, + Sqadd_V, + Sqdmulh_S, + Sqdmulh_V, + Sqdmulh_Ve, + Sqneg_S, + Sqneg_V, + Sqrdmulh_S, + Sqrdmulh_V, + Sqrdmulh_Ve, + Sqrshl_V, + Sqrshrn_S, + Sqrshrn_V, + Sqrshrun_S, + Sqrshrun_V, + Sqshl_V, + Sqshrn_S, + Sqshrn_V, + Sqshrun_S, + Sqshrun_V, + Sqsub_S, + Sqsub_V, + Sqxtn_S, + Sqxtn_V, + Sqxtun_S, + Sqxtun_V, + Srhadd_V, + Sri_S, + Sri_V, + Srshl_V, + Srshr_S, + Srshr_V, + Srsra_S, + Srsra_V, + Sshl_S, + Sshl_V, + Sshll_V, + Sshr_S, + Sshr_V, + Ssra_S, + Ssra_V, + Ssubl_V, + Ssubw_V, + St__Vms, + St__Vss, + Sub_S, + Sub_V, + Subhn_V, + Suqadd_S, + Suqadd_V, + Tbl_V, + Tbx_V, + Trn1_V, + Trn2_V, + Uaba_V, + Uabal_V, + Uabd_V, + Uabdl_V, + Uadalp_V, + Uaddl_V, + Uaddlp_V, + Uaddlv_V, + Uaddw_V, + Ucvtf_Gp, + Ucvtf_Gp_Fixed, + Ucvtf_S, + Ucvtf_S_Fixed, + Ucvtf_V, + Ucvtf_V_Fixed, + Uhadd_V, + Uhsub_V, + Umax_V, + Umaxp_V, + Umaxv_V, + Umin_V, + Uminp_V, + Uminv_V, + Umlal_V, + Umlal_Ve, + Umlsl_V, + Umlsl_Ve, + Umov_S, + Umull_V, + Umull_Ve, + Uqadd_S, + Uqadd_V, + Uqrshl_V, + Uqrshrn_S, + Uqrshrn_V, + Uqshl_V, + Uqshrn_S, + Uqshrn_V, + Uqsub_S, + Uqsub_V, + Uqxtn_S, + Uqxtn_V, + Urhadd_V, + Urshl_V, + Urshr_S, + Urshr_V, + Ursra_S, + Ursra_V, + Ushl_S, + Ushl_V, + Ushll_V, + Ushr_S, + Ushr_V, + Usqadd_S, + Usqadd_V, + Usra_S, + Usra_V, + Usubl_V, + Usubw_V, + Uzp1_V, + Uzp2_V, + Xtn_V, + Zip1_V, + Zip2_V, + + // Base (AArch32) + Bfc, + Bfi, + Blx, + Bx, + Cmp, + Cmn, + Movt, + Mul, + Lda, + Ldab, + Ldaex, + Ldaexb, + Ldaexd, + Ldaexh, + Ldah, + Ldm, + Ldrb, + Ldrd, + Ldrex, + Ldrexb, + Ldrexd, + Ldrexh, + Ldrh, + 
Ldrsb, + Ldrsh, + Mcr, + Mla, + Mls, + Mov, + Mrc, + Mrrc, + Mvn, + Pkh, + Pld, + Pop, + Push, + Rev, + Revsh, + Rsb, + Rsc, + Sadd8, + Sbfx, + Shadd8, + Smla__, + Smlal, + Smlal__, + Smlaw_, + Smmla, + Smmls, + Smul__, + Smmul, + Ssub8, + Stl, + Stlb, + Stlex, + Stlexb, + Stlexd, + Stlexh, + Stlh, + Stm, + Strb, + Strd, + Strex, + Strexb, + Strexd, + Strexh, + Strh, + Sxtb16, + Tbb, + Tbh, + Teq, + Trap, + Tst, + Uadd8, + Ubfx, + Uhadd8, + Uhsub8, + Umaal, + Umlal, + Umull, + Usat, + Usat16, + Usub8, + Uxtb, + Uxtb16, + Uxth, + + // FP & SIMD (AArch32) + Vabd, + Vabdl, + Vabs, + Vadd, + Vaddl, + Vaddw, + Vand, + Vbic, + Vbif, + Vbit, + Vbsl, + Vceq, + Vcge, + Vcgt, + Vcle, + Vclt, + Vcmp, + Vcmpe, + Vcnt, + Vcvt, + Vdiv, + Vdup, + Veor, + Vext, + Vfma, + Vfms, + Vfnma, + Vfnms, + Vhadd, + Vld1, + Vld2, + Vld3, + Vld4, + Vldm, + Vldr, + Vmax, + Vmaxnm, + Vmin, + Vminnm, + Vmla, + Vmlal, + Vmls, + Vmlsl, + Vmov, + Vmovl, + Vmovn, + Vmrs, + Vmsr, + Vmul, + Vmull, + Vmvn, + Vneg, + Vnmul, + Vnmla, + Vnmls, + Vorn, + Vorr, + Vpadd, + Vpaddl, + Vpmax, + Vpmin, + Vqadd, + Vqdmulh, + Vqmovn, + Vqmovun, + Vqrshrn, + Vqrshrun, + Vqshrn, + Vqshrun, + Vqsub, + Vrev, + Vrhadd, + Vrint, + Vrinta, + Vrintm, + Vrintn, + Vrintp, + Vrintx, + Vrshr, + Vrshrn, + Vsel, + Vshl, + Vshll, + Vshr, + Vshrn, + Vst1, + Vst2, + Vst3, + Vst4, + Vstm, + Vstr, + Vsqrt, + Vrecpe, + Vrecps, + Vrsqrte, + Vrsqrts, + Vrsra, + Vsra, + Vsub, + Vsubl, + Vsubw, + Vtbl, + Vtrn, + Vtst, + Vuzp, + Vzip, + } +} diff --git a/src/ARMeilleure/Instructions/NativeInterface.cs b/src/ARMeilleure/Instructions/NativeInterface.cs new file mode 100644 index 00000000..2c35387a --- /dev/null +++ b/src/ARMeilleure/Instructions/NativeInterface.cs @@ -0,0 +1,195 @@ +using ARMeilleure.Memory; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; + +namespace ARMeilleure.Instructions +{ + static class NativeInterface + { + private class ThreadContext + { + public ExecutionContext Context { get; } + public IMemoryManager Memory { get; } + public Translator Translator { get; } + + public ThreadContext(ExecutionContext context, IMemoryManager memory, Translator translator) + { + Context = context; + Memory = memory; + Translator = translator; + } + } + + [ThreadStatic] + private static ThreadContext Context; + + public static void RegisterThread(ExecutionContext context, IMemoryManager memory, Translator translator) + { + Context = new ThreadContext(context, memory, translator); + } + + public static void UnregisterThread() + { + Context = null; + } + + public static void Break(ulong address, int imm) + { + Statistics.PauseTimer(); + + GetContext().OnBreak(address, imm); + + Statistics.ResumeTimer(); + } + + public static void SupervisorCall(ulong address, int imm) + { + Statistics.PauseTimer(); + + GetContext().OnSupervisorCall(address, imm); + + Statistics.ResumeTimer(); + } + + public static void Undefined(ulong address, int opCode) + { + Statistics.PauseTimer(); + + GetContext().OnUndefined(address, opCode); + + Statistics.ResumeTimer(); + } + + #region "System registers" + public static ulong GetCtrEl0() + { + return (ulong)GetContext().CtrEl0; + } + + public static ulong GetDczidEl0() + { + return (ulong)GetContext().DczidEl0; + } + + public static ulong GetCntfrqEl0() + { + return GetContext().CntfrqEl0; + } + + public static ulong GetCntpctEl0() + { + return GetContext().CntpctEl0; + } + + public static ulong GetCntvctEl0() + { + return GetContext().CntvctEl0; + } + #endregion + + #region "Read" + public static byte 
ReadByte(ulong address) + { + return GetMemoryManager().ReadTracked<byte>(address); + } + + public static ushort ReadUInt16(ulong address) + { + return GetMemoryManager().ReadTracked<ushort>(address); + } + + public static uint ReadUInt32(ulong address) + { + return GetMemoryManager().ReadTracked<uint>(address); + } + + public static ulong ReadUInt64(ulong address) + { + return GetMemoryManager().ReadTracked<ulong>(address); + } + + public static V128 ReadVector128(ulong address) + { + return GetMemoryManager().ReadTracked<V128>(address); + } + #endregion + + #region "Write" + public static void WriteByte(ulong address, byte value) + { + GetMemoryManager().Write(address, value); + } + + public static void WriteUInt16(ulong address, ushort value) + { + GetMemoryManager().Write(address, value); + } + + public static void WriteUInt32(ulong address, uint value) + { + GetMemoryManager().Write(address, value); + } + + public static void WriteUInt64(ulong address, ulong value) + { + GetMemoryManager().Write(address, value); + } + + public static void WriteVector128(ulong address, V128 value) + { + GetMemoryManager().Write(address, value); + } + #endregion + + public static void EnqueueForRejit(ulong address) + { + Context.Translator.EnqueueForRejit(address, GetContext().ExecutionMode); + } + + public static void SignalMemoryTracking(ulong address, ulong size, bool write) + { + GetMemoryManager().SignalMemoryTracking(address, size, write); + } + + public static void ThrowInvalidMemoryAccess(ulong address) + { + throw new InvalidAccessException(address); + } + + public static ulong GetFunctionAddress(ulong address) + { + TranslatedFunction function = Context.Translator.GetOrTranslate(address, GetContext().ExecutionMode); + + return (ulong)function.FuncPointer.ToInt64(); + } + + public static void InvalidateCacheLine(ulong address) + { + Context.Translator.InvalidateJitCacheRegion(address, InstEmit.DczSizeInBytes); + } + + public static bool CheckSynchronization() + { + Statistics.PauseTimer(); + + ExecutionContext context = GetContext(); + + context.CheckInterrupt(); + + Statistics.ResumeTimer(); + + return context.Running; + } + + public static ExecutionContext GetContext() + { + return Context.Context; + } + + public static IMemoryManager GetMemoryManager() + { + return Context.Memory; + } + } +}
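// Usage sketch for the helpers above (a minimal, assumed setup; the execContext, memory and
// translator objects come from the caller): a thread registers itself before it runs
// translated code so the [ThreadStatic] Context can service the emitted calls, and
// unregisters afterwards.
//
//     NativeInterface.RegisterThread(execContext, memory, translator);
//     try
//     {
//         // run translated code; emitted calls such as ReadByte()/GetCntpctEl0()
//         // resolve through the per-thread Context registered above
//     }
//     finally
//     {
//         NativeInterface.UnregisterThread();
//     }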
\ No newline at end of file diff --git a/src/ARMeilleure/Instructions/SoftFallback.cs b/src/ARMeilleure/Instructions/SoftFallback.cs new file mode 100644 index 00000000..06d76a67 --- /dev/null +++ b/src/ARMeilleure/Instructions/SoftFallback.cs @@ -0,0 +1,624 @@ +using ARMeilleure.State; +using System; + +namespace ARMeilleure.Instructions +{ + static class SoftFallback + { +#region "ShrImm64" + public static long SignedShrImm64(long value, long roundConst, int shift) + { + if (roundConst == 0L) + { + if (shift <= 63) + { + return value >> shift; + } + else /* if (shift == 64) */ + { + if (value < 0L) + { + return -1L; + } + else /* if (value >= 0L) */ + { + return 0L; + } + } + } + else /* if (roundConst == 1L << (shift - 1)) */ + { + if (shift <= 63) + { + long add = value + roundConst; + + if ((~value & (value ^ add)) < 0L) + { + return (long)((ulong)add >> shift); + } + else + { + return add >> shift; + } + } + else /* if (shift == 64) */ + { + return 0L; + } + } + } + + public static ulong UnsignedShrImm64(ulong value, long roundConst, int shift) + { + if (roundConst == 0L) + { + if (shift <= 63) + { + return value >> shift; + } + else /* if (shift == 64) */ + { + return 0UL; + } + } + else /* if (roundConst == 1L << (shift - 1)) */ + { + ulong add = value + (ulong)roundConst; + + if ((add < value) && (add < (ulong)roundConst)) + { + if (shift <= 63) + { + return (add >> shift) | (0x8000000000000000UL >> (shift - 1)); + } + else /* if (shift == 64) */ + { + return 1UL; + } + } + else + { + if (shift <= 63) + { + return add >> shift; + } + else /* if (shift == 64) */ + { + return 0UL; + } + } + } + } +#endregion + +#region "Saturation" + public static int SatF32ToS32(float value) + { + if (float.IsNaN(value)) return 0; + + return value >= int.MaxValue ? int.MaxValue : + value <= int.MinValue ? int.MinValue : (int)value; + } + + public static long SatF32ToS64(float value) + { + if (float.IsNaN(value)) return 0; + + return value >= long.MaxValue ? long.MaxValue : + value <= long.MinValue ? long.MinValue : (long)value; + } + + public static uint SatF32ToU32(float value) + { + if (float.IsNaN(value)) return 0; + + return value >= uint.MaxValue ? uint.MaxValue : + value <= uint.MinValue ? uint.MinValue : (uint)value; + } + + public static ulong SatF32ToU64(float value) + { + if (float.IsNaN(value)) return 0; + + return value >= ulong.MaxValue ? ulong.MaxValue : + value <= ulong.MinValue ? ulong.MinValue : (ulong)value; + } + + public static int SatF64ToS32(double value) + { + if (double.IsNaN(value)) return 0; + + return value >= int.MaxValue ? int.MaxValue : + value <= int.MinValue ? int.MinValue : (int)value; + } + + public static long SatF64ToS64(double value) + { + if (double.IsNaN(value)) return 0; + + return value >= long.MaxValue ? long.MaxValue : + value <= long.MinValue ? long.MinValue : (long)value; + } + + public static uint SatF64ToU32(double value) + { + if (double.IsNaN(value)) return 0; + + return value >= uint.MaxValue ? uint.MaxValue : + value <= uint.MinValue ? uint.MinValue : (uint)value; + } + + public static ulong SatF64ToU64(double value) + { + if (double.IsNaN(value)) return 0; + + return value >= ulong.MaxValue ? ulong.MaxValue : + value <= ulong.MinValue ? ulong.MinValue : (ulong)value; + } +#endregion + +#region "Count" + public static ulong CountLeadingSigns(ulong value, int size) // size is 8, 16, 32 or 64 (SIMD&FP or Base Inst.). 
+ { + value ^= value >> 1; + + int highBit = size - 2; + + for (int bit = highBit; bit >= 0; bit--) + { + if (((int)(value >> bit) & 0b1) != 0) + { + return (ulong)(highBit - bit); + } + } + + return (ulong)(size - 1); + } + + private static ReadOnlySpan<byte> ClzNibbleTbl => new byte[] { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + + public static ulong CountLeadingZeros(ulong value, int size) // size is 8, 16, 32 or 64 (SIMD&FP or Base Inst.). + { + if (value == 0ul) + { + return (ulong)size; + } + + int nibbleIdx = size; + int preCount, count = 0; + + do + { + nibbleIdx -= 4; + preCount = ClzNibbleTbl[(int)(value >> nibbleIdx) & 0b1111]; + count += preCount; + } + while (preCount == 4); + + return (ulong)count; + } +#endregion + +#region "Table" + public static V128 Tbl1(V128 vector, int bytes, V128 tb0) + { + return TblOrTbx(default, vector, bytes, tb0); + } + + public static V128 Tbl2(V128 vector, int bytes, V128 tb0, V128 tb1) + { + return TblOrTbx(default, vector, bytes, tb0, tb1); + } + + public static V128 Tbl3(V128 vector, int bytes, V128 tb0, V128 tb1, V128 tb2) + { + return TblOrTbx(default, vector, bytes, tb0, tb1, tb2); + } + + public static V128 Tbl4(V128 vector, int bytes, V128 tb0, V128 tb1, V128 tb2, V128 tb3) + { + return TblOrTbx(default, vector, bytes, tb0, tb1, tb2, tb3); + } + + public static V128 Tbx1(V128 dest, V128 vector, int bytes, V128 tb0) + { + return TblOrTbx(dest, vector, bytes, tb0); + } + + public static V128 Tbx2(V128 dest, V128 vector, int bytes, V128 tb0, V128 tb1) + { + return TblOrTbx(dest, vector, bytes, tb0, tb1); + } + + public static V128 Tbx3(V128 dest, V128 vector, int bytes, V128 tb0, V128 tb1, V128 tb2) + { + return TblOrTbx(dest, vector, bytes, tb0, tb1, tb2); + } + + public static V128 Tbx4(V128 dest, V128 vector, int bytes, V128 tb0, V128 tb1, V128 tb2, V128 tb3) + { + return TblOrTbx(dest, vector, bytes, tb0, tb1, tb2, tb3); + } + + private static V128 TblOrTbx(V128 dest, V128 vector, int bytes, params V128[] tb) + { + byte[] res = new byte[16]; + + if (dest != default) + { + Buffer.BlockCopy(dest.ToArray(), 0, res, 0, bytes); + } + + byte[] table = new byte[tb.Length * 16]; + + for (byte index = 0; index < tb.Length; index++) + { + Buffer.BlockCopy(tb[index].ToArray(), 0, table, index * 16, 16); + } + + byte[] v = vector.ToArray(); + + for (byte index = 0; index < bytes; index++) + { + byte tblIndex = v[index]; + + if (tblIndex < table.Length) + { + res[index] = table[tblIndex]; + } + } + + return new V128(res); + } +#endregion + +#region "Crc32" + private const uint Crc32RevPoly = 0xedb88320; + private const uint Crc32cRevPoly = 0x82f63b78; + + public static uint Crc32b(uint crc, byte value) => Crc32 (crc, Crc32RevPoly, value); + public static uint Crc32h(uint crc, ushort value) => Crc32h(crc, Crc32RevPoly, value); + public static uint Crc32w(uint crc, uint value) => Crc32w(crc, Crc32RevPoly, value); + public static uint Crc32x(uint crc, ulong value) => Crc32x(crc, Crc32RevPoly, value); + + public static uint Crc32cb(uint crc, byte value) => Crc32 (crc, Crc32cRevPoly, value); + public static uint Crc32ch(uint crc, ushort value) => Crc32h(crc, Crc32cRevPoly, value); + public static uint Crc32cw(uint crc, uint value) => Crc32w(crc, Crc32cRevPoly, value); + public static uint Crc32cx(uint crc, ulong value) => Crc32x(crc, Crc32cRevPoly, value); + + private static uint Crc32h(uint crc, uint poly, ushort val) + { + crc = Crc32(crc, poly, (byte)(val >> 0)); + crc = Crc32(crc, poly, (byte)(val >> 8)); + + return crc; + } + + private 
static uint Crc32w(uint crc, uint poly, uint val) + { + crc = Crc32(crc, poly, (byte)(val >> 0)); + crc = Crc32(crc, poly, (byte)(val >> 8)); + crc = Crc32(crc, poly, (byte)(val >> 16)); + crc = Crc32(crc, poly, (byte)(val >> 24)); + + return crc; + } + + private static uint Crc32x(uint crc, uint poly, ulong val) + { + crc = Crc32(crc, poly, (byte)(val >> 0)); + crc = Crc32(crc, poly, (byte)(val >> 8)); + crc = Crc32(crc, poly, (byte)(val >> 16)); + crc = Crc32(crc, poly, (byte)(val >> 24)); + crc = Crc32(crc, poly, (byte)(val >> 32)); + crc = Crc32(crc, poly, (byte)(val >> 40)); + crc = Crc32(crc, poly, (byte)(val >> 48)); + crc = Crc32(crc, poly, (byte)(val >> 56)); + + return crc; + } + + private static uint Crc32(uint crc, uint poly, byte val) + { + crc ^= val; + + for (int bit = 7; bit >= 0; bit--) + { + uint mask = (uint)(-(int)(crc & 1)); + + crc = (crc >> 1) ^ (poly & mask); + } + + return crc; + } +#endregion + +#region "Aes" + public static V128 Decrypt(V128 value, V128 roundKey) + { + return CryptoHelper.AesInvSubBytes(CryptoHelper.AesInvShiftRows(value ^ roundKey)); + } + + public static V128 Encrypt(V128 value, V128 roundKey) + { + return CryptoHelper.AesSubBytes(CryptoHelper.AesShiftRows(value ^ roundKey)); + } + + public static V128 InverseMixColumns(V128 value) + { + return CryptoHelper.AesInvMixColumns(value); + } + + public static V128 MixColumns(V128 value) + { + return CryptoHelper.AesMixColumns(value); + } +#endregion + +#region "Sha1" + public static V128 HashChoose(V128 hash_abcd, uint hash_e, V128 wk) + { + for (int e = 0; e <= 3; e++) + { + uint t = ShaChoose(hash_abcd.Extract<uint>(1), + hash_abcd.Extract<uint>(2), + hash_abcd.Extract<uint>(3)); + + hash_e += Rol(hash_abcd.Extract<uint>(0), 5) + t + wk.Extract<uint>(e); + + t = Rol(hash_abcd.Extract<uint>(1), 30); + + hash_abcd.Insert(1, t); + + Rol32_160(ref hash_e, ref hash_abcd); + } + + return hash_abcd; + } + + public static uint FixedRotate(uint hash_e) + { + return hash_e.Rol(30); + } + + public static V128 HashMajority(V128 hash_abcd, uint hash_e, V128 wk) + { + for (int e = 0; e <= 3; e++) + { + uint t = ShaMajority(hash_abcd.Extract<uint>(1), + hash_abcd.Extract<uint>(2), + hash_abcd.Extract<uint>(3)); + + hash_e += Rol(hash_abcd.Extract<uint>(0), 5) + t + wk.Extract<uint>(e); + + t = Rol(hash_abcd.Extract<uint>(1), 30); + + hash_abcd.Insert(1, t); + + Rol32_160(ref hash_e, ref hash_abcd); + } + + return hash_abcd; + } + + public static V128 HashParity(V128 hash_abcd, uint hash_e, V128 wk) + { + for (int e = 0; e <= 3; e++) + { + uint t = ShaParity(hash_abcd.Extract<uint>(1), + hash_abcd.Extract<uint>(2), + hash_abcd.Extract<uint>(3)); + + hash_e += Rol(hash_abcd.Extract<uint>(0), 5) + t + wk.Extract<uint>(e); + + t = Rol(hash_abcd.Extract<uint>(1), 30); + + hash_abcd.Insert(1, t); + + Rol32_160(ref hash_e, ref hash_abcd); + } + + return hash_abcd; + } + + public static V128 Sha1SchedulePart1(V128 w0_3, V128 w4_7, V128 w8_11) + { + ulong t2 = w4_7.Extract<ulong>(0); + ulong t1 = w0_3.Extract<ulong>(1); + + V128 result = new V128(t1, t2); + + return result ^ (w0_3 ^ w8_11); + } + + public static V128 Sha1SchedulePart2(V128 tw0_3, V128 w12_15) + { + V128 t = tw0_3 ^ (w12_15 >> 32); + + uint tE0 = t.Extract<uint>(0); + uint tE1 = t.Extract<uint>(1); + uint tE2 = t.Extract<uint>(2); + uint tE3 = t.Extract<uint>(3); + + return new V128(tE0.Rol(1), tE1.Rol(1), tE2.Rol(1), tE3.Rol(1) ^ tE0.Rol(2)); + } + + private static void Rol32_160(ref uint y, ref V128 x) + { + uint xE3 = x.Extract<uint>(3); + + x <<= 32; + 
x.Insert(0, y); + + y = xE3; + } + + private static uint ShaChoose(uint x, uint y, uint z) + { + return ((y ^ z) & x) ^ z; + } + + private static uint ShaMajority(uint x, uint y, uint z) + { + return (x & y) | ((x | y) & z); + } + + private static uint ShaParity(uint x, uint y, uint z) + { + return x ^ y ^ z; + } + + private static uint Rol(this uint value, int count) + { + return (value << count) | (value >> (32 - count)); + } +#endregion + +#region "Sha256" + public static V128 HashLower(V128 hash_abcd, V128 hash_efgh, V128 wk) + { + return Sha256Hash(hash_abcd, hash_efgh, wk, part1: true); + } + + public static V128 HashUpper(V128 hash_abcd, V128 hash_efgh, V128 wk) + { + return Sha256Hash(hash_abcd, hash_efgh, wk, part1: false); + } + + public static V128 Sha256SchedulePart1(V128 w0_3, V128 w4_7) + { + V128 result = new V128(); + + for (int e = 0; e <= 3; e++) + { + uint elt = (e <= 2 ? w0_3 : w4_7).Extract<uint>(e <= 2 ? e + 1 : 0); + + elt = elt.Ror(7) ^ elt.Ror(18) ^ elt.Lsr(3); + + elt += w0_3.Extract<uint>(e); + + result.Insert(e, elt); + } + + return result; + } + + public static V128 Sha256SchedulePart2(V128 w0_3, V128 w8_11, V128 w12_15) + { + V128 result = new V128(); + + ulong t1 = w12_15.Extract<ulong>(1); + + for (int e = 0; e <= 1; e++) + { + uint elt = t1.ULongPart(e); + + elt = elt.Ror(17) ^ elt.Ror(19) ^ elt.Lsr(10); + + elt += w0_3.Extract<uint>(e) + w8_11.Extract<uint>(e + 1); + + result.Insert(e, elt); + } + + t1 = result.Extract<ulong>(0); + + for (int e = 2; e <= 3; e++) + { + uint elt = t1.ULongPart(e - 2); + + elt = elt.Ror(17) ^ elt.Ror(19) ^ elt.Lsr(10); + + elt += w0_3.Extract<uint>(e) + (e == 2 ? w8_11 : w12_15).Extract<uint>(e == 2 ? 3 : 0); + + result.Insert(e, elt); + } + + return result; + } + + private static V128 Sha256Hash(V128 x, V128 y, V128 w, bool part1) + { + for (int e = 0; e <= 3; e++) + { + uint chs = ShaChoose(y.Extract<uint>(0), + y.Extract<uint>(1), + y.Extract<uint>(2)); + + uint maj = ShaMajority(x.Extract<uint>(0), + x.Extract<uint>(1), + x.Extract<uint>(2)); + + uint t1 = y.Extract<uint>(3) + ShaHashSigma1(y.Extract<uint>(0)) + chs + w.Extract<uint>(e); + + uint t2 = t1 + x.Extract<uint>(3); + + x.Insert(3, t2); + + t2 = t1 + ShaHashSigma0(x.Extract<uint>(0)) + maj; + + y.Insert(3, t2); + + Rol32_256(ref y, ref x); + } + + return part1 ? x : y; + } + + private static void Rol32_256(ref V128 y, ref V128 x) + { + uint yE3 = y.Extract<uint>(3); + uint xE3 = x.Extract<uint>(3); + + y <<= 32; + x <<= 32; + + y.Insert(0, xE3); + x.Insert(0, yE3); + } + + private static uint ShaHashSigma0(uint x) + { + return x.Ror(2) ^ x.Ror(13) ^ x.Ror(22); + } + + private static uint ShaHashSigma1(uint x) + { + return x.Ror(6) ^ x.Ror(11) ^ x.Ror(25); + } + + private static uint Ror(this uint value, int count) + { + return (value >> count) | (value << (32 - count)); + } + + private static uint Lsr(this uint value, int count) + { + return value >> count; + } + + private static uint ULongPart(this ulong value, int part) + { + return part == 0 + ? 
(uint)(value & 0xFFFFFFFFUL) + : (uint)(value >> 32); + } +#endregion + + public static V128 PolynomialMult64_128(ulong op1, ulong op2) + { + V128 result = V128.Zero; + + V128 op2_128 = new V128(op2, 0); + + for (int i = 0; i < 64; i++) + { + if (((op1 >> i) & 1) == 1) + { + result ^= op2_128 << i; + } + } + + return result; + } + } +} diff --git a/src/ARMeilleure/Instructions/SoftFloat.cs b/src/ARMeilleure/Instructions/SoftFloat.cs new file mode 100644 index 00000000..9e3db68d --- /dev/null +++ b/src/ARMeilleure/Instructions/SoftFloat.cs @@ -0,0 +1,3480 @@ +using ARMeilleure.State; +using System; +using System.Diagnostics; + +namespace ARMeilleure.Instructions +{ + static class SoftFloat + { + static SoftFloat() + { + RecipEstimateTable = BuildRecipEstimateTable(); + RecipSqrtEstimateTable = BuildRecipSqrtEstimateTable(); + } + + public static readonly byte[] RecipEstimateTable; + public static readonly byte[] RecipSqrtEstimateTable; + + private static byte[] BuildRecipEstimateTable() + { + byte[] tbl = new byte[256]; + + for (int idx = 0; idx < 256; idx++) + { + uint src = (uint)idx + 256u; + + Debug.Assert(256u <= src && src < 512u); + + src = (src << 1) + 1u; + + uint aux = (1u << 19) / src; + + uint dst = (aux + 1u) >> 1; + + Debug.Assert(256u <= dst && dst < 512u); + + tbl[idx] = (byte)(dst - 256u); + } + + return tbl; + } + + private static byte[] BuildRecipSqrtEstimateTable() + { + byte[] tbl = new byte[384]; + + for (int idx = 0; idx < 384; idx++) + { + uint src = (uint)idx + 128u; + + Debug.Assert(128u <= src && src < 512u); + + if (src < 256u) + { + src = (src << 1) + 1u; + } + else + { + src = (src >> 1) << 1; + src = (src + 1u) << 1; + } + + uint aux = 512u; + + while (src * (aux + 1u) * (aux + 1u) < (1u << 28)) + { + aux = aux + 1u; + } + + uint dst = (aux + 1u) >> 1; + + Debug.Assert(256u <= dst && dst < 512u); + + tbl[idx] = (byte)(dst - 256u); + } + + return tbl; + } + + public static void FPProcessException(FPException exc, ExecutionContext context) + { + FPProcessException(exc, context, context.Fpcr); + } + + public static void FPProcessException(FPException exc, ExecutionContext context, FPCR fpcr) + { + int enable = (int)exc + 8; + + if ((fpcr & (FPCR)(1 << enable)) != 0) + { + throw new NotImplementedException("Floating-point trap handling."); + } + else + { + context.Fpsr |= (FPSR)(1 << (int)exc); + } + } + + public static FPRoundingMode GetRoundingMode(this FPCR fpcr) + { + const int RModeShift = 22; + + return (FPRoundingMode)(((uint)fpcr >> RModeShift) & 3u); + } + } + + static class SoftFloat16 + { + public static ushort FPDefaultNaN() + { + return (ushort)0x7E00u; + } + + public static ushort FPInfinity(bool sign) + { + return sign ? (ushort)0xFC00u : (ushort)0x7C00u; + } + + public static ushort FPZero(bool sign) + { + return sign ? (ushort)0x8000u : (ushort)0x0000u; + } + + public static ushort FPMaxNormal(bool sign) + { + return sign ? (ushort)0xFBFFu : (ushort)0x7BFFu; + } + + public static double FPUnpackCv( + this ushort valueBits, + out FPType type, + out bool sign, + ExecutionContext context) + { + sign = (~(uint)valueBits & 0x8000u) == 0u; + + uint exp16 = ((uint)valueBits & 0x7C00u) >> 10; + uint frac16 = (uint)valueBits & 0x03FFu; + + double real; + + if (exp16 == 0u) + { + if (frac16 == 0u) + { + type = FPType.Zero; + real = 0d; + } + else + { + type = FPType.Nonzero; // Subnormal. 
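// FP16 subnormal: the exponent field is zero and there is no implicit leading one, so the value is (frac16 / 2^10) * 2^-14, which is what the next line computes.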
+ real = Math.Pow(2d, -14) * ((double)frac16 * Math.Pow(2d, -10)); + } + } + else if (exp16 == 0x1Fu && (context.Fpcr & FPCR.Ahp) == 0) + { + if (frac16 == 0u) + { + type = FPType.Infinity; + real = Math.Pow(2d, 1000); + } + else + { + type = (~frac16 & 0x0200u) == 0u ? FPType.QNaN : FPType.SNaN; + real = 0d; + } + } + else + { + type = FPType.Nonzero; // Normal. + real = Math.Pow(2d, (int)exp16 - 15) * (1d + (double)frac16 * Math.Pow(2d, -10)); + } + + return sign ? -real : real; + } + + public static ushort FPRoundCv(double real, ExecutionContext context) + { + const int minimumExp = -14; + + const int e = 5; + const int f = 10; + + bool sign; + double mantissa; + + if (real < 0d) + { + sign = true; + mantissa = -real; + } + else + { + sign = false; + mantissa = real; + } + + int exponent = 0; + + while (mantissa < 1d) + { + mantissa *= 2d; + exponent--; + } + + while (mantissa >= 2d) + { + mantissa /= 2d; + exponent++; + } + + uint biasedExp = (uint)Math.Max(exponent - minimumExp + 1, 0); + + if (biasedExp == 0u) + { + mantissa /= Math.Pow(2d, minimumExp - exponent); + } + + uint intMant = (uint)Math.Floor(mantissa * Math.Pow(2d, f)); + double error = mantissa * Math.Pow(2d, f) - (double)intMant; + + if (biasedExp == 0u && (error != 0d || (context.Fpcr & FPCR.Ufe) != 0)) + { + SoftFloat.FPProcessException(FPException.Underflow, context); + } + + bool overflowToInf; + bool roundUp; + + switch (context.Fpcr.GetRoundingMode()) + { + default: + case FPRoundingMode.ToNearest: + roundUp = (error > 0.5d || (error == 0.5d && (intMant & 1u) == 1u)); + overflowToInf = true; + break; + + case FPRoundingMode.TowardsPlusInfinity: + roundUp = (error != 0d && !sign); + overflowToInf = !sign; + break; + + case FPRoundingMode.TowardsMinusInfinity: + roundUp = (error != 0d && sign); + overflowToInf = sign; + break; + + case FPRoundingMode.TowardsZero: + roundUp = false; + overflowToInf = false; + break; + } + + if (roundUp) + { + intMant++; + + if (intMant == 1u << f) + { + biasedExp = 1u; + } + + if (intMant == 1u << (f + 1)) + { + biasedExp++; + intMant >>= 1; + } + } + + ushort resultBits; + + if ((context.Fpcr & FPCR.Ahp) == 0) + { + if (biasedExp >= (1u << e) - 1u) + { + resultBits = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign); + + SoftFloat.FPProcessException(FPException.Overflow, context); + + error = 1d; + } + else + { + resultBits = (ushort)((sign ? 1u : 0u) << 15 | (biasedExp & 0x1Fu) << 10 | (intMant & 0x03FFu)); + } + } + else + { + if (biasedExp >= 1u << e) + { + resultBits = (ushort)((sign ? 1u : 0u) << 15 | 0x7FFFu); + + SoftFloat.FPProcessException(FPException.InvalidOp, context); + + error = 0d; + } + else + { + resultBits = (ushort)((sign ? 
1u : 0u) << 15 | (biasedExp & 0x1Fu) << 10 | (intMant & 0x03FFu)); + } + } + + if (error != 0d) + { + SoftFloat.FPProcessException(FPException.Inexact, context); + } + + return resultBits; + } + } + + static class SoftFloat16_32 + { + public static float FPConvert(ushort valueBits) + { + ExecutionContext context = NativeInterface.GetContext(); + + double real = valueBits.FPUnpackCv(out FPType type, out bool sign, context); + + float result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + if ((context.Fpcr & FPCR.Dn) != 0) + { + result = SoftFloat32.FPDefaultNaN(); + } + else + { + result = FPConvertNaN(valueBits); + } + + if (type == FPType.SNaN) + { + SoftFloat.FPProcessException(FPException.InvalidOp, context); + } + } + else if (type == FPType.Infinity) + { + result = SoftFloat32.FPInfinity(sign); + } + else if (type == FPType.Zero) + { + result = SoftFloat32.FPZero(sign); + } + else + { + result = FPRoundCv(real, context); + } + + return result; + } + + private static float FPRoundCv(double real, ExecutionContext context) + { + const int minimumExp = -126; + + const int e = 8; + const int f = 23; + + bool sign; + double mantissa; + + if (real < 0d) + { + sign = true; + mantissa = -real; + } + else + { + sign = false; + mantissa = real; + } + + int exponent = 0; + + while (mantissa < 1d) + { + mantissa *= 2d; + exponent--; + } + + while (mantissa >= 2d) + { + mantissa /= 2d; + exponent++; + } + + if ((context.Fpcr & FPCR.Fz) != 0 && exponent < minimumExp) + { + context.Fpsr |= FPSR.Ufc; + + return SoftFloat32.FPZero(sign); + } + + uint biasedExp = (uint)Math.Max(exponent - minimumExp + 1, 0); + + if (biasedExp == 0u) + { + mantissa /= Math.Pow(2d, minimumExp - exponent); + } + + uint intMant = (uint)Math.Floor(mantissa * Math.Pow(2d, f)); + double error = mantissa * Math.Pow(2d, f) - (double)intMant; + + if (biasedExp == 0u && (error != 0d || (context.Fpcr & FPCR.Ufe) != 0)) + { + SoftFloat.FPProcessException(FPException.Underflow, context); + } + + bool overflowToInf; + bool roundUp; + + switch (context.Fpcr.GetRoundingMode()) + { + default: + case FPRoundingMode.ToNearest: + roundUp = (error > 0.5d || (error == 0.5d && (intMant & 1u) == 1u)); + overflowToInf = true; + break; + + case FPRoundingMode.TowardsPlusInfinity: + roundUp = (error != 0d && !sign); + overflowToInf = !sign; + break; + + case FPRoundingMode.TowardsMinusInfinity: + roundUp = (error != 0d && sign); + overflowToInf = sign; + break; + + case FPRoundingMode.TowardsZero: + roundUp = false; + overflowToInf = false; + break; + } + + if (roundUp) + { + intMant++; + + if (intMant == 1u << f) + { + biasedExp = 1u; + } + + if (intMant == 1u << (f + 1)) + { + biasedExp++; + intMant >>= 1; + } + } + + float result; + + if (biasedExp >= (1u << e) - 1u) + { + result = overflowToInf ? SoftFloat32.FPInfinity(sign) : SoftFloat32.FPMaxNormal(sign); + + SoftFloat.FPProcessException(FPException.Overflow, context); + + error = 1d; + } + else + { + result = BitConverter.Int32BitsToSingle( + (int)((sign ? 
1u : 0u) << 31 | (biasedExp & 0xFFu) << 23 | (intMant & 0x007FFFFFu))); + } + + if (error != 0d) + { + SoftFloat.FPProcessException(FPException.Inexact, context); + } + + return result; + } + + private static float FPConvertNaN(ushort valueBits) + { + return BitConverter.Int32BitsToSingle( + (int)(((uint)valueBits & 0x8000u) << 16 | 0x7FC00000u | ((uint)valueBits & 0x01FFu) << 13)); + } + } + + static class SoftFloat16_64 + { + public static double FPConvert(ushort valueBits) + { + ExecutionContext context = NativeInterface.GetContext(); + + double real = valueBits.FPUnpackCv(out FPType type, out bool sign, context); + + double result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + if ((context.Fpcr & FPCR.Dn) != 0) + { + result = SoftFloat64.FPDefaultNaN(); + } + else + { + result = FPConvertNaN(valueBits); + } + + if (type == FPType.SNaN) + { + SoftFloat.FPProcessException(FPException.InvalidOp, context); + } + } + else if (type == FPType.Infinity) + { + result = SoftFloat64.FPInfinity(sign); + } + else if (type == FPType.Zero) + { + result = SoftFloat64.FPZero(sign); + } + else + { + result = FPRoundCv(real, context); + } + + return result; + } + + private static double FPRoundCv(double real, ExecutionContext context) + { + const int minimumExp = -1022; + + const int e = 11; + const int f = 52; + + bool sign; + double mantissa; + + if (real < 0d) + { + sign = true; + mantissa = -real; + } + else + { + sign = false; + mantissa = real; + } + + int exponent = 0; + + while (mantissa < 1d) + { + mantissa *= 2d; + exponent--; + } + + while (mantissa >= 2d) + { + mantissa /= 2d; + exponent++; + } + + if ((context.Fpcr & FPCR.Fz) != 0 && exponent < minimumExp) + { + context.Fpsr |= FPSR.Ufc; + + return SoftFloat64.FPZero(sign); + } + + uint biasedExp = (uint)Math.Max(exponent - minimumExp + 1, 0); + + if (biasedExp == 0u) + { + mantissa /= Math.Pow(2d, minimumExp - exponent); + } + + ulong intMant = (ulong)Math.Floor(mantissa * Math.Pow(2d, f)); + double error = mantissa * Math.Pow(2d, f) - (double)intMant; + + if (biasedExp == 0u && (error != 0d || (context.Fpcr & FPCR.Ufe) != 0)) + { + SoftFloat.FPProcessException(FPException.Underflow, context); + } + + bool overflowToInf; + bool roundUp; + + switch (context.Fpcr.GetRoundingMode()) + { + default: + case FPRoundingMode.ToNearest: + roundUp = (error > 0.5d || (error == 0.5d && (intMant & 1u) == 1u)); + overflowToInf = true; + break; + + case FPRoundingMode.TowardsPlusInfinity: + roundUp = (error != 0d && !sign); + overflowToInf = !sign; + break; + + case FPRoundingMode.TowardsMinusInfinity: + roundUp = (error != 0d && sign); + overflowToInf = sign; + break; + + case FPRoundingMode.TowardsZero: + roundUp = false; + overflowToInf = false; + break; + } + + if (roundUp) + { + intMant++; + + if (intMant == 1ul << f) + { + biasedExp = 1u; + } + + if (intMant == 1ul << (f + 1)) + { + biasedExp++; + intMant >>= 1; + } + } + + double result; + + if (biasedExp >= (1u << e) - 1u) + { + result = overflowToInf ? SoftFloat64.FPInfinity(sign) : SoftFloat64.FPMaxNormal(sign); + + SoftFloat.FPProcessException(FPException.Overflow, context); + + error = 1d; + } + else + { + result = BitConverter.Int64BitsToDouble( + (long)((sign ? 
1ul : 0ul) << 63 | (biasedExp & 0x7FFul) << 52 | (intMant & 0x000FFFFFFFFFFFFFul))); + } + + if (error != 0d) + { + SoftFloat.FPProcessException(FPException.Inexact, context); + } + + return result; + } + + private static double FPConvertNaN(ushort valueBits) + { + return BitConverter.Int64BitsToDouble( + (long)(((ulong)valueBits & 0x8000ul) << 48 | 0x7FF8000000000000ul | ((ulong)valueBits & 0x01FFul) << 42)); + } + } + + static class SoftFloat32_16 + { + public static ushort FPConvert(float value) + { + ExecutionContext context = NativeInterface.GetContext(); + + double real = value.FPUnpackCv(out FPType type, out bool sign, out uint valueBits, context); + + bool altHp = (context.Fpcr & FPCR.Ahp) != 0; + + ushort resultBits; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + if (altHp) + { + resultBits = SoftFloat16.FPZero(sign); + } + else if ((context.Fpcr & FPCR.Dn) != 0) + { + resultBits = SoftFloat16.FPDefaultNaN(); + } + else + { + resultBits = FPConvertNaN(valueBits); + } + + if (type == FPType.SNaN || altHp) + { + SoftFloat.FPProcessException(FPException.InvalidOp, context); + } + } + else if (type == FPType.Infinity) + { + if (altHp) + { + resultBits = (ushort)((sign ? 1u : 0u) << 15 | 0x7FFFu); + + SoftFloat.FPProcessException(FPException.InvalidOp, context); + } + else + { + resultBits = SoftFloat16.FPInfinity(sign); + } + } + else if (type == FPType.Zero) + { + resultBits = SoftFloat16.FPZero(sign); + } + else + { + resultBits = SoftFloat16.FPRoundCv(real, context); + } + + return resultBits; + } + + private static double FPUnpackCv( + this float value, + out FPType type, + out bool sign, + out uint valueBits, + ExecutionContext context) + { + valueBits = (uint)BitConverter.SingleToInt32Bits(value); + + sign = (~valueBits & 0x80000000u) == 0u; + + uint exp32 = (valueBits & 0x7F800000u) >> 23; + uint frac32 = valueBits & 0x007FFFFFu; + + double real; + + if (exp32 == 0u) + { + if (frac32 == 0u || (context.Fpcr & FPCR.Fz) != 0) + { + type = FPType.Zero; + real = 0d; + + if (frac32 != 0u) + { + SoftFloat.FPProcessException(FPException.InputDenorm, context); + } + } + else + { + type = FPType.Nonzero; // Subnormal. + real = Math.Pow(2d, -126) * ((double)frac32 * Math.Pow(2d, -23)); + } + } + else if (exp32 == 0xFFu) + { + if (frac32 == 0u) + { + type = FPType.Infinity; + real = Math.Pow(2d, 1000); + } + else + { + type = (~frac32 & 0x00400000u) == 0u ? FPType.QNaN : FPType.SNaN; + real = 0d; + } + } + else + { + type = FPType.Nonzero; // Normal. + real = Math.Pow(2d, (int)exp32 - 127) * (1d + (double)frac32 * Math.Pow(2d, -23)); + } + + return sign ? -real : real; + } + + private static ushort FPConvertNaN(uint valueBits) + { + return (ushort)((valueBits & 0x80000000u) >> 16 | 0x7E00u | (valueBits & 0x003FE000u) >> 13); + } + } + + static class SoftFloat32 + { + public static float FPAdd(float value1, float value2) + { + return FPAddFpscr(value1, value2, false); + } + + public static float FPAddFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if (inf1 && inf2 && sign1 == !sign2) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if ((inf1 && !sign1) || (inf2 && !sign2)) + { + result = FPInfinity(false); + } + else if ((inf1 && sign1) || (inf2 && sign2)) + { + result = FPInfinity(true); + } + else if (zero1 && zero2 && sign1 == sign2) + { + result = FPZero(sign1); + } + else + { + result = value1 + value2; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static int FPCompare(float value1, float value2, bool signalNaNs) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out _, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out _, context, fpcr); + + int result; + + if (type1 == FPType.SNaN || type1 == FPType.QNaN || type2 == FPType.SNaN || type2 == FPType.QNaN) + { + result = 0b0011; + + if (type1 == FPType.SNaN || type2 == FPType.SNaN || signalNaNs) + { + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + } + else + { + if (value1 == value2) + { + result = 0b0110; + } + else if (value1 < value2) + { + result = 0b1000; + } + else + { + result = 0b0010; + } + } + + return result; + } + + public static float FPCompareEQ(float value1, float value2) + { + return FPCompareEQFpscr(value1, value2, false); + } + + public static float FPCompareEQFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + float result; + + if (type1 == FPType.SNaN || type1 == FPType.QNaN || type2 == FPType.SNaN || type2 == FPType.QNaN) + { + result = ZerosOrOnes(false); + + if (type1 == FPType.SNaN || type2 == FPType.SNaN) + { + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + } + else + { + result = ZerosOrOnes(value1 == value2); + } + + return result; + } + + public static float FPCompareGE(float value1, float value2) + { + return FPCompareGEFpscr(value1, value2, false); + } + + public static float FPCompareGEFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + float result; + + if (type1 == FPType.SNaN || type1 == FPType.QNaN || type2 == FPType.SNaN || type2 == FPType.QNaN) + { + result = ZerosOrOnes(false); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else + { + result = ZerosOrOnes(value1 >= value2); + } + + return result; + } + + public static float FPCompareGT(float value1, float value2) + { + return FPCompareGTFpscr(value1, value2, false); + } + + public static float FPCompareGTFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + float result; + + if (type1 == FPType.SNaN || type1 == FPType.QNaN || type2 == FPType.SNaN || type2 == FPType.QNaN) + { + result = ZerosOrOnes(false); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else + { + result = ZerosOrOnes(value1 > value2); + } + + return result; + } + + public static float FPCompareLE(float value1, float value2) + { + return FPCompareGE(value2, value1); + } + + public static float FPCompareLT(float value1, float value2) + { + return FPCompareGT(value2, value1); + } + + public static float FPCompareLEFpscr(float value1, float value2, bool standardFpscr) + { + return FPCompareGEFpscr(value2, value1, standardFpscr); + } + + public static float FPCompareLTFpscr(float value1, float value2, bool standardFpscr) + { + return FPCompareGTFpscr(value2, value1, standardFpscr); + } + + public static float FPDiv(float value1, float value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && inf2) || (zero1 && zero2)) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if (inf1 || zero2) + { + result = FPInfinity(sign1 ^ sign2); + + if (!inf1) + { + SoftFloat.FPProcessException(FPException.DivideByZero, context, fpcr); + } + } + else if (zero1 || inf2) + { + result = FPZero(sign1 ^ sign2); + } + else + { + result = value1 / value2; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static float FPMax(float value1, float value2) + { + return FPMaxFpscr(value1, value2, false); + } + + public static float FPMaxFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + if (value1 > value2) + { + if (type1 == FPType.Infinity) + { + result = FPInfinity(sign1); + } + else if (type1 == FPType.Zero) + { + result = FPZero(sign1 && sign2); + } + else + { + result = value1; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + else + { + if (type2 == FPType.Infinity) + { + result = FPInfinity(sign2); + } + else if (type2 == FPType.Zero) + { + result = FPZero(sign1 && sign2); + } + else + { + result = value2; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + } + + return result; + } + + public static float FPMaxNum(float value1, float value2) + { + return FPMaxNumFpscr(value1, value2, false); + } + + public static float FPMaxNumFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + if (type1 == FPType.QNaN && type2 != FPType.QNaN) + { + value1 = FPInfinity(true); + } + else if (type1 != FPType.QNaN && type2 == FPType.QNaN) + { + value2 = FPInfinity(true); + } + + return FPMaxFpscr(value1, value2, standardFpscr); + } + + public static float FPMin(float value1, float value2) + { + return FPMinFpscr(value1, value2, false); + } + + public static float FPMinFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + if (value1 < value2) + { + if (type1 == FPType.Infinity) + { + result = FPInfinity(sign1); + } + else if (type1 == FPType.Zero) + { + result = FPZero(sign1 || sign2); + } + else + { + result = value1; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + else + { + if (type2 == FPType.Infinity) + { + result = FPInfinity(sign2); + } + else if (type2 == FPType.Zero) + { + result = FPZero(sign1 || sign2); + } + else + { + result = value2; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + } + + return result; + } + + public static float FPMinNum(float value1, float value2) + { + return FPMinNumFpscr(value1, value2, false); + } + + public static float FPMinNumFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + if (type1 == FPType.QNaN && type2 != FPType.QNaN) + { + value1 = FPInfinity(false); + } + else if (type1 != FPType.QNaN && type2 == FPType.QNaN) + { + value2 = FPInfinity(false); + } + + return FPMinFpscr(value1, value2, standardFpscr); + } + + public static float FPMul(float value1, float value2) + { + return FPMulFpscr(value1, value2, false); + } + + public static float FPMulFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if (inf1 || inf2) + { + result = FPInfinity(sign1 ^ sign2); + } + else if (zero1 || zero2) + { + result = FPZero(sign1 ^ sign2); + } + else + { + result = value1 * value2; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static float FPMulAdd(float valueA, float value1, float value2) + { + return FPMulAddFpscr(valueA, value1, value2, false); + } + + public static float FPMulAddFpscr(float valueA, float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + valueA = valueA.FPUnpack(out FPType typeA, out bool signA, out uint addend, context, fpcr); + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + float result = FPProcessNaNs3(typeA, type1, type2, addend, op1, op2, out bool done, context, fpcr); + + if (typeA == FPType.QNaN && ((inf1 && zero2) || (zero1 && inf2))) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + + if (!done) + { + bool infA = typeA == FPType.Infinity; bool zeroA = typeA == FPType.Zero; + + bool signP = sign1 ^ sign2; + bool infP = inf1 || inf2; + bool zeroP = zero1 || zero2; + + if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP)) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if ((infA && !signA) || (infP && !signP)) + { + result = FPInfinity(false); + } + else if ((infA && signA) || (infP && signP)) + { + result = FPInfinity(true); + } + else if (zeroA && zeroP && signA == signP) + { + result = FPZero(signA); + } + else + { + result = MathF.FusedMultiplyAdd(value1, value2, valueA); + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static float FPMulSub(float valueA, float value1, float value2) + { + value1 = value1.FPNeg(); + + return FPMulAdd(valueA, value1, value2); + } + + public static float FPMulSubFpscr(float valueA, float value1, float value2, bool standardFpscr) + { + value1 = value1.FPNeg(); + + return FPMulAddFpscr(valueA, value1, value2, standardFpscr); + } + + public static float FPMulX(float value1, float value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + result = FPTwo(sign1 ^ sign2); + } + else if (inf1 || inf2) + { + result = FPInfinity(sign1 ^ sign2); + } + else if (zero1 || zero2) + { + result = FPZero(sign1 ^ sign2); + } + else + { + result = value1 * value2; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static float FPNegMulAdd(float valueA, float value1, float value2) + { + valueA = valueA.FPNeg(); + value1 = value1.FPNeg(); + + return FPMulAdd(valueA, value1, value2); + } + + public static float FPNegMulSub(float valueA, float value1, float value2) + { + valueA = valueA.FPNeg(); + + return FPMulAdd(valueA, value1, value2); + } + + public static float FPRecipEstimate(float value) + { + return FPRecipEstimateFpscr(value, false); + } + + public static float FPRecipEstimateFpscr(float value, bool standardFpscr) + { + ExecutionContext context = 
NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value.FPUnpack(out FPType type, out bool sign, out uint op, context, fpcr); + + float result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + result = FPProcessNaN(type, op, context, fpcr); + } + else if (type == FPType.Infinity) + { + result = FPZero(sign); + } + else if (type == FPType.Zero) + { + result = FPInfinity(sign); + + SoftFloat.FPProcessException(FPException.DivideByZero, context, fpcr); + } + else if (MathF.Abs(value) < MathF.Pow(2f, -128)) + { + bool overflowToInf; + + switch (fpcr.GetRoundingMode()) + { + default: + case FPRoundingMode.ToNearest: overflowToInf = true; break; + case FPRoundingMode.TowardsPlusInfinity: overflowToInf = !sign; break; + case FPRoundingMode.TowardsMinusInfinity: overflowToInf = sign; break; + case FPRoundingMode.TowardsZero: overflowToInf = false; break; + } + + result = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign); + + SoftFloat.FPProcessException(FPException.Overflow, context, fpcr); + SoftFloat.FPProcessException(FPException.Inexact, context, fpcr); + } + else if ((fpcr & FPCR.Fz) != 0 && (MathF.Abs(value) >= MathF.Pow(2f, 126))) + { + result = FPZero(sign); + + context.Fpsr |= FPSR.Ufc; + } + else + { + ulong fraction = (ulong)(op & 0x007FFFFFu) << 29; + uint exp = (op & 0x7F800000u) >> 23; + + if (exp == 0u) + { + if ((fraction & 0x0008000000000000ul) == 0ul) + { + fraction = (fraction & 0x0003FFFFFFFFFFFFul) << 2; + exp -= 1u; + } + else + { + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + } + } + + uint scaled = (uint)(((fraction & 0x000FF00000000000ul) | 0x0010000000000000ul) >> 44); + + uint resultExp = 253u - exp; + + uint estimate = (uint)SoftFloat.RecipEstimateTable[scaled - 256u] + 256u; + + fraction = (ulong)(estimate & 0xFFu) << 44; + + if (resultExp == 0u) + { + fraction = ((fraction & 0x000FFFFFFFFFFFFEul) | 0x0010000000000000ul) >> 1; + } + else if (resultExp + 1u == 0u) + { + fraction = ((fraction & 0x000FFFFFFFFFFFFCul) | 0x0010000000000000ul) >> 2; + resultExp = 0u; + } + + result = BitConverter.Int32BitsToSingle( + (int)((sign ? 
1u : 0u) << 31 | (resultExp & 0xFFu) << 23 | (uint)(fraction >> 29) & 0x007FFFFFu)); + } + + return result; + } + + public static float FPRecipStep(float value1, float value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.StandardFpcrValue; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + float product; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + product = FPZero(false); + } + else + { + product = FPMulFpscr(value1, value2, true); + } + + result = FPSubFpscr(FPTwo(false), product, true); + } + + return result; + } + + public static float FPRecipStepFused(float value1, float value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPNeg(); + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + result = FPTwo(false); + } + else if (inf1 || inf2) + { + result = FPInfinity(sign1 ^ sign2); + } + else + { + result = MathF.FusedMultiplyAdd(value1, value2, 2f); + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static float FPRecpX(float value) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value.FPUnpack(out FPType type, out bool sign, out uint op, context, fpcr); + + float result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + result = FPProcessNaN(type, op, context, fpcr); + } + else + { + uint notExp = (~op >> 23) & 0xFFu; + uint maxExp = 0xFEu; + + result = BitConverter.Int32BitsToSingle( + (int)((sign ? 1u : 0u) << 31 | (notExp == 0xFFu ? maxExp : notExp) << 23)); + } + + return result; + } + + public static float FPRSqrtEstimate(float value) + { + return FPRSqrtEstimateFpscr(value, false); + } + + public static float FPRSqrtEstimateFpscr(float value, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value.FPUnpack(out FPType type, out bool sign, out uint op, context, fpcr); + + float result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + result = FPProcessNaN(type, op, context, fpcr); + } + else if (type == FPType.Zero) + { + result = FPInfinity(sign); + + SoftFloat.FPProcessException(FPException.DivideByZero, context, fpcr); + } + else if (sign) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if (type == FPType.Infinity) + { + result = FPZero(false); + } + else + { + ulong fraction = (ulong)(op & 0x007FFFFFu) << 29; + uint exp = (op & 0x7F800000u) >> 23; + + if (exp == 0u) + { + while ((fraction & 0x0008000000000000ul) == 0ul) + { + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + exp -= 1u; + } + + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + } + + uint scaled; + + if ((exp & 1u) == 0u) + { + scaled = (uint)(((fraction & 0x000FF00000000000ul) | 0x0010000000000000ul) >> 44); + } + else + { + scaled = (uint)(((fraction & 0x000FE00000000000ul) | 0x0010000000000000ul) >> 45); + } + + uint resultExp = (380u - exp) >> 1; + + uint estimate = (uint)SoftFloat.RecipSqrtEstimateTable[scaled - 128u] + 256u; + + result = BitConverter.Int32BitsToSingle((int)((resultExp & 0xFFu) << 23 | (estimate & 0xFFu) << 15)); + } + + return result; + } + + public static float FPHalvedSub(float value1, float value2, ExecutionContext context, FPCR fpcr) + { + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if (inf1 && inf2 && sign1 == sign2) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if ((inf1 && !sign1) || (inf2 && sign2)) + { + result = FPInfinity(false); + } + else if ((inf1 && sign1) || (inf2 && !sign2)) + { + result = FPInfinity(true); + } + else if (zero1 && zero2 && sign1 == !sign2) + { + result = FPZero(sign1); + } + else + { + result = (value1 - value2) / 2.0f; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static float FPRSqrtStep(float value1, float value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.StandardFpcrValue; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + float product; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + product = FPZero(false); + } + else + { + product = FPMulFpscr(value1, value2, true); + } + + result = FPHalvedSub(FPThree(false), product, context, fpcr); + } + + return result; + } + + public static float FPRSqrtStepFused(float value1, float value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = 
context.Fpcr; + + value1 = value1.FPNeg(); + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + result = FPOnePointFive(false); + } + else if (inf1 || inf2) + { + result = FPInfinity(sign1 ^ sign2); + } + else + { + result = MathF.FusedMultiplyAdd(value1, value2, 3f) / 2f; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static float FPSqrt(float value) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value = value.FPUnpack(out FPType type, out bool sign, out uint op, context, fpcr); + + float result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + result = FPProcessNaN(type, op, context, fpcr); + } + else if (type == FPType.Zero) + { + result = FPZero(sign); + } + else if (type == FPType.Infinity && !sign) + { + result = FPInfinity(sign); + } + else if (sign) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else + { + result = MathF.Sqrt(value); + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + + return result; + } + + public static float FPSub(float value1, float value2) + { + return FPSubFpscr(value1, value2, false); + } + + public static float FPSubFpscr(float value1, float value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out uint op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out uint op2, context, fpcr); + + float result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if (inf1 && inf2 && sign1 == sign2) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if ((inf1 && !sign1) || (inf2 && sign2)) + { + result = FPInfinity(false); + } + else if ((inf1 && sign1) || (inf2 && !sign2)) + { + result = FPInfinity(true); + } + else if (zero1 && zero2 && sign1 == !sign2) + { + result = FPZero(sign1); + } + else + { + result = value1 - value2; + + if ((fpcr & FPCR.Fz) != 0 && float.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0f); + } + } + } + + return result; + } + + public static float FPDefaultNaN() + { + return BitConverter.Int32BitsToSingle(0x7fc00000); + } + + public static float FPInfinity(bool sign) + { + return sign ? float.NegativeInfinity : float.PositiveInfinity; + } + + public static float FPZero(bool sign) + { + return sign ? -0f : +0f; + } + + public static float FPMaxNormal(bool sign) + { + return sign ? float.MinValue : float.MaxValue; + } + + private static float FPTwo(bool sign) + { + return sign ? 
-2f : +2f; + } + + private static float FPThree(bool sign) + { + return sign ? -3f : +3f; + } + + private static float FPOnePointFive(bool sign) + { + return sign ? -1.5f : +1.5f; + } + + private static float FPNeg(this float value) + { + return -value; + } + + private static float ZerosOrOnes(bool ones) + { + return BitConverter.Int32BitsToSingle(ones ? -1 : 0); + } + + private static float FPUnpack( + this float value, + out FPType type, + out bool sign, + out uint valueBits, + ExecutionContext context, + FPCR fpcr) + { + valueBits = (uint)BitConverter.SingleToInt32Bits(value); + + sign = (~valueBits & 0x80000000u) == 0u; + + if ((valueBits & 0x7F800000u) == 0u) + { + if ((valueBits & 0x007FFFFFu) == 0u || (fpcr & FPCR.Fz) != 0) + { + type = FPType.Zero; + value = FPZero(sign); + + if ((valueBits & 0x007FFFFFu) != 0u) + { + SoftFloat.FPProcessException(FPException.InputDenorm, context, fpcr); + } + } + else + { + type = FPType.Nonzero; + } + } + else if ((~valueBits & 0x7F800000u) == 0u) + { + if ((valueBits & 0x007FFFFFu) == 0u) + { + type = FPType.Infinity; + } + else + { + type = (~valueBits & 0x00400000u) == 0u ? FPType.QNaN : FPType.SNaN; + value = FPZero(sign); + } + } + else + { + type = FPType.Nonzero; + } + + return value; + } + + private static float FPProcessNaNs( + FPType type1, + FPType type2, + uint op1, + uint op2, + out bool done, + ExecutionContext context, + FPCR fpcr) + { + done = true; + + if (type1 == FPType.SNaN) + { + return FPProcessNaN(type1, op1, context, fpcr); + } + else if (type2 == FPType.SNaN) + { + return FPProcessNaN(type2, op2, context, fpcr); + } + else if (type1 == FPType.QNaN) + { + return FPProcessNaN(type1, op1, context, fpcr); + } + else if (type2 == FPType.QNaN) + { + return FPProcessNaN(type2, op2, context, fpcr); + } + + done = false; + + return FPZero(false); + } + + private static float FPProcessNaNs3( + FPType type1, + FPType type2, + FPType type3, + uint op1, + uint op2, + uint op3, + out bool done, + ExecutionContext context, + FPCR fpcr) + { + done = true; + + if (type1 == FPType.SNaN) + { + return FPProcessNaN(type1, op1, context, fpcr); + } + else if (type2 == FPType.SNaN) + { + return FPProcessNaN(type2, op2, context, fpcr); + } + else if (type3 == FPType.SNaN) + { + return FPProcessNaN(type3, op3, context, fpcr); + } + else if (type1 == FPType.QNaN) + { + return FPProcessNaN(type1, op1, context, fpcr); + } + else if (type2 == FPType.QNaN) + { + return FPProcessNaN(type2, op2, context, fpcr); + } + else if (type3 == FPType.QNaN) + { + return FPProcessNaN(type3, op3, context, fpcr); + } + + done = false; + + return FPZero(false); + } + + private static float FPProcessNaN(FPType type, uint op, ExecutionContext context, FPCR fpcr) + { + if (type == FPType.SNaN) + { + op |= 1u << 22; + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + + if ((fpcr & FPCR.Dn) != 0) + { + return FPDefaultNaN(); + } + + return BitConverter.Int32BitsToSingle((int)op); + } + } + + static class SoftFloat64_16 + { + public static ushort FPConvert(double value) + { + ExecutionContext context = NativeInterface.GetContext(); + + double real = value.FPUnpackCv(out FPType type, out bool sign, out ulong valueBits, context); + + bool altHp = (context.Fpcr & FPCR.Ahp) != 0; + + ushort resultBits; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + if (altHp) + { + resultBits = SoftFloat16.FPZero(sign); + } + else if ((context.Fpcr & FPCR.Dn) != 0) + { + resultBits = SoftFloat16.FPDefaultNaN(); + } + else + { + resultBits = 
FPConvertNaN(valueBits); + } + + if (type == FPType.SNaN || altHp) + { + SoftFloat.FPProcessException(FPException.InvalidOp, context); + } + } + else if (type == FPType.Infinity) + { + if (altHp) + { + resultBits = (ushort)((sign ? 1u : 0u) << 15 | 0x7FFFu); + + SoftFloat.FPProcessException(FPException.InvalidOp, context); + } + else + { + resultBits = SoftFloat16.FPInfinity(sign); + } + } + else if (type == FPType.Zero) + { + resultBits = SoftFloat16.FPZero(sign); + } + else + { + resultBits = SoftFloat16.FPRoundCv(real, context); + } + + return resultBits; + } + + private static double FPUnpackCv( + this double value, + out FPType type, + out bool sign, + out ulong valueBits, + ExecutionContext context) + { + valueBits = (ulong)BitConverter.DoubleToInt64Bits(value); + + sign = (~valueBits & 0x8000000000000000ul) == 0u; + + ulong exp64 = (valueBits & 0x7FF0000000000000ul) >> 52; + ulong frac64 = valueBits & 0x000FFFFFFFFFFFFFul; + + double real; + + if (exp64 == 0u) + { + if (frac64 == 0u || (context.Fpcr & FPCR.Fz) != 0) + { + type = FPType.Zero; + real = 0d; + + if (frac64 != 0u) + { + SoftFloat.FPProcessException(FPException.InputDenorm, context); + } + } + else + { + type = FPType.Nonzero; // Subnormal. + real = Math.Pow(2d, -1022) * ((double)frac64 * Math.Pow(2d, -52)); + } + } + else if (exp64 == 0x7FFul) + { + if (frac64 == 0u) + { + type = FPType.Infinity; + real = Math.Pow(2d, 1000000); + } + else + { + type = (~frac64 & 0x0008000000000000ul) == 0u ? FPType.QNaN : FPType.SNaN; + real = 0d; + } + } + else + { + type = FPType.Nonzero; // Normal. + real = Math.Pow(2d, (int)exp64 - 1023) * (1d + (double)frac64 * Math.Pow(2d, -52)); + } + + return sign ? -real : real; + } + + private static ushort FPConvertNaN(ulong valueBits) + { + return (ushort)((valueBits & 0x8000000000000000ul) >> 48 | 0x7E00u | (valueBits & 0x0007FC0000000000ul) >> 42); + } + } + + static class SoftFloat64 + { + public static double FPAdd(double value1, double value2) + { + return FPAddFpscr(value1, value2, false); + } + + public static double FPAddFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if (inf1 && inf2 && sign1 == !sign2) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if ((inf1 && !sign1) || (inf2 && !sign2)) + { + result = FPInfinity(false); + } + else if ((inf1 && sign1) || (inf2 && sign2)) + { + result = FPInfinity(true); + } + else if (zero1 && zero2 && sign1 == sign2) + { + result = FPZero(sign1); + } + else + { + result = value1 + value2; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static int FPCompare(double value1, double value2, bool signalNaNs) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out _, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out _, context, fpcr); + + int result; + + if (type1 == FPType.SNaN || type1 == FPType.QNaN || type2 == FPType.SNaN || type2 == FPType.QNaN) + { + result = 0b0011; + + if (type1 == FPType.SNaN || type2 == FPType.SNaN || signalNaNs) + { + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + } + else + { + if (value1 == value2) + { + result = 0b0110; + } + else if (value1 < value2) + { + result = 0b1000; + } + else + { + result = 0b0010; + } + } + + return result; + } + + public static double FPCompareEQ(double value1, double value2) + { + return FPCompareEQFpscr(value1, value2, false); + } + + public static double FPCompareEQFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + double result; + + if (type1 == FPType.SNaN || type1 == FPType.QNaN || type2 == FPType.SNaN || type2 == FPType.QNaN) + { + result = ZerosOrOnes(false); + + if (type1 == FPType.SNaN || type2 == FPType.SNaN) + { + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + } + else + { + result = ZerosOrOnes(value1 == value2); + } + + return result; + } + + public static double FPCompareGE(double value1, double value2) + { + return FPCompareGEFpscr(value1, value2, false); + } + + public static double FPCompareGEFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + double result; + + if (type1 == FPType.SNaN || type1 == FPType.QNaN || type2 == FPType.SNaN || type2 == FPType.QNaN) + { + result = ZerosOrOnes(false); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else + { + result = ZerosOrOnes(value1 >= value2); + } + + return result; + } + + public static double FPCompareGT(double value1, double value2) + { + return FPCompareGTFpscr(value1, value2, false); + } + + public static double FPCompareGTFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + double result; + + if (type1 == FPType.SNaN || type1 == FPType.QNaN || type2 == FPType.SNaN || type2 == FPType.QNaN) + { + result = ZerosOrOnes(false); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else + { + result = ZerosOrOnes(value1 > value2); + } + + return result; + } + + public static double FPCompareLE(double value1, double value2) + { + return FPCompareGE(value2, value1); + } + + public static double FPCompareLT(double value1, double value2) + { + return FPCompareGT(value2, value1); + } + + public static double FPCompareLEFpscr(double value1, double value2, bool standardFpscr) + { + return FPCompareGEFpscr(value2, value1, standardFpscr); + } + + public static double FPCompareLTFpscr(double value1, double value2, bool standardFpscr) + { + return FPCompareGTFpscr(value2, value1, standardFpscr); + } + + public static double FPDiv(double value1, double value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && inf2) || (zero1 && zero2)) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if (inf1 || zero2) + { + result = FPInfinity(sign1 ^ sign2); + + if (!inf1) + { + SoftFloat.FPProcessException(FPException.DivideByZero, context, fpcr); + } + } + else if (zero1 || inf2) + { + result = FPZero(sign1 ^ sign2); + } + else + { + result = value1 / value2; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static double FPMax(double value1, double value2) + { + return FPMaxFpscr(value1, value2, false); + } + + public static double FPMaxFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + if (value1 > value2) + { + if (type1 == FPType.Infinity) + { + result = FPInfinity(sign1); + } + else if (type1 == FPType.Zero) + { + result = FPZero(sign1 && sign2); + } + else + { + result = value1; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + else + { + if (type2 == FPType.Infinity) + { + result = FPInfinity(sign2); + } + else if (type2 == FPType.Zero) + { + result = FPZero(sign1 && sign2); + } + else + { + result = value2; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + } + + return result; + } + + public static double FPMaxNum(double value1, double value2) + { + return FPMaxNumFpscr(value1, value2, false); + } + + public static double FPMaxNumFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + if (type1 == FPType.QNaN && type2 != FPType.QNaN) + { + value1 = FPInfinity(true); + } + else if (type1 != FPType.QNaN && type2 == FPType.QNaN) + { + value2 = FPInfinity(true); + } + + return FPMaxFpscr(value1, value2, standardFpscr); + } + + public static double FPMin(double value1, double value2) + { + return FPMinFpscr(value1, value2, false); + } + + public static double FPMinFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + if (value1 < value2) + { + if (type1 == FPType.Infinity) + { + result = FPInfinity(sign1); + } + else if (type1 == FPType.Zero) + { + result = FPZero(sign1 || sign2); + } + else + { + result = value1; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + else + { + if (type2 == FPType.Infinity) + { + result = FPInfinity(sign2); + } + else if (type2 == FPType.Zero) + { + result = FPZero(sign1 || sign2); + } + else + { + result = value2; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + } + + return result; + } + + public static double FPMinNum(double value1, double value2) + { + return FPMinNumFpscr(value1, value2, false); + } + + public static double FPMinNumFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value1.FPUnpack(out FPType type1, out _, out _, context, fpcr); + value2.FPUnpack(out FPType type2, out _, out _, context, fpcr); + + if (type1 == FPType.QNaN && type2 != FPType.QNaN) + { + value1 = FPInfinity(false); + } + else if (type1 != FPType.QNaN && type2 == FPType.QNaN) + { + value2 = FPInfinity(false); + } + + return FPMinFpscr(value1, value2, standardFpscr); + } + + public static double FPMul(double value1, double value2) + { + return FPMulFpscr(value1, value2, false); + } + + public static double FPMulFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if (inf1 || inf2) + { + result = FPInfinity(sign1 ^ sign2); + } + else if (zero1 || zero2) + { + result = FPZero(sign1 ^ sign2); + } + else + { + result = value1 * value2; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static double FPMulAdd(double valueA, double value1, double value2) + { + return FPMulAddFpscr(valueA, value1, value2, false); + } + + public static double FPMulAddFpscr(double valueA, double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + valueA = valueA.FPUnpack(out FPType typeA, out bool signA, out ulong addend, context, fpcr); + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + double result = FPProcessNaNs3(typeA, type1, type2, addend, op1, op2, out bool done, context, fpcr); + + if (typeA == FPType.QNaN && ((inf1 && zero2) || (zero1 && inf2))) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + + if (!done) + { + bool infA = typeA == FPType.Infinity; bool zeroA = typeA == FPType.Zero; + + bool signP = sign1 ^ sign2; + bool infP = inf1 || inf2; + bool zeroP = zero1 || zero2; + + if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP)) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if ((infA && !signA) || (infP && !signP)) + { + result = FPInfinity(false); + } + else if ((infA && signA) || (infP && signP)) + { + result = FPInfinity(true); + } + else if (zeroA && zeroP && signA == signP) + { + result = FPZero(signA); + } + else + { + result = Math.FusedMultiplyAdd(value1, value2, valueA); + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static double FPMulSub(double valueA, double value1, double value2) + { + value1 = value1.FPNeg(); + + return FPMulAdd(valueA, value1, value2); + } + + public static double FPMulSubFpscr(double valueA, double value1, double value2, bool standardFpscr) + { + value1 = value1.FPNeg(); + + return FPMulAddFpscr(valueA, value1, value2, standardFpscr); + } + + public static double FPMulX(double value1, double value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + result = FPTwo(sign1 ^ sign2); + } + else if (inf1 || inf2) + { + result = FPInfinity(sign1 ^ sign2); + } + else if (zero1 || zero2) + { + result = FPZero(sign1 ^ sign2); + } + else + { + result = value1 * value2; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static double FPNegMulAdd(double valueA, double value1, double value2) + { + valueA = valueA.FPNeg(); + value1 = value1.FPNeg(); + + return FPMulAdd(valueA, value1, value2); + } + + public static double FPNegMulSub(double valueA, double value1, double value2) + { + valueA = valueA.FPNeg(); + + return FPMulAdd(valueA, value1, value2); + } + + public static double FPRecipEstimate(double value) + { + return FPRecipEstimateFpscr(value, false); + } + + public static double FPRecipEstimateFpscr(double value, bool standardFpscr) + { + 
ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value.FPUnpack(out FPType type, out bool sign, out ulong op, context, fpcr); + + double result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + result = FPProcessNaN(type, op, context, fpcr); + } + else if (type == FPType.Infinity) + { + result = FPZero(sign); + } + else if (type == FPType.Zero) + { + result = FPInfinity(sign); + + SoftFloat.FPProcessException(FPException.DivideByZero, context, fpcr); + } + else if (Math.Abs(value) < Math.Pow(2d, -1024)) + { + bool overflowToInf; + + switch (fpcr.GetRoundingMode()) + { + default: + case FPRoundingMode.ToNearest: overflowToInf = true; break; + case FPRoundingMode.TowardsPlusInfinity: overflowToInf = !sign; break; + case FPRoundingMode.TowardsMinusInfinity: overflowToInf = sign; break; + case FPRoundingMode.TowardsZero: overflowToInf = false; break; + } + + result = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign); + + SoftFloat.FPProcessException(FPException.Overflow, context, fpcr); + SoftFloat.FPProcessException(FPException.Inexact, context, fpcr); + } + else if ((fpcr & FPCR.Fz) != 0 && (Math.Abs(value) >= Math.Pow(2d, 1022))) + { + result = FPZero(sign); + + context.Fpsr |= FPSR.Ufc; + } + else + { + ulong fraction = op & 0x000FFFFFFFFFFFFFul; + uint exp = (uint)((op & 0x7FF0000000000000ul) >> 52); + + if (exp == 0u) + { + if ((fraction & 0x0008000000000000ul) == 0ul) + { + fraction = (fraction & 0x0003FFFFFFFFFFFFul) << 2; + exp -= 1u; + } + else + { + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + } + } + + uint scaled = (uint)(((fraction & 0x000FF00000000000ul) | 0x0010000000000000ul) >> 44); + + uint resultExp = 2045u - exp; + + uint estimate = (uint)SoftFloat.RecipEstimateTable[scaled - 256u] + 256u; + + fraction = (ulong)(estimate & 0xFFu) << 44; + + if (resultExp == 0u) + { + fraction = ((fraction & 0x000FFFFFFFFFFFFEul) | 0x0010000000000000ul) >> 1; + } + else if (resultExp + 1u == 0u) + { + fraction = ((fraction & 0x000FFFFFFFFFFFFCul) | 0x0010000000000000ul) >> 2; + resultExp = 0u; + } + + result = BitConverter.Int64BitsToDouble( + (long)((sign ? 
1ul : 0ul) << 63 | (resultExp & 0x7FFul) << 52 | (fraction & 0x000FFFFFFFFFFFFFul))); + } + + return result; + } + + public static double FPRecipStep(double value1, double value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.StandardFpcrValue; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + double product; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + product = FPZero(false); + } + else + { + product = FPMulFpscr(value1, value2, true); + } + + result = FPSubFpscr(FPTwo(false), product, true); + } + + return result; + } + + public static double FPRecipStepFused(double value1, double value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPNeg(); + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + result = FPTwo(false); + } + else if (inf1 || inf2) + { + result = FPInfinity(sign1 ^ sign2); + } + else + { + result = Math.FusedMultiplyAdd(value1, value2, 2d); + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static double FPRecpX(double value) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value.FPUnpack(out FPType type, out bool sign, out ulong op, context, fpcr); + + double result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + result = FPProcessNaN(type, op, context, fpcr); + } + else + { + ulong notExp = (~op >> 52) & 0x7FFul; + ulong maxExp = 0x7FEul; + + result = BitConverter.Int64BitsToDouble( + (long)((sign ? 1ul : 0ul) << 63 | (notExp == 0x7FFul ? maxExp : notExp) << 52)); + } + + return result; + } + + public static double FPRSqrtEstimate(double value) + { + return FPRSqrtEstimateFpscr(value, false); + } + + public static double FPRSqrtEstimateFpscr(double value, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? 
context.StandardFpcrValue : context.Fpcr; + + value.FPUnpack(out FPType type, out bool sign, out ulong op, context, fpcr); + + double result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + result = FPProcessNaN(type, op, context, fpcr); + } + else if (type == FPType.Zero) + { + result = FPInfinity(sign); + + SoftFloat.FPProcessException(FPException.DivideByZero, context, fpcr); + } + else if (sign) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if (type == FPType.Infinity) + { + result = FPZero(false); + } + else + { + ulong fraction = op & 0x000FFFFFFFFFFFFFul; + uint exp = (uint)((op & 0x7FF0000000000000ul) >> 52); + + if (exp == 0u) + { + while ((fraction & 0x0008000000000000ul) == 0ul) + { + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + exp -= 1u; + } + + fraction = (fraction & 0x0007FFFFFFFFFFFFul) << 1; + } + + uint scaled; + + if ((exp & 1u) == 0u) + { + scaled = (uint)(((fraction & 0x000FF00000000000ul) | 0x0010000000000000ul) >> 44); + } + else + { + scaled = (uint)(((fraction & 0x000FE00000000000ul) | 0x0010000000000000ul) >> 45); + } + + uint resultExp = (3068u - exp) >> 1; + + uint estimate = (uint)SoftFloat.RecipSqrtEstimateTable[scaled - 128u] + 256u; + + result = BitConverter.Int64BitsToDouble((long)((resultExp & 0x7FFul) << 52 | (estimate & 0xFFul) << 44)); + } + + return result; + } + + public static double FPHalvedSub(double value1, double value2, ExecutionContext context, FPCR fpcr) + { + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if (inf1 && inf2 && sign1 == sign2) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if ((inf1 && !sign1) || (inf2 && sign2)) + { + result = FPInfinity(false); + } + else if ((inf1 && sign1) || (inf2 && !sign2)) + { + result = FPInfinity(true); + } + else if (zero1 && zero2 && sign1 == !sign2) + { + result = FPZero(sign1); + } + else + { + result = (value1 - value2) / 2.0; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static double FPRSqrtStep(double value1, double value2) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.StandardFpcrValue; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + double product; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + product = FPZero(false); + } + else + { + product = FPMulFpscr(value1, value2, true); + } + + result = FPHalvedSub(FPThree(false), product, context, fpcr); + } + + return result; + } + + public static double FPRSqrtStepFused(double value1, double value2) + { + ExecutionContext context = 
NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value1 = value1.FPNeg(); + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if ((inf1 && zero2) || (zero1 && inf2)) + { + result = FPOnePointFive(false); + } + else if (inf1 || inf2) + { + result = FPInfinity(sign1 ^ sign2); + } + else + { + result = Math.FusedMultiplyAdd(value1, value2, 3d) / 2d; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static double FPSqrt(double value) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = context.Fpcr; + + value = value.FPUnpack(out FPType type, out bool sign, out ulong op, context, fpcr); + + double result; + + if (type == FPType.SNaN || type == FPType.QNaN) + { + result = FPProcessNaN(type, op, context, fpcr); + } + else if (type == FPType.Zero) + { + result = FPZero(sign); + } + else if (type == FPType.Infinity && !sign) + { + result = FPInfinity(sign); + } + else if (sign) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else + { + result = Math.Sqrt(value); + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + + return result; + } + + public static double FPSub(double value1, double value2) + { + return FPSubFpscr(value1, value2, false); + } + + public static double FPSubFpscr(double value1, double value2, bool standardFpscr) + { + ExecutionContext context = NativeInterface.GetContext(); + FPCR fpcr = standardFpscr ? context.StandardFpcrValue : context.Fpcr; + + value1 = value1.FPUnpack(out FPType type1, out bool sign1, out ulong op1, context, fpcr); + value2 = value2.FPUnpack(out FPType type2, out bool sign2, out ulong op2, context, fpcr); + + double result = FPProcessNaNs(type1, type2, op1, op2, out bool done, context, fpcr); + + if (!done) + { + bool inf1 = type1 == FPType.Infinity; bool zero1 = type1 == FPType.Zero; + bool inf2 = type2 == FPType.Infinity; bool zero2 = type2 == FPType.Zero; + + if (inf1 && inf2 && sign1 == sign2) + { + result = FPDefaultNaN(); + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + else if ((inf1 && !sign1) || (inf2 && sign2)) + { + result = FPInfinity(false); + } + else if ((inf1 && sign1) || (inf2 && !sign2)) + { + result = FPInfinity(true); + } + else if (zero1 && zero2 && sign1 == !sign2) + { + result = FPZero(sign1); + } + else + { + result = value1 - value2; + + if ((fpcr & FPCR.Fz) != 0 && double.IsSubnormal(result)) + { + context.Fpsr |= FPSR.Ufc; + + result = FPZero(result < 0d); + } + } + } + + return result; + } + + public static double FPDefaultNaN() + { + return BitConverter.Int64BitsToDouble(0x7ff8000000000000); + } + + public static double FPInfinity(bool sign) + { + return sign ? double.NegativeInfinity : double.PositiveInfinity; + } + + public static double FPZero(bool sign) + { + return sign ? -0d : +0d; + } + + public static double FPMaxNormal(bool sign) + { + return sign ? 
double.MinValue : double.MaxValue; + } + + private static double FPTwo(bool sign) + { + return sign ? -2d : +2d; + } + + private static double FPThree(bool sign) + { + return sign ? -3d : +3d; + } + + private static double FPOnePointFive(bool sign) + { + return sign ? -1.5d : +1.5d; + } + + private static double FPNeg(this double value) + { + return -value; + } + + private static double ZerosOrOnes(bool ones) + { + return BitConverter.Int64BitsToDouble(ones ? -1L : 0L); + } + + private static double FPUnpack( + this double value, + out FPType type, + out bool sign, + out ulong valueBits, + ExecutionContext context, + FPCR fpcr) + { + valueBits = (ulong)BitConverter.DoubleToInt64Bits(value); + + sign = (~valueBits & 0x8000000000000000ul) == 0ul; + + if ((valueBits & 0x7FF0000000000000ul) == 0ul) + { + if ((valueBits & 0x000FFFFFFFFFFFFFul) == 0ul || (fpcr & FPCR.Fz) != 0) + { + type = FPType.Zero; + value = FPZero(sign); + + if ((valueBits & 0x000FFFFFFFFFFFFFul) != 0ul) + { + SoftFloat.FPProcessException(FPException.InputDenorm, context, fpcr); + } + } + else + { + type = FPType.Nonzero; + } + } + else if ((~valueBits & 0x7FF0000000000000ul) == 0ul) + { + if ((valueBits & 0x000FFFFFFFFFFFFFul) == 0ul) + { + type = FPType.Infinity; + } + else + { + type = (~valueBits & 0x0008000000000000ul) == 0ul ? FPType.QNaN : FPType.SNaN; + value = FPZero(sign); + } + } + else + { + type = FPType.Nonzero; + } + + return value; + } + + private static double FPProcessNaNs( + FPType type1, + FPType type2, + ulong op1, + ulong op2, + out bool done, + ExecutionContext context, + FPCR fpcr) + { + done = true; + + if (type1 == FPType.SNaN) + { + return FPProcessNaN(type1, op1, context, fpcr); + } + else if (type2 == FPType.SNaN) + { + return FPProcessNaN(type2, op2, context, fpcr); + } + else if (type1 == FPType.QNaN) + { + return FPProcessNaN(type1, op1, context, fpcr); + } + else if (type2 == FPType.QNaN) + { + return FPProcessNaN(type2, op2, context, fpcr); + } + + done = false; + + return FPZero(false); + } + + private static double FPProcessNaNs3( + FPType type1, + FPType type2, + FPType type3, + ulong op1, + ulong op2, + ulong op3, + out bool done, + ExecutionContext context, + FPCR fpcr) + { + done = true; + + if (type1 == FPType.SNaN) + { + return FPProcessNaN(type1, op1, context, fpcr); + } + else if (type2 == FPType.SNaN) + { + return FPProcessNaN(type2, op2, context, fpcr); + } + else if (type3 == FPType.SNaN) + { + return FPProcessNaN(type3, op3, context, fpcr); + } + else if (type1 == FPType.QNaN) + { + return FPProcessNaN(type1, op1, context, fpcr); + } + else if (type2 == FPType.QNaN) + { + return FPProcessNaN(type2, op2, context, fpcr); + } + else if (type3 == FPType.QNaN) + { + return FPProcessNaN(type3, op3, context, fpcr); + } + + done = false; + + return FPZero(false); + } + + private static double FPProcessNaN(FPType type, ulong op, ExecutionContext context, FPCR fpcr) + { + if (type == FPType.SNaN) + { + op |= 1ul << 51; + + SoftFloat.FPProcessException(FPException.InvalidOp, context, fpcr); + } + + if ((fpcr & FPCR.Dn) != 0) + { + return FPDefaultNaN(); + } + + return BitConverter.Int64BitsToDouble((long)op); + } + } +} |
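The SoftFloat64 helpers above all follow the same three-step shape: unpack each operand with FPUnpack, propagate NaNs with signaling NaNs taking priority over quiet NaNs (and operand 1 over operand 2), then flush a subnormal result to a signed zero when FPCR.FZ is set while raising the underflow flag. The sketch below restates that shape outside the diff; the type and method names are illustrative only and are not part of this commit or of ARMeilleure.

// Standalone sketch of the recurring operand-handling pattern; all names here
// are hypothetical and chosen for illustration, not taken from the commit.
using System;

enum FpType { Nonzero, Zero, Infinity, QNaN, SNaN }

static class SoftFloatSketch
{
    // NaN propagation order used by FPProcessNaNs above:
    // SNaN of operand 1, SNaN of operand 2, QNaN of operand 1, QNaN of operand 2.
    public static double PropagateNaNs(FpType type1, FpType type2, double op1, double op2, out bool done)
    {
        done = true;

        if (type1 == FpType.SNaN) return MakeQuiet(op1);
        if (type2 == FpType.SNaN) return MakeQuiet(op2);
        if (type1 == FpType.QNaN) return op1;
        if (type2 == FpType.QNaN) return op2;

        done = false; // Neither operand is a NaN; the caller computes the real result.

        return 0d;
    }

    // An SNaN is made quiet by setting the top fraction bit (bit 51 for doubles),
    // mirroring the "op |= 1ul << 51" step in FPProcessNaN above.
    private static double MakeQuiet(double value)
    {
        long bits = BitConverter.DoubleToInt64Bits(value) | (1L << 51);

        return BitConverter.Int64BitsToDouble(bits);
    }

    // Flush-to-zero: with FZ set, a subnormal result collapses to a signed zero.
    // (The real helpers also set FPSR.UFC; flag handling is omitted here.)
    public static double FlushToZero(double result, bool fz)
    {
        if (fz && double.IsSubnormal(result))
        {
            return result < 0d ? -0d : +0d;
        }

        return result;
    }
}

Keeping NaN selection and flush-to-zero in small shared helpers is what lets each arithmetic entry point above (FPAdd, FPSub, FPMul, FPDiv, FPMulAdd, and the step/estimate routines) remain a thin wrapper around the corresponding System.Math operation plus the special-case table for infinities and zeros.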
