path: root/ChocolArm64/Instructions
author     Alex Barney <thealexbarney@gmail.com>      2018-10-30 19:43:02 -0600
committer  gdkchan <gab.dark.100@gmail.com>           2018-10-30 22:43:02 -0300
commit     9cb57fb4bb3bbae0ae052a5af4a96a49fc5d864d (patch)
tree       0c97425aeb311c142bc92a6fcc503cb2c07d4376 /ChocolArm64/Instructions
parent     5a87e58183578f5b84ca8d01cbb76aed11820f78 (diff)
Adjust naming conventions for Ryujinx and ChocolArm64 projects (#484)
* Change naming convention for Ryujinx project
* Change naming convention for ChocolArm64 project
* Fix NaN
* Remove unneeded this. from Ryujinx project
* Adjust naming from new PRs
* Name changes based on feedback
* How did this get removed?
* Rebasing fix
* Change FP enum case
* Remove prefix from ChocolArm64 classes - Part 1
* Remove prefix from ChocolArm64 classes - Part 2
* Fix alignment from last commit's renaming
* Rename namespaces
* Rename stragglers
* Fix alignment
* Rename OpCode class
* Missed a few
* Adjust alignment
Diffstat (limited to 'ChocolArm64/Instructions')
-rw-r--r--  ChocolArm64/Instructions/CryptoHelper.cs | 328
-rw-r--r--  ChocolArm64/Instructions/Inst.cs | 20
-rw-r--r--  ChocolArm64/Instructions/InstEmitAlu.cs | 402
-rw-r--r--  ChocolArm64/Instructions/InstEmitAluHelper.cs | 212
-rw-r--r--  ChocolArm64/Instructions/InstEmitBfm.cs | 208
-rw-r--r--  ChocolArm64/Instructions/InstEmitCcmp.cs | 81
-rw-r--r--  ChocolArm64/Instructions/InstEmitCsel.cs | 58
-rw-r--r--  ChocolArm64/Instructions/InstEmitException.cs | 86
-rw-r--r--  ChocolArm64/Instructions/InstEmitFlow.cs | 189
-rw-r--r--  ChocolArm64/Instructions/InstEmitHash.cs | 115
-rw-r--r--  ChocolArm64/Instructions/InstEmitMemory.cs | 252
-rw-r--r--  ChocolArm64/Instructions/InstEmitMemoryEx.cs | 192
-rw-r--r--  ChocolArm64/Instructions/InstEmitMemoryHelper.cs | 138
-rw-r--r--  ChocolArm64/Instructions/InstEmitMove.cs | 41
-rw-r--r--  ChocolArm64/Instructions/InstEmitMul.cs | 80
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdArithmetic.cs | 2387
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdCmp.cs | 526
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdCrypto.cs | 54
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdCvt.cs | 697
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdHash.cs | 140
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdHelper.cs | 1495
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdLogical.cs | 311
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdMemory.cs | 185
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdMove.cs | 562
-rw-r--r--  ChocolArm64/Instructions/InstEmitSimdShift.cs | 865
-rw-r--r--  ChocolArm64/Instructions/InstEmitSystem.cs | 138
-rw-r--r--  ChocolArm64/Instructions/InstEmitter.cs | 6
-rw-r--r--  ChocolArm64/Instructions/InstInterpreter.cs | 8
-rw-r--r--  ChocolArm64/Instructions/SoftFallback.cs | 922
-rw-r--r--  ChocolArm64/Instructions/SoftFloat.cs | 2127
-rw-r--r--  ChocolArm64/Instructions/VectorHelper.cs | 790
31 files changed, 13615 insertions, 0 deletions
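As a quick orientation before the diff itself: the renamed files below follow the usual .NET style, with underscore-prefixed camelCase private fields (such as _sBox in CryptoHelper.cs), PascalCase public members, and camelCase locals and parameters, while the old ChocolArm64 type prefix is dropped. The sketch below only illustrates that convention; OpCodeExample64, _instanceCount and rawOpCode are invented names, not types or members from the project.

// Minimal sketch of the naming convention this commit applies (illustrative only).
namespace ChocolArm64.Example
{
    class OpCodeExample64
    {
        private static int _instanceCount; // private fields: _camelCase

        public int RawOpCode { get; private set; } // public members: PascalCase

        public OpCodeExample64(int rawOpCode) // parameters and locals: camelCase
        {
            int shiftedValue = rawOpCode << 2;

            RawOpCode = shiftedValue;

            _instanceCount++;
        }
    }
}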
diff --git a/ChocolArm64/Instructions/CryptoHelper.cs b/ChocolArm64/Instructions/CryptoHelper.cs
new file mode 100644
index 00000000..bb9a22a3
--- /dev/null
+++ b/ChocolArm64/Instructions/CryptoHelper.cs
@@ -0,0 +1,328 @@
+// https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf
+
+using System;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace ChocolArm64.Instructions
+{
+ static class CryptoHelper
+ {
+#region "LookUp Tables"
+ private static byte[] _sBox =
+ {
+ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+ };
+
+ private static byte[] _invSBox =
+ {
+ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ };
+
+ private static byte[] _gfMul02 =
+ {
+ 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
+ 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+ 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e,
+ 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e,
+ 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e,
+ 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe,
+ 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde,
+ 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe,
+ 0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05,
+ 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25,
+ 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45,
+ 0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65,
+ 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85,
+ 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5,
+ 0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5,
+ 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5
+ };
+
+ private static byte[] _gfMul03 =
+ {
+ 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11,
+ 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21,
+ 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71,
+ 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41,
+ 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1,
+ 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1,
+ 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1,
+ 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81,
+ 0x9b, 0x98, 0x9d, 0x9e, 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a,
+ 0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6, 0xbf, 0xbc, 0xb9, 0xba,
+ 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea,
+ 0xcb, 0xc8, 0xcd, 0xce, 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda,
+ 0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, 0x4f, 0x4c, 0x49, 0x4a,
+ 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a,
+ 0x3b, 0x38, 0x3d, 0x3e, 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a,
+ 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a
+ };
+
+ private static byte[] _gfMul09 =
+ {
+ 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
+ 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7,
+ 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c,
+ 0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc,
+ 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01,
+ 0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91,
+ 0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a,
+ 0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa,
+ 0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b,
+ 0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b,
+ 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0,
+ 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30,
+ 0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 0xed,
+ 0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d,
+ 0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6,
+ 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46
+ };
+
+ private static byte[] _gfMul0B =
+ {
+ 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69,
+ 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9,
+ 0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12,
+ 0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2,
+ 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f,
+ 0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f,
+ 0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4,
+ 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54,
+ 0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e,
+ 0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e,
+ 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5,
+ 0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55,
+ 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68,
+ 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8,
+ 0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13,
+ 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3
+ };
+
+ private static byte[] _gfMul0D =
+ {
+ 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b,
+ 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b,
+ 0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0,
+ 0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20,
+ 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26,
+ 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6,
+ 0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d,
+ 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d,
+ 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91,
+ 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41,
+ 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a,
+ 0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa,
+ 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc,
+ 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c,
+ 0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47,
+ 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97
+ };
+
+ private static byte[] _gfMul0E =
+ {
+ 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a,
+ 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba,
+ 0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81,
+ 0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61,
+ 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7,
+ 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17,
+ 0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c,
+ 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc,
+ 0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b,
+ 0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb,
+ 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0,
+ 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20,
+ 0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6,
+ 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56,
+ 0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d,
+ 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d
+ };
+
+ private static byte[] _srPerm = { 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 };
+
+ private static byte[] _isrPerm = { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };
+#endregion
+
+ public static Vector128<float> AesInvMixColumns(Vector128<float> op)
+ {
+ byte[] inState = new byte[16];
+ byte[] outState = new byte[16];
+
+ FromVectorToByteArray(inState, ref op);
+
+ for (int columns = 0; columns <= 3; columns++)
+ {
+ int idx = columns << 2;
+
+ byte row0 = inState[idx + 0]; // A, E, I, M: [Row0, Col0-Col3]
+ byte row1 = inState[idx + 1]; // B, F, J, N: [Row1, Col0-Col3]
+ byte row2 = inState[idx + 2]; // C, G, K, O: [Row2, Col0-Col3]
+ byte row3 = inState[idx + 3]; // D, H, L, P: [Row3, Col0-Col3]
+
+ outState[idx + 0] = (byte)((uint)_gfMul0E[row0] ^ _gfMul0B[row1] ^ _gfMul0D[row2] ^ _gfMul09[row3]);
+ outState[idx + 1] = (byte)((uint)_gfMul09[row0] ^ _gfMul0E[row1] ^ _gfMul0B[row2] ^ _gfMul0D[row3]);
+ outState[idx + 2] = (byte)((uint)_gfMul0D[row0] ^ _gfMul09[row1] ^ _gfMul0E[row2] ^ _gfMul0B[row3]);
+ outState[idx + 3] = (byte)((uint)_gfMul0B[row0] ^ _gfMul0D[row1] ^ _gfMul09[row2] ^ _gfMul0E[row3]);
+ }
+
+ FromByteArrayToVector(outState, ref op);
+
+ return op;
+ }
+
+ public static Vector128<float> AesInvShiftRows(Vector128<float> op)
+ {
+ byte[] inState = new byte[16];
+ byte[] outState = new byte[16];
+
+ FromVectorToByteArray(inState, ref op);
+
+ for (int idx = 0; idx <= 15; idx++)
+ {
+ outState[_isrPerm[idx]] = inState[idx];
+ }
+
+ FromByteArrayToVector(outState, ref op);
+
+ return op;
+ }
+
+ public static Vector128<float> AesInvSubBytes(Vector128<float> op)
+ {
+ byte[] inState = new byte[16];
+ byte[] outState = new byte[16];
+
+ FromVectorToByteArray(inState, ref op);
+
+ for (int idx = 0; idx <= 15; idx++)
+ {
+ outState[idx] = _invSBox[inState[idx]];
+ }
+
+ FromByteArrayToVector(outState, ref op);
+
+ return op;
+ }
+
+ public static Vector128<float> AesMixColumns(Vector128<float> op)
+ {
+ byte[] inState = new byte[16];
+ byte[] outState = new byte[16];
+
+ FromVectorToByteArray(inState, ref op);
+
+ for (int columns = 0; columns <= 3; columns++)
+ {
+ int idx = columns << 2;
+
+ byte row0 = inState[idx + 0]; // A, E, I, M: [Row0, Col0-Col3]
+ byte row1 = inState[idx + 1]; // B, F, J, N: [Row1, Col0-Col3]
+ byte row2 = inState[idx + 2]; // C, G, K, O: [Row2, Col0-Col3]
+ byte row3 = inState[idx + 3]; // D, H, L, P: [Row3, Col0-Col3]
+
+ outState[idx + 0] = (byte)((uint)_gfMul02[row0] ^ _gfMul03[row1] ^ row2 ^ row3);
+ outState[idx + 1] = (byte)((uint)row0 ^ _gfMul02[row1] ^ _gfMul03[row2] ^ row3);
+ outState[idx + 2] = (byte)((uint)row0 ^ row1 ^ _gfMul02[row2] ^ _gfMul03[row3]);
+ outState[idx + 3] = (byte)((uint)_gfMul03[row0] ^ row1 ^ row2 ^ _gfMul02[row3]);
+ }
+
+ FromByteArrayToVector(outState, ref op);
+
+ return op;
+ }
+
+ public static Vector128<float> AesShiftRows(Vector128<float> op)
+ {
+ byte[] inState = new byte[16];
+ byte[] outState = new byte[16];
+
+ FromVectorToByteArray(inState, ref op);
+
+ for (int idx = 0; idx <= 15; idx++)
+ {
+ outState[_srPerm[idx]] = inState[idx];
+ }
+
+ FromByteArrayToVector(outState, ref op);
+
+ return op;
+ }
+
+ public static Vector128<float> AesSubBytes(Vector128<float> op)
+ {
+ byte[] inState = new byte[16];
+ byte[] outState = new byte[16];
+
+ FromVectorToByteArray(inState, ref op);
+
+ for (int idx = 0; idx <= 15; idx++)
+ {
+ outState[idx] = _sBox[inState[idx]];
+ }
+
+ FromByteArrayToVector(outState, ref op);
+
+ return op;
+ }
+
+ private static void FromVectorToByteArray(byte[] state, ref Vector128<float> op)
+ {
+ ulong uLongLow = VectorHelper.VectorExtractIntZx((op), (byte)0, 3);
+ ulong uLongHigh = VectorHelper.VectorExtractIntZx((op), (byte)1, 3);
+
+ for (int idx = 0; idx <= 7; idx++)
+ {
+ state[idx + 0] = (byte)(uLongLow & 0xFFUL);
+ state[idx + 8] = (byte)(uLongHigh & 0xFFUL);
+
+ uLongLow >>= 8;
+ uLongHigh >>= 8;
+ }
+ }
+
+ private static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
+ {
+ if (!Sse2.IsSupported)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
+ op = Sse.StaticCast<byte, float>(Sse2.SetVector128(
+ state[15], state[14], state[13], state[12],
+ state[11], state[10], state[9], state[8],
+ state[7], state[6], state[5], state[4],
+ state[3], state[2], state[1], state[0]));
+ }
+ }
+}
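The _gfMul02, _gfMul03, _gfMul09, _gfMul0B, _gfMul0D and _gfMul0E tables above are multiplication tables in the AES field GF(2^8), reduced by the polynomial x^8 + x^4 + x^3 + x + 1 (0x11b), as covered in the Intel whitepaper linked at the top of the file. As a reference for how such a table relates to the constants (not a suggested change to the file), one way to generate it is sketched below; BuildGfMulTable is a hypothetical helper that does not exist in the project.

// Sketch: building a GF(2^8) multiplication table like _gfMul02 at runtime.
static byte[] BuildGfMulTable(byte factor)
{
    byte[] table = new byte[256];

    for (int value = 0; value < 256; value++)
    {
        int a = value;
        int b = factor;
        int product = 0;

        while (b != 0)
        {
            if ((b & 1) != 0)
            {
                product ^= a; // Addition in GF(2^8) is XOR.
            }

            a <<= 1;

            if ((a & 0x100) != 0)
            {
                a ^= 0x11b;   // Reduce modulo the AES polynomial.
            }

            b >>= 1;
        }

        table[value] = (byte)product;
    }

    return table;
}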
diff --git a/ChocolArm64/Instructions/Inst.cs b/ChocolArm64/Instructions/Inst.cs
new file mode 100644
index 00000000..5f6740ca
--- /dev/null
+++ b/ChocolArm64/Instructions/Inst.cs
@@ -0,0 +1,20 @@
+using System;
+
+namespace ChocolArm64.Instructions
+{
+ struct Inst
+ {
+ public InstInterpreter Interpreter { get; private set; }
+ public InstEmitter Emitter { get; private set; }
+ public Type Type { get; private set; }
+
+ public static Inst Undefined => new Inst(null, InstEmit.Und, null);
+
+ public Inst(InstInterpreter interpreter, InstEmitter emitter, Type type)
+ {
+ Interpreter = interpreter;
+ Emitter = emitter;
+ Type = type;
+ }
+ }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Instructions/InstEmitAlu.cs b/ChocolArm64/Instructions/InstEmitAlu.cs
new file mode 100644
index 00000000..c0258ed2
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitAlu.cs
@@ -0,0 +1,402 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
+
+using static ChocolArm64.Instructions.InstEmitAluHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Adc(ILEmitterCtx context) => EmitAdc(context, false);
+ public static void Adcs(ILEmitterCtx context) => EmitAdc(context, true);
+
+ private static void EmitAdc(ILEmitterCtx context, bool setFlags)
+ {
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Add);
+
+ context.EmitLdflg((int)PState.CBit);
+
+ Type[] mthdTypes = new Type[] { typeof(bool) };
+
+ MethodInfo mthdInfo = typeof(Convert).GetMethod(nameof(Convert.ToInt32), mthdTypes);
+
+ context.EmitCall(mthdInfo);
+
+ if (context.CurrOp.RegisterSize != RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_U8);
+ }
+
+ context.Emit(OpCodes.Add);
+
+ if (setFlags)
+ {
+ context.EmitZnFlagCheck();
+
+ EmitAdcsCCheck(context);
+ EmitAddsVCheck(context);
+ }
+
+ EmitDataStore(context);
+ }
+
+ public static void Add(ILEmitterCtx context) => EmitDataOp(context, OpCodes.Add);
+
+ public static void Adds(ILEmitterCtx context)
+ {
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Add);
+
+ context.EmitZnFlagCheck();
+
+ EmitAddsCCheck(context);
+ EmitAddsVCheck(context);
+ EmitDataStoreS(context);
+ }
+
+ public static void And(ILEmitterCtx context) => EmitDataOp(context, OpCodes.And);
+
+ public static void Ands(ILEmitterCtx context)
+ {
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.And);
+
+ EmitZeroCvFlags(context);
+
+ context.EmitZnFlagCheck();
+
+ EmitDataStoreS(context);
+ }
+
+ public static void Asrv(ILEmitterCtx context) => EmitDataOpShift(context, OpCodes.Shr);
+
+ public static void Bic(ILEmitterCtx context) => EmitBic(context, false);
+ public static void Bics(ILEmitterCtx context) => EmitBic(context, true);
+
+ private static void EmitBic(ILEmitterCtx context, bool setFlags)
+ {
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Not);
+ context.Emit(OpCodes.And);
+
+ if (setFlags)
+ {
+ EmitZeroCvFlags(context);
+
+ context.EmitZnFlagCheck();
+ }
+
+ EmitDataStore(context, setFlags);
+ }
+
+ public static void Cls(ILEmitterCtx context)
+ {
+ OpCodeAlu64 op = (OpCodeAlu64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ context.EmitLdc_I4(op.RegisterSize == RegisterSize.Int32 ? 32 : 64);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.CountLeadingSigns));
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Clz(ILEmitterCtx context)
+ {
+ OpCodeAlu64 op = (OpCodeAlu64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ if (Lzcnt.IsSupported)
+ {
+ Type tValue = op.RegisterSize == RegisterSize.Int32 ? typeof(uint) : typeof(ulong);
+
+ context.EmitCall(typeof(Lzcnt).GetMethod(nameof(Lzcnt.LeadingZeroCount), new Type[] { tValue }));
+ }
+ else
+ {
+ context.EmitLdc_I4(op.RegisterSize == RegisterSize.Int32 ? 32 : 64);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.CountLeadingZeros));
+ }
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Eon(ILEmitterCtx context)
+ {
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Not);
+ context.Emit(OpCodes.Xor);
+
+ EmitDataStore(context);
+ }
+
+ public static void Eor(ILEmitterCtx context) => EmitDataOp(context, OpCodes.Xor);
+
+ public static void Extr(ILEmitterCtx context)
+ {
+ //TODO: Ensure that the Shift is valid for the Is64Bits.
+ OpCodeAluRs64 op = (OpCodeAluRs64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rm);
+
+ if (op.Shift > 0)
+ {
+ context.EmitLdc_I4(op.Shift);
+
+ context.Emit(OpCodes.Shr_Un);
+
+ context.EmitLdintzr(op.Rn);
+ context.EmitLdc_I4(op.GetBitsCount() - op.Shift);
+
+ context.Emit(OpCodes.Shl);
+ context.Emit(OpCodes.Or);
+ }
+
+ EmitDataStore(context);
+ }
+
+ public static void Lslv(ILEmitterCtx context) => EmitDataOpShift(context, OpCodes.Shl);
+ public static void Lsrv(ILEmitterCtx context) => EmitDataOpShift(context, OpCodes.Shr_Un);
+
+ public static void Sbc(ILEmitterCtx context) => EmitSbc(context, false);
+ public static void Sbcs(ILEmitterCtx context) => EmitSbc(context, true);
+
+ private static void EmitSbc(ILEmitterCtx context, bool setFlags)
+ {
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Sub);
+
+ context.EmitLdflg((int)PState.CBit);
+
+ Type[] mthdTypes = new Type[] { typeof(bool) };
+
+ MethodInfo mthdInfo = typeof(Convert).GetMethod(nameof(Convert.ToInt32), mthdTypes);
+
+ context.EmitCall(mthdInfo);
+
+ context.EmitLdc_I4(1);
+
+ context.Emit(OpCodes.Xor);
+
+ if (context.CurrOp.RegisterSize != RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_U8);
+ }
+
+ context.Emit(OpCodes.Sub);
+
+ if (setFlags)
+ {
+ context.EmitZnFlagCheck();
+
+ EmitSbcsCCheck(context);
+ EmitSubsVCheck(context);
+ }
+
+ EmitDataStore(context);
+ }
+
+ public static void Sub(ILEmitterCtx context) => EmitDataOp(context, OpCodes.Sub);
+
+ public static void Subs(ILEmitterCtx context)
+ {
+ context.TryOptMarkCondWithoutCmp();
+
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Sub);
+
+ context.EmitZnFlagCheck();
+
+ EmitSubsCCheck(context);
+ EmitSubsVCheck(context);
+ EmitDataStoreS(context);
+ }
+
+ public static void Orn(ILEmitterCtx context)
+ {
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Not);
+ context.Emit(OpCodes.Or);
+
+ EmitDataStore(context);
+ }
+
+ public static void Orr(ILEmitterCtx context) => EmitDataOp(context, OpCodes.Or);
+
+ public static void Rbit(ILEmitterCtx context) => EmitFallback32_64(context,
+ nameof(SoftFallback.ReverseBits32),
+ nameof(SoftFallback.ReverseBits64));
+
+ public static void Rev16(ILEmitterCtx context) => EmitFallback32_64(context,
+ nameof(SoftFallback.ReverseBytes16_32),
+ nameof(SoftFallback.ReverseBytes16_64));
+
+ public static void Rev32(ILEmitterCtx context) => EmitFallback32_64(context,
+ nameof(SoftFallback.ReverseBytes32_32),
+ nameof(SoftFallback.ReverseBytes32_64));
+
+ private static void EmitFallback32_64(ILEmitterCtx context, string name32, string name64)
+ {
+ OpCodeAlu64 op = (OpCodeAlu64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ if (op.RegisterSize == RegisterSize.Int32)
+ {
+ SoftFallback.EmitCall(context, name32);
+ }
+ else
+ {
+ SoftFallback.EmitCall(context, name64);
+ }
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Rev64(ILEmitterCtx context)
+ {
+ OpCodeAlu64 op = (OpCodeAlu64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.ReverseBytes64));
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Rorv(ILEmitterCtx context)
+ {
+ EmitDataLoadRn(context);
+ EmitDataLoadShift(context);
+
+ context.Emit(OpCodes.Shr_Un);
+
+ EmitDataLoadRn(context);
+
+ context.EmitLdc_I4(context.CurrOp.GetBitsCount());
+
+ EmitDataLoadShift(context);
+
+ context.Emit(OpCodes.Sub);
+ context.Emit(OpCodes.Shl);
+ context.Emit(OpCodes.Or);
+
+ EmitDataStore(context);
+ }
+
+ public static void Sdiv(ILEmitterCtx context) => EmitDiv(context, OpCodes.Div);
+ public static void Udiv(ILEmitterCtx context) => EmitDiv(context, OpCodes.Div_Un);
+
+ private static void EmitDiv(ILEmitterCtx context, OpCode ilOp)
+ {
+ //If Rm == 0, Rd = 0 (division by zero).
+ context.EmitLdc_I(0);
+
+ EmitDataLoadRm(context);
+
+ context.EmitLdc_I(0);
+
+ ILLabel badDiv = new ILLabel();
+
+ context.Emit(OpCodes.Beq_S, badDiv);
+ context.Emit(OpCodes.Pop);
+
+ if (ilOp == OpCodes.Div)
+ {
+ //If Rn == INT_MIN && Rm == -1, Rd = INT_MIN (overflow).
+ long intMin = 1L << (context.CurrOp.GetBitsCount() - 1);
+
+ context.EmitLdc_I(intMin);
+
+ EmitDataLoadRn(context);
+
+ context.EmitLdc_I(intMin);
+
+ context.Emit(OpCodes.Ceq);
+
+ EmitDataLoadRm(context);
+
+ context.EmitLdc_I(-1);
+
+ context.Emit(OpCodes.Ceq);
+ context.Emit(OpCodes.And);
+ context.Emit(OpCodes.Brtrue_S, badDiv);
+ context.Emit(OpCodes.Pop);
+ }
+
+ EmitDataLoadRn(context);
+ EmitDataLoadRm(context);
+
+ context.Emit(ilOp);
+
+ context.MarkLabel(badDiv);
+
+ EmitDataStore(context);
+ }
+
+ private static void EmitDataOp(ILEmitterCtx context, OpCode ilOp)
+ {
+ EmitDataLoadOpers(context);
+
+ context.Emit(ilOp);
+
+ EmitDataStore(context);
+ }
+
+ private static void EmitDataOpShift(ILEmitterCtx context, OpCode ilOp)
+ {
+ EmitDataLoadRn(context);
+ EmitDataLoadShift(context);
+
+ context.Emit(ilOp);
+
+ EmitDataStore(context);
+ }
+
+ private static void EmitDataLoadShift(ILEmitterCtx context)
+ {
+ EmitDataLoadRm(context);
+
+ context.EmitLdc_I(context.CurrOp.GetBitsCount() - 1);
+
+ context.Emit(OpCodes.And);
+
+ //Note: Only 32-bit shift values are valid, so when the value is 64 bits
+ //we need to cast it to a 32-bit integer. This is fine because we
+ //AND the value and only keep the lower 5 or 6 bits anyway; it
+ //would even fit in a byte.
+ if (context.CurrOp.RegisterSize != RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_I4);
+ }
+ }
+
+ private static void EmitZeroCvFlags(ILEmitterCtx context)
+ {
+ context.EmitLdc_I4(0);
+
+ context.EmitStflg((int)PState.VBit);
+
+ context.EmitLdc_I4(0);
+
+ context.EmitStflg((int)PState.CBit);
+ }
+ }
+}
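EmitDiv above guards the two cases where a managed division would throw: dividing by zero, which AArch64 defines as returning 0, and the signed long.MinValue / -1 case, which returns long.MinValue instead of trapping. Restated as plain scalar C# for reference only (SDiv64 is a hypothetical helper, not emitted code):

static long SDiv64(long rn, long rm)
{
    if (rm == 0)
    {
        return 0; // Division by zero yields zero rather than throwing.
    }

    if (rn == long.MinValue && rm == -1)
    {
        return long.MinValue; // Signed overflow case wraps to INT_MIN.
    }

    return rn / rm;
}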
diff --git a/ChocolArm64/Instructions/InstEmitAluHelper.cs b/ChocolArm64/Instructions/InstEmitAluHelper.cs
new file mode 100644
index 00000000..613dd234
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitAluHelper.cs
@@ -0,0 +1,212 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static class InstEmitAluHelper
+ {
+ public static void EmitAdcsCCheck(ILEmitterCtx context)
+ {
+ //C = (Rd == Rn && CIn) || Rd < Rn
+ context.EmitSttmp();
+ context.EmitLdtmp();
+ context.EmitLdtmp();
+
+ EmitDataLoadRn(context);
+
+ context.Emit(OpCodes.Ceq);
+
+ context.EmitLdflg((int)PState.CBit);
+
+ context.Emit(OpCodes.And);
+
+ context.EmitLdtmp();
+
+ EmitDataLoadRn(context);
+
+ context.Emit(OpCodes.Clt_Un);
+ context.Emit(OpCodes.Or);
+
+ context.EmitStflg((int)PState.CBit);
+ }
+
+ public static void EmitAddsCCheck(ILEmitterCtx context)
+ {
+ //C = Rd < Rn
+ context.Emit(OpCodes.Dup);
+
+ EmitDataLoadRn(context);
+
+ context.Emit(OpCodes.Clt_Un);
+
+ context.EmitStflg((int)PState.CBit);
+ }
+
+ public static void EmitAddsVCheck(ILEmitterCtx context)
+ {
+ //V = (Rd ^ Rn) & ~(Rn ^ Rm) < 0
+ context.Emit(OpCodes.Dup);
+
+ EmitDataLoadRn(context);
+
+ context.Emit(OpCodes.Xor);
+
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Xor);
+ context.Emit(OpCodes.Not);
+ context.Emit(OpCodes.And);
+
+ context.EmitLdc_I(0);
+
+ context.Emit(OpCodes.Clt);
+
+ context.EmitStflg((int)PState.VBit);
+ }
+
+ public static void EmitSbcsCCheck(ILEmitterCtx context)
+ {
+ //C = (Rn == Rm && CIn) || Rn > Rm
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Ceq);
+
+ context.EmitLdflg((int)PState.CBit);
+
+ context.Emit(OpCodes.And);
+
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Cgt_Un);
+ context.Emit(OpCodes.Or);
+
+ context.EmitStflg((int)PState.CBit);
+ }
+
+ public static void EmitSubsCCheck(ILEmitterCtx context)
+ {
+ //C = Rn == Rm || Rn > Rm = !(Rn < Rm)
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Clt_Un);
+
+ context.EmitLdc_I4(1);
+
+ context.Emit(OpCodes.Xor);
+
+ context.EmitStflg((int)PState.CBit);
+ }
+
+ public static void EmitSubsVCheck(ILEmitterCtx context)
+ {
+ //V = (Rd ^ Rn) & (Rn ^ Rm) < 0
+ context.Emit(OpCodes.Dup);
+
+ EmitDataLoadRn(context);
+
+ context.Emit(OpCodes.Xor);
+
+ EmitDataLoadOpers(context);
+
+ context.Emit(OpCodes.Xor);
+ context.Emit(OpCodes.And);
+
+ context.EmitLdc_I(0);
+
+ context.Emit(OpCodes.Clt);
+
+ context.EmitStflg((int)PState.VBit);
+ }
+
+ public static void EmitDataLoadRm(ILEmitterCtx context)
+ {
+ context.EmitLdintzr(((IOpCodeAluRs64)context.CurrOp).Rm);
+ }
+
+ public static void EmitDataLoadOpers(ILEmitterCtx context)
+ {
+ EmitDataLoadRn(context);
+ EmitDataLoadOper2(context);
+ }
+
+ public static void EmitDataLoadRn(ILEmitterCtx context)
+ {
+ IOpCodeAlu64 op = (IOpCodeAlu64)context.CurrOp;
+
+ if (op.DataOp == DataOp.Logical || op is IOpCodeAluRs64)
+ {
+ context.EmitLdintzr(op.Rn);
+ }
+ else
+ {
+ context.EmitLdint(op.Rn);
+ }
+ }
+
+ public static void EmitDataLoadOper2(ILEmitterCtx context)
+ {
+ switch (context.CurrOp)
+ {
+ case IOpCodeAluImm64 op:
+ context.EmitLdc_I(op.Imm);
+ break;
+
+ case IOpCodeAluRs64 op:
+ context.EmitLdintzr(op.Rm);
+
+ switch (op.ShiftType)
+ {
+ case ShiftType.Lsl: context.EmitLsl(op.Shift); break;
+ case ShiftType.Lsr: context.EmitLsr(op.Shift); break;
+ case ShiftType.Asr: context.EmitAsr(op.Shift); break;
+ case ShiftType.Ror: context.EmitRor(op.Shift); break;
+ }
+ break;
+
+ case IOpCodeAluRx64 op:
+ context.EmitLdintzr(op.Rm);
+ context.EmitCast(op.IntType);
+ context.EmitLsl(op.Shift);
+ break;
+ }
+ }
+
+ public static void EmitDataStore(ILEmitterCtx context) => EmitDataStore(context, false);
+ public static void EmitDataStoreS(ILEmitterCtx context) => EmitDataStore(context, true);
+
+ public static void EmitDataStore(ILEmitterCtx context, bool setFlags)
+ {
+ IOpCodeAlu64 op = (IOpCodeAlu64)context.CurrOp;
+
+ if (setFlags || op is IOpCodeAluRs64)
+ {
+ context.EmitStintzr(op.Rd);
+ }
+ else
+ {
+ context.EmitStint(op.Rd);
+ }
+ }
+
+ public static void EmitSetNzcv(ILEmitterCtx context, int nzcv)
+ {
+ context.EmitLdc_I4((nzcv >> 0) & 1);
+
+ context.EmitStflg((int)PState.VBit);
+
+ context.EmitLdc_I4((nzcv >> 1) & 1);
+
+ context.EmitStflg((int)PState.CBit);
+
+ context.EmitLdc_I4((nzcv >> 2) & 1);
+
+ context.EmitStflg((int)PState.ZBit);
+
+ context.EmitLdc_I4((nzcv >> 3) & 1);
+
+ context.EmitStflg((int)PState.NBit);
+ }
+ }
+}
\ No newline at end of file
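The C and V comments in the helpers above are the standard identities for unsigned carry and signed overflow after an addition. To make them concrete, here is a scalar restatement for the 64-bit ADDS case; AddsFlags64 is purely illustrative and not part of the project.

static (bool N, bool Z, bool C, bool V) AddsFlags64(ulong rn, ulong rm)
{
    ulong rd = rn + rm;

    bool n = (long)rd < 0;
    bool z = rd == 0;
    bool c = rd < rn;                            // C = Rd < Rn (unsigned wrap-around)
    bool v = (long)((rd ^ rn) & ~(rn ^ rm)) < 0; // V = (Rd ^ Rn) & ~(Rn ^ Rm) < 0

    return (n, z, c, v);
}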
diff --git a/ChocolArm64/Instructions/InstEmitBfm.cs b/ChocolArm64/Instructions/InstEmitBfm.cs
new file mode 100644
index 00000000..d25af8be
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitBfm.cs
@@ -0,0 +1,208 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Bfm(ILEmitterCtx context)
+ {
+ OpCodeBfm64 op = (OpCodeBfm64)context.CurrOp;
+
+ EmitBfmLoadRn(context);
+
+ context.EmitLdintzr(op.Rd);
+ context.EmitLdc_I(~op.WMask & op.TMask);
+
+ context.Emit(OpCodes.And);
+ context.Emit(OpCodes.Or);
+
+ context.EmitLdintzr(op.Rd);
+ context.EmitLdc_I(~op.TMask);
+
+ context.Emit(OpCodes.And);
+ context.Emit(OpCodes.Or);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Sbfm(ILEmitterCtx context)
+ {
+ OpCodeBfm64 op = (OpCodeBfm64)context.CurrOp;
+
+ int bitsCount = op.GetBitsCount();
+
+ if (op.Pos + 1 == bitsCount)
+ {
+ EmitSbfmShift(context);
+ }
+ else if (op.Pos < op.Shift)
+ {
+ EmitSbfiz(context);
+ }
+ else if (op.Pos == 7 && op.Shift == 0)
+ {
+ EmitSbfmCast(context, OpCodes.Conv_I1);
+ }
+ else if (op.Pos == 15 && op.Shift == 0)
+ {
+ EmitSbfmCast(context, OpCodes.Conv_I2);
+ }
+ else if (op.Pos == 31 && op.Shift == 0)
+ {
+ EmitSbfmCast(context, OpCodes.Conv_I4);
+ }
+ else
+ {
+ EmitBfmLoadRn(context);
+
+ context.EmitLdintzr(op.Rn);
+
+ context.EmitLsl(bitsCount - 1 - op.Pos);
+ context.EmitAsr(bitsCount - 1);
+
+ context.EmitLdc_I(~op.TMask);
+
+ context.Emit(OpCodes.And);
+ context.Emit(OpCodes.Or);
+
+ context.EmitStintzr(op.Rd);
+ }
+ }
+
+ public static void Ubfm(ILEmitterCtx context)
+ {
+ OpCodeBfm64 op = (OpCodeBfm64)context.CurrOp;
+
+ if (op.Pos + 1 == op.GetBitsCount())
+ {
+ EmitUbfmShift(context);
+ }
+ else if (op.Pos < op.Shift)
+ {
+ EmitUbfiz(context);
+ }
+ else if (op.Pos + 1 == op.Shift)
+ {
+ EmitBfmLsl(context);
+ }
+ else if (op.Pos == 7 && op.Shift == 0)
+ {
+ EmitUbfmCast(context, OpCodes.Conv_U1);
+ }
+ else if (op.Pos == 15 && op.Shift == 0)
+ {
+ EmitUbfmCast(context, OpCodes.Conv_U2);
+ }
+ else
+ {
+ EmitBfmLoadRn(context);
+
+ context.EmitStintzr(op.Rd);
+ }
+ }
+
+ private static void EmitSbfiz(ILEmitterCtx context) => EmitBfiz(context, true);
+ private static void EmitUbfiz(ILEmitterCtx context) => EmitBfiz(context, false);
+
+ private static void EmitBfiz(ILEmitterCtx context, bool signed)
+ {
+ OpCodeBfm64 op = (OpCodeBfm64)context.CurrOp;
+
+ int width = op.Pos + 1;
+
+ context.EmitLdintzr(op.Rn);
+
+ context.EmitLsl(op.GetBitsCount() - width);
+
+ if (signed)
+ {
+ context.EmitAsr(op.Shift - width);
+ }
+ else
+ {
+ context.EmitLsr(op.Shift - width);
+ }
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ private static void EmitSbfmCast(ILEmitterCtx context, OpCode ilOp)
+ {
+ EmitBfmCast(context, ilOp, true);
+ }
+
+ private static void EmitUbfmCast(ILEmitterCtx context, OpCode ilOp)
+ {
+ EmitBfmCast(context, ilOp, false);
+ }
+
+ private static void EmitBfmCast(ILEmitterCtx context, OpCode ilOp, bool signed)
+ {
+ OpCodeBfm64 op = (OpCodeBfm64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ context.Emit(ilOp);
+
+ if (op.RegisterSize != RegisterSize.Int32)
+ {
+ context.Emit(signed
+ ? OpCodes.Conv_I8
+ : OpCodes.Conv_U8);
+ }
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ private static void EmitSbfmShift(ILEmitterCtx context)
+ {
+ EmitBfmShift(context, true);
+ }
+
+ private static void EmitUbfmShift(ILEmitterCtx context)
+ {
+ EmitBfmShift(context, false);
+ }
+
+ private static void EmitBfmShift(ILEmitterCtx context, bool signed)
+ {
+ OpCodeBfm64 op = (OpCodeBfm64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+ context.EmitLdc_I4(op.Shift);
+
+ context.Emit(signed
+ ? OpCodes.Shr
+ : OpCodes.Shr_Un);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ private static void EmitBfmLsl(ILEmitterCtx context)
+ {
+ OpCodeBfm64 op = (OpCodeBfm64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ context.EmitLsl(op.GetBitsCount() - op.Shift);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ private static void EmitBfmLoadRn(ILEmitterCtx context)
+ {
+ OpCodeBfm64 op = (OpCodeBfm64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ context.EmitRor(op.Shift);
+
+ context.EmitLdc_I(op.WMask & op.TMask);
+
+ context.Emit(OpCodes.And);
+ }
+ }
+}
\ No newline at end of file
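The Pos/Shift special cases in Sbfm and Ubfm above correspond to the usual SBFM/UBFM aliases: Pos + 1 == bitsCount is a plain shift (ASR/LSR), and Pos == 7/15/31 with Shift == 0 are the SXTB/SXTH/SXTW and UXTB/UXTH forms, which is why a single IL narrowing conversion suffices. For example, the Conv_I1 path behaves like the scalar sketch below (SignExtendByte is illustrative only):

static long SignExtendByte(long rn)
{
    // Truncate to 8 bits and widen back; bit 7 is copied into bits 8-63.
    return (sbyte)rn;
}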
diff --git a/ChocolArm64/Instructions/InstEmitCcmp.cs b/ChocolArm64/Instructions/InstEmitCcmp.cs
new file mode 100644
index 00000000..b91104c9
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitCcmp.cs
@@ -0,0 +1,81 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+
+using static ChocolArm64.Instructions.InstEmitAluHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ private enum CcmpOp
+ {
+ Cmp,
+ Cmn
+ }
+
+ public static void Ccmn(ILEmitterCtx context) => EmitCcmp(context, CcmpOp.Cmn);
+ public static void Ccmp(ILEmitterCtx context) => EmitCcmp(context, CcmpOp.Cmp);
+
+ private static void EmitCcmp(ILEmitterCtx context, CcmpOp cmpOp)
+ {
+ OpCodeCcmp64 op = (OpCodeCcmp64)context.CurrOp;
+
+ ILLabel lblTrue = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ context.EmitCondBranch(lblTrue, op.Cond);
+
+ context.EmitLdc_I4((op.Nzcv >> 0) & 1);
+
+ context.EmitStflg((int)PState.VBit);
+
+ context.EmitLdc_I4((op.Nzcv >> 1) & 1);
+
+ context.EmitStflg((int)PState.CBit);
+
+ context.EmitLdc_I4((op.Nzcv >> 2) & 1);
+
+ context.EmitStflg((int)PState.ZBit);
+
+ context.EmitLdc_I4((op.Nzcv >> 3) & 1);
+
+ context.EmitStflg((int)PState.NBit);
+
+ context.Emit(OpCodes.Br_S, lblEnd);
+
+ context.MarkLabel(lblTrue);
+
+ EmitDataLoadOpers(context);
+
+ if (cmpOp == CcmpOp.Cmp)
+ {
+ context.Emit(OpCodes.Sub);
+
+ context.EmitZnFlagCheck();
+
+ EmitSubsCCheck(context);
+ EmitSubsVCheck(context);
+ }
+ else if (cmpOp == CcmpOp.Cmn)
+ {
+ context.Emit(OpCodes.Add);
+
+ context.EmitZnFlagCheck();
+
+ EmitAddsCCheck(context);
+ EmitAddsVCheck(context);
+ }
+ else
+ {
+ throw new ArgumentException(nameof(cmpOp));
+ }
+
+ context.Emit(OpCodes.Pop);
+
+ context.MarkLabel(lblEnd);
+ }
+ }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Instructions/InstEmitCsel.cs b/ChocolArm64/Instructions/InstEmitCsel.cs
new file mode 100644
index 00000000..19b073ce
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitCsel.cs
@@ -0,0 +1,58 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Translation;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ private enum CselOperation
+ {
+ None,
+ Increment,
+ Invert,
+ Negate
+ }
+
+ public static void Csel(ILEmitterCtx context) => EmitCsel(context, CselOperation.None);
+ public static void Csinc(ILEmitterCtx context) => EmitCsel(context, CselOperation.Increment);
+ public static void Csinv(ILEmitterCtx context) => EmitCsel(context, CselOperation.Invert);
+ public static void Csneg(ILEmitterCtx context) => EmitCsel(context, CselOperation.Negate);
+
+ private static void EmitCsel(ILEmitterCtx context, CselOperation cselOp)
+ {
+ OpCodeCsel64 op = (OpCodeCsel64)context.CurrOp;
+
+ ILLabel lblTrue = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ context.EmitCondBranch(lblTrue, op.Cond);
+ context.EmitLdintzr(op.Rm);
+
+ if (cselOp == CselOperation.Increment)
+ {
+ context.EmitLdc_I(1);
+
+ context.Emit(OpCodes.Add);
+ }
+ else if (cselOp == CselOperation.Invert)
+ {
+ context.Emit(OpCodes.Not);
+ }
+ else if (cselOp == CselOperation.Negate)
+ {
+ context.Emit(OpCodes.Neg);
+ }
+
+ context.Emit(OpCodes.Br_S, lblEnd);
+
+ context.MarkLabel(lblTrue);
+
+ context.EmitLdintzr(op.Rn);
+
+ context.MarkLabel(lblEnd);
+
+ context.EmitStintzr(op.Rd);
+ }
+ }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Instructions/InstEmitException.cs b/ChocolArm64/Instructions/InstEmitException.cs
new file mode 100644
index 00000000..8325a397
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitException.cs
@@ -0,0 +1,86 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Brk(ILEmitterCtx context)
+ {
+ EmitExceptionCall(context, nameof(CpuThreadState.OnBreak));
+ }
+
+ public static void Svc(ILEmitterCtx context)
+ {
+ EmitExceptionCall(context, nameof(CpuThreadState.OnSvcCall));
+ }
+
+ private static void EmitExceptionCall(ILEmitterCtx context, string mthdName)
+ {
+ OpCodeException64 op = (OpCodeException64)context.CurrOp;
+
+ context.EmitStoreState();
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ context.EmitLdc_I8(op.Position);
+ context.EmitLdc_I4(op.Id);
+
+ context.EmitPrivateCall(typeof(CpuThreadState), mthdName);
+
+ //Check if the thread should still be running; if it isn't, then we return 0
+ //to force a return to the dispatcher and then exit the thread.
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ context.EmitCallPropGet(typeof(CpuThreadState), nameof(CpuThreadState.Running));
+
+ ILLabel lblEnd = new ILLabel();
+
+ context.Emit(OpCodes.Brtrue_S, lblEnd);
+
+ context.EmitLdc_I8(0);
+
+ context.Emit(OpCodes.Ret);
+
+ context.MarkLabel(lblEnd);
+
+ if (context.CurrBlock.Next != null)
+ {
+ context.EmitLoadState(context.CurrBlock.Next);
+ }
+ else
+ {
+ context.EmitLdc_I8(op.Position + 4);
+
+ context.Emit(OpCodes.Ret);
+ }
+ }
+
+ public static void Und(ILEmitterCtx context)
+ {
+ OpCode64 op = context.CurrOp;
+
+ context.EmitStoreState();
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ context.EmitLdc_I8(op.Position);
+ context.EmitLdc_I4(op.RawOpCode);
+
+ context.EmitPrivateCall(typeof(CpuThreadState), nameof(CpuThreadState.OnUndefined));
+
+ if (context.CurrBlock.Next != null)
+ {
+ context.EmitLoadState(context.CurrBlock.Next);
+ }
+ else
+ {
+ context.EmitLdc_I8(op.Position + 4);
+
+ context.Emit(OpCodes.Ret);
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Instructions/InstEmitFlow.cs b/ChocolArm64/Instructions/InstEmitFlow.cs
new file mode 100644
index 00000000..7d0897cd
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitFlow.cs
@@ -0,0 +1,189 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void B(ILEmitterCtx context)
+ {
+ OpCodeBImmAl64 op = (OpCodeBImmAl64)context.CurrOp;
+
+ if (context.CurrBlock.Branch != null)
+ {
+ context.Emit(OpCodes.Br, context.GetLabel(op.Imm));
+ }
+ else
+ {
+ context.EmitStoreState();
+ context.EmitLdc_I8(op.Imm);
+
+ context.Emit(OpCodes.Ret);
+ }
+ }
+
+ public static void B_Cond(ILEmitterCtx context)
+ {
+ OpCodeBImmCond64 op = (OpCodeBImmCond64)context.CurrOp;
+
+ EmitBranch(context, op.Cond);
+ }
+
+ public static void Bl(ILEmitterCtx context)
+ {
+ OpCodeBImmAl64 op = (OpCodeBImmAl64)context.CurrOp;
+
+ context.EmitLdc_I(op.Position + 4);
+ context.EmitStint(CpuThreadState.LrIndex);
+ context.EmitStoreState();
+
+ if (context.TryOptEmitSubroutineCall())
+ {
+ //Note: the return value of the called method is left
+ //on the stack; it is always an Int64 holding the
+ //return address of the function. We check whether the address is
+ //correct, and if it isn't we keep returning until we reach the dispatcher.
+ context.Emit(OpCodes.Dup);
+
+ context.EmitLdc_I8(op.Position + 4);
+
+ ILLabel lblContinue = new ILLabel();
+
+ context.Emit(OpCodes.Beq_S, lblContinue);
+ context.Emit(OpCodes.Ret);
+
+ context.MarkLabel(lblContinue);
+
+ context.Emit(OpCodes.Pop);
+
+ context.EmitLoadState(context.CurrBlock.Next);
+ }
+ else
+ {
+ context.EmitLdc_I8(op.Imm);
+
+ context.Emit(OpCodes.Ret);
+ }
+ }
+
+ public static void Blr(ILEmitterCtx context)
+ {
+ OpCodeBReg64 op = (OpCodeBReg64)context.CurrOp;
+
+ context.EmitLdc_I(op.Position + 4);
+ context.EmitStint(CpuThreadState.LrIndex);
+ context.EmitStoreState();
+ context.EmitLdintzr(op.Rn);
+
+ context.Emit(OpCodes.Ret);
+ }
+
+ public static void Br(ILEmitterCtx context)
+ {
+ OpCodeBReg64 op = (OpCodeBReg64)context.CurrOp;
+
+ context.EmitStoreState();
+ context.EmitLdintzr(op.Rn);
+
+ context.Emit(OpCodes.Ret);
+ }
+
+ public static void Cbnz(ILEmitterCtx context) => EmitCb(context, OpCodes.Bne_Un);
+ public static void Cbz(ILEmitterCtx context) => EmitCb(context, OpCodes.Beq);
+
+ private static void EmitCb(ILEmitterCtx context, OpCode ilOp)
+ {
+ OpCodeBImmCmp64 op = (OpCodeBImmCmp64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rt);
+ context.EmitLdc_I(0);
+
+ EmitBranch(context, ilOp);
+ }
+
+ public static void Ret(ILEmitterCtx context)
+ {
+ context.EmitStoreState();
+ context.EmitLdint(CpuThreadState.LrIndex);
+
+ context.Emit(OpCodes.Ret);
+ }
+
+ public static void Tbnz(ILEmitterCtx context) => EmitTb(context, OpCodes.Bne_Un);
+ public static void Tbz(ILEmitterCtx context) => EmitTb(context, OpCodes.Beq);
+
+ private static void EmitTb(ILEmitterCtx context, OpCode ilOp)
+ {
+ OpCodeBImmTest64 op = (OpCodeBImmTest64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rt);
+ context.EmitLdc_I(1L << op.Pos);
+
+ context.Emit(OpCodes.And);
+
+ context.EmitLdc_I(0);
+
+ EmitBranch(context, ilOp);
+ }
+
+ private static void EmitBranch(ILEmitterCtx context, Cond cond)
+ {
+ OpCodeBImm64 op = (OpCodeBImm64)context.CurrOp;
+
+ if (context.CurrBlock.Next != null &&
+ context.CurrBlock.Branch != null)
+ {
+ context.EmitCondBranch(context.GetLabel(op.Imm), cond);
+ }
+ else
+ {
+ context.EmitStoreState();
+
+ ILLabel lblTaken = new ILLabel();
+
+ context.EmitCondBranch(lblTaken, cond);
+
+ context.EmitLdc_I8(op.Position + 4);
+
+ context.Emit(OpCodes.Ret);
+
+ context.MarkLabel(lblTaken);
+
+ context.EmitLdc_I8(op.Imm);
+
+ context.Emit(OpCodes.Ret);
+ }
+ }
+
+ private static void EmitBranch(ILEmitterCtx context, OpCode ilOp)
+ {
+ OpCodeBImm64 op = (OpCodeBImm64)context.CurrOp;
+
+ if (context.CurrBlock.Next != null &&
+ context.CurrBlock.Branch != null)
+ {
+ context.Emit(ilOp, context.GetLabel(op.Imm));
+ }
+ else
+ {
+ context.EmitStoreState();
+
+ ILLabel lblTaken = new ILLabel();
+
+ context.Emit(ilOp, lblTaken);
+
+ context.EmitLdc_I8(op.Position + 4);
+
+ context.Emit(OpCodes.Ret);
+
+ context.MarkLabel(lblTaken);
+
+ context.EmitLdc_I8(op.Imm);
+
+ context.Emit(OpCodes.Ret);
+ }
+ }
+ }
+}
\ No newline at end of file
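The comment inside Bl describes how an inlined subroutine call is validated: the callee leaves its return address on the evaluation stack, and execution only falls through when that address equals op.Position + 4; any other value keeps propagating upward until the dispatcher regains control. A much simplified shape of that check is sketched below; CallAndCheck and its Func<long> callee are stand-ins, since the real translated subroutines also take state and memory arguments.

using System;

static long CallAndCheck(Func<long> callee, long expectedReturnAddress)
{
    long actual = callee();

    if (actual != expectedReturnAddress)
    {
        return actual; // Keep unwinding until the dispatcher handles it.
    }

    return expectedReturnAddress; // Matches op.Position + 4: continue with the next block.
}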
diff --git a/ChocolArm64/Instructions/InstEmitHash.cs b/ChocolArm64/Instructions/InstEmitHash.cs
new file mode 100644
index 00000000..7e21a886
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitHash.cs
@@ -0,0 +1,115 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Crc32b(ILEmitterCtx context)
+ {
+ EmitCrc32(context, nameof(SoftFallback.Crc32B));
+ }
+
+ public static void Crc32h(ILEmitterCtx context)
+ {
+ EmitCrc32(context, nameof(SoftFallback.Crc32H));
+ }
+
+ public static void Crc32w(ILEmitterCtx context)
+ {
+ EmitCrc32(context, nameof(SoftFallback.Crc32W));
+ }
+
+ public static void Crc32x(ILEmitterCtx context)
+ {
+ EmitCrc32(context, nameof(SoftFallback.Crc32X));
+ }
+
+ public static void Crc32cb(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse42)
+ {
+ EmitSse42Crc32(context, typeof(uint), typeof(byte));
+ }
+ else
+ {
+ EmitCrc32(context, nameof(SoftFallback.Crc32Cb));
+ }
+ }
+
+ public static void Crc32ch(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse42)
+ {
+ EmitSse42Crc32(context, typeof(uint), typeof(ushort));
+ }
+ else
+ {
+ EmitCrc32(context, nameof(SoftFallback.Crc32Ch));
+ }
+ }
+
+ public static void Crc32cw(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse42)
+ {
+ EmitSse42Crc32(context, typeof(uint), typeof(uint));
+ }
+ else
+ {
+ EmitCrc32(context, nameof(SoftFallback.Crc32Cw));
+ }
+ }
+
+ public static void Crc32cx(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse42)
+ {
+ EmitSse42Crc32(context, typeof(ulong), typeof(ulong));
+ }
+ else
+ {
+ EmitCrc32(context, nameof(SoftFallback.Crc32Cx));
+ }
+ }
+
+ private static void EmitSse42Crc32(ILEmitterCtx context, Type tCrc, Type tData)
+ {
+ OpCodeAluRs64 op = (OpCodeAluRs64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+ context.EmitLdintzr(op.Rm);
+
+ context.EmitCall(typeof(Sse42).GetMethod(nameof(Sse42.Crc32), new Type[] { tCrc, tData }));
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ private static void EmitCrc32(ILEmitterCtx context, string name)
+ {
+ OpCodeAluRs64 op = (OpCodeAluRs64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ if (op.RegisterSize != RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_U4);
+ }
+
+ context.EmitLdintzr(op.Rm);
+
+ SoftFallback.EmitCall(context, name);
+
+ if (op.RegisterSize != RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_U8);
+ }
+
+ context.EmitStintzr(op.Rd);
+ }
+ }
+}
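Crc32cb/ch/cw/cx above use the Sse42.Crc32 intrinsic when available and otherwise fall back to SoftFallback.Crc32C*. For reference, a bitwise software equivalent of one CRC-32C byte step (reflected Castagnoli polynomial 0x82F63B78) is sketched below; Crc32CByteStep is a hypothetical helper and not the project's actual fallback implementation.

static uint Crc32CByteStep(uint crc, byte data)
{
    crc ^= data;

    for (int bit = 0; bit < 8; bit++)
    {
        // Shift right and conditionally XOR the reflected Castagnoli polynomial.
        crc = (crc & 1) != 0 ? (crc >> 1) ^ 0x82F63B78u : crc >> 1;
    }

    return crc;
}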
diff --git a/ChocolArm64/Instructions/InstEmitMemory.cs b/ChocolArm64/Instructions/InstEmitMemory.cs
new file mode 100644
index 00000000..96e45b3f
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitMemory.cs
@@ -0,0 +1,252 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Translation;
+using System.Reflection.Emit;
+
+using static ChocolArm64.Instructions.InstEmitMemoryHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Adr(ILEmitterCtx context)
+ {
+ OpCodeAdr64 op = (OpCodeAdr64)context.CurrOp;
+
+ context.EmitLdc_I(op.Position + op.Imm);
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Adrp(ILEmitterCtx context)
+ {
+ OpCodeAdr64 op = (OpCodeAdr64)context.CurrOp;
+
+ context.EmitLdc_I((op.Position & ~0xfffL) + (op.Imm << 12));
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Ldr(ILEmitterCtx context) => EmitLdr(context, false);
+ public static void Ldrs(ILEmitterCtx context) => EmitLdr(context, true);
+
+ private static void EmitLdr(ILEmitterCtx context, bool signed)
+ {
+ OpCodeMem64 op = (OpCodeMem64)context.CurrOp;
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+
+ EmitLoadAddress(context);
+
+ if (signed && op.Extend64)
+ {
+ EmitReadSx64Call(context, op.Size);
+ }
+ else if (signed)
+ {
+ EmitReadSx32Call(context, op.Size);
+ }
+ else
+ {
+ EmitReadZxCall(context, op.Size);
+ }
+
+ if (op is IOpCodeSimd64)
+ {
+ context.EmitStvec(op.Rt);
+ }
+ else
+ {
+ context.EmitStintzr(op.Rt);
+ }
+
+ EmitWBackIfNeeded(context);
+ }
+
+ public static void LdrLit(ILEmitterCtx context)
+ {
+ IOpCodeLit64 op = (IOpCodeLit64)context.CurrOp;
+
+ if (op.Prefetch)
+ {
+ return;
+ }
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdc_I8(op.Imm);
+
+ if (op.Signed)
+ {
+ EmitReadSx64Call(context, op.Size);
+ }
+ else
+ {
+ EmitReadZxCall(context, op.Size);
+ }
+
+ if (op is IOpCodeSimd64)
+ {
+ context.EmitStvec(op.Rt);
+ }
+ else
+ {
+ context.EmitStint(op.Rt);
+ }
+ }
+
+ public static void Ldp(ILEmitterCtx context)
+ {
+ OpCodeMemPair64 op = (OpCodeMemPair64)context.CurrOp;
+
+ void EmitReadAndStore(int rt)
+ {
+ if (op.Extend64)
+ {
+ EmitReadSx64Call(context, op.Size);
+ }
+ else
+ {
+ EmitReadZxCall(context, op.Size);
+ }
+
+ if (op is IOpCodeSimd64)
+ {
+ context.EmitStvec(rt);
+ }
+ else
+ {
+ context.EmitStintzr(rt);
+ }
+ }
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+
+ EmitLoadAddress(context);
+
+ EmitReadAndStore(op.Rt);
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdtmp();
+ context.EmitLdc_I8(1 << op.Size);
+
+ context.Emit(OpCodes.Add);
+
+ EmitReadAndStore(op.Rt2);
+
+ EmitWBackIfNeeded(context);
+ }
+
+ public static void Str(ILEmitterCtx context)
+ {
+ OpCodeMem64 op = (OpCodeMem64)context.CurrOp;
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+
+ EmitLoadAddress(context);
+
+ if (op is IOpCodeSimd64)
+ {
+ context.EmitLdvec(op.Rt);
+ }
+ else
+ {
+ context.EmitLdintzr(op.Rt);
+ }
+
+ EmitWriteCall(context, op.Size);
+
+ EmitWBackIfNeeded(context);
+ }
+
+ public static void Stp(ILEmitterCtx context)
+ {
+ OpCodeMemPair64 op = (OpCodeMemPair64)context.CurrOp;
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+
+ EmitLoadAddress(context);
+
+ if (op is IOpCodeSimd64)
+ {
+ context.EmitLdvec(op.Rt);
+ }
+ else
+ {
+ context.EmitLdintzr(op.Rt);
+ }
+
+ EmitWriteCall(context, op.Size);
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdtmp();
+ context.EmitLdc_I8(1 << op.Size);
+
+ context.Emit(OpCodes.Add);
+
+ if (op is IOpCodeSimd64)
+ {
+ context.EmitLdvec(op.Rt2);
+ }
+ else
+ {
+ context.EmitLdintzr(op.Rt2);
+ }
+
+ EmitWriteCall(context, op.Size);
+
+ EmitWBackIfNeeded(context);
+ }
+
+ private static void EmitLoadAddress(ILEmitterCtx context)
+ {
+ switch (context.CurrOp)
+ {
+ case OpCodeMemImm64 op:
+ context.EmitLdint(op.Rn);
+
+ if (!op.PostIdx)
+ {
+ //Pre-indexing.
+ context.EmitLdc_I(op.Imm);
+
+ context.Emit(OpCodes.Add);
+ }
+ break;
+
+ case OpCodeMemReg64 op:
+ context.EmitLdint(op.Rn);
+ context.EmitLdintzr(op.Rm);
+ context.EmitCast(op.IntType);
+
+ if (op.Shift)
+ {
+ context.EmitLsl(op.Size);
+ }
+
+ context.Emit(OpCodes.Add);
+ break;
+ }
+
+ //Save address to Scratch var since the register value may change.
+ context.Emit(OpCodes.Dup);
+
+ context.EmitSttmp();
+ }
+
+ private static void EmitWBackIfNeeded(ILEmitterCtx context)
+ {
+ //Check whether the current OpCode has post-indexed write back, and if so emit it.
+ //Note: OpCodeMemPair64 inherits from OpCodeMemImm64, so this works for both.
+ if (context.CurrOp is OpCodeMemImm64 op && op.WBack)
+ {
+ context.EmitLdtmp();
+
+ if (op.PostIdx)
+ {
+ context.EmitLdc_I(op.Imm);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ context.EmitStint(op.Rn);
+ }
+ }
+ }
+} \ No newline at end of file
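The Ldr/Str/Ldp/Stp emitters in this file all funnel their addressing through EmitLoadAddress and EmitWBackIfNeeded: pre-indexed forms add the immediate before the access, post-indexed forms access at the unmodified base and only add the immediate during write-back. A minimal C# sketch of that rule, purely for illustration (the helper below is not part of the emitted code; assume x is the integer register file and read performs the memory access):

    using System;

    static class AddressingSketch
    {
        //Illustrative only: models the pre/post-index and write-back behaviour
        //that EmitLoadAddress and EmitWBackIfNeeded encode in IL.
        public static ulong LoadImm(ulong[] x, int rn, long imm, bool postIdx, bool wback, Func<ulong, ulong> read)
        {
            //Post-indexed: access at the unmodified base. Pre-indexed: base + imm.
            ulong address = postIdx ? x[rn] : x[rn] + (ulong)imm;

            ulong value = read(address);

            if (wback)
            {
                //Post-index adds the immediate during write-back; pre-index already added it.
                x[rn] = postIdx ? address + (ulong)imm : address;
            }

            return value;
        }
    }

Stores and the pair forms follow the same rule, which is why Ldp/Stp only add 1 << op.Size on top of the saved scratch address when accessing the second register.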
diff --git a/ChocolArm64/Instructions/InstEmitMemoryEx.cs b/ChocolArm64/Instructions/InstEmitMemoryEx.cs
new file mode 100644
index 00000000..42daca63
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitMemoryEx.cs
@@ -0,0 +1,192 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Memory;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+using System.Threading;
+
+using static ChocolArm64.Instructions.InstEmitMemoryHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ [Flags]
+ private enum AccessType
+ {
+ None = 0,
+ Ordered = 1,
+ Exclusive = 2,
+ OrderedEx = Ordered | Exclusive
+ }
+
+ public static void Clrex(ILEmitterCtx context)
+ {
+ EmitMemoryCall(context, nameof(MemoryManager.ClearExclusive));
+ }
+
+ public static void Dmb(ILEmitterCtx context) => EmitBarrier(context);
+ public static void Dsb(ILEmitterCtx context) => EmitBarrier(context);
+
+ public static void Ldar(ILEmitterCtx context) => EmitLdr(context, AccessType.Ordered);
+ public static void Ldaxr(ILEmitterCtx context) => EmitLdr(context, AccessType.OrderedEx);
+ public static void Ldxr(ILEmitterCtx context) => EmitLdr(context, AccessType.Exclusive);
+ public static void Ldxp(ILEmitterCtx context) => EmitLdp(context, AccessType.Exclusive);
+ public static void Ldaxp(ILEmitterCtx context) => EmitLdp(context, AccessType.OrderedEx);
+
+ private static void EmitLdr(ILEmitterCtx context, AccessType accType)
+ {
+ EmitLoad(context, accType, false);
+ }
+
+ private static void EmitLdp(ILEmitterCtx context, AccessType accType)
+ {
+ EmitLoad(context, accType, true);
+ }
+
+ private static void EmitLoad(ILEmitterCtx context, AccessType accType, bool pair)
+ {
+ OpCodeMemEx64 op = (OpCodeMemEx64)context.CurrOp;
+
+ bool ordered = (accType & AccessType.Ordered) != 0;
+ bool exclusive = (accType & AccessType.Exclusive) != 0;
+
+ if (ordered)
+ {
+ EmitBarrier(context);
+ }
+
+ if (exclusive)
+ {
+ EmitMemoryCall(context, nameof(MemoryManager.SetExclusive), op.Rn);
+ }
+
+ context.EmitLdint(op.Rn);
+ context.EmitSttmp();
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdtmp();
+
+ EmitReadZxCall(context, op.Size);
+
+ context.EmitStintzr(op.Rt);
+
+ if (pair)
+ {
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdtmp();
+ context.EmitLdc_I8(1 << op.Size);
+
+ context.Emit(OpCodes.Add);
+
+ EmitReadZxCall(context, op.Size);
+
+ context.EmitStintzr(op.Rt2);
+ }
+ }
+
+ public static void Pfrm(ILEmitterCtx context)
+ {
+ //Memory prefetch; executed as a no-op.
+ }
+
+ public static void Stlr(ILEmitterCtx context) => EmitStr(context, AccessType.Ordered);
+ public static void Stlxr(ILEmitterCtx context) => EmitStr(context, AccessType.OrderedEx);
+ public static void Stxr(ILEmitterCtx context) => EmitStr(context, AccessType.Exclusive);
+ public static void Stxp(ILEmitterCtx context) => EmitStp(context, AccessType.Exclusive);
+ public static void Stlxp(ILEmitterCtx context) => EmitStp(context, AccessType.OrderedEx);
+
+ private static void EmitStr(ILEmitterCtx context, AccessType accType)
+ {
+ EmitStore(context, accType, false);
+ }
+
+ private static void EmitStp(ILEmitterCtx context, AccessType accType)
+ {
+ EmitStore(context, accType, true);
+ }
+
+ private static void EmitStore(ILEmitterCtx context, AccessType accType, bool pair)
+ {
+ OpCodeMemEx64 op = (OpCodeMemEx64)context.CurrOp;
+
+ bool ordered = (accType & AccessType.Ordered) != 0;
+ bool exclusive = (accType & AccessType.Exclusive) != 0;
+
+ if (ordered)
+ {
+ EmitBarrier(context);
+ }
+
+ ILLabel lblEx = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ if (exclusive)
+ {
+ EmitMemoryCall(context, nameof(MemoryManager.TestExclusive), op.Rn);
+
+ context.Emit(OpCodes.Brtrue_S, lblEx);
+
+ context.EmitLdc_I8(1);
+ context.EmitStintzr(op.Rs);
+
+ context.Emit(OpCodes.Br_S, lblEnd);
+ }
+
+ context.MarkLabel(lblEx);
+
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdint(op.Rn);
+ context.EmitLdintzr(op.Rt);
+
+ EmitWriteCall(context, op.Size);
+
+ if (pair)
+ {
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdint(op.Rn);
+ context.EmitLdc_I8(1 << op.Size);
+
+ context.Emit(OpCodes.Add);
+
+ context.EmitLdintzr(op.Rt2);
+
+ EmitWriteCall(context, op.Size);
+ }
+
+ if (exclusive)
+ {
+ context.EmitLdc_I8(0);
+ context.EmitStintzr(op.Rs);
+
+ EmitMemoryCall(context, nameof(MemoryManager.ClearExclusiveForStore));
+ }
+
+ context.MarkLabel(lblEnd);
+ }
+
+ private static void EmitMemoryCall(ILEmitterCtx context, string name, int rn = -1)
+ {
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ context.EmitCallPropGet(typeof(CpuThreadState), nameof(CpuThreadState.Core));
+
+ if (rn != -1)
+ {
+ context.EmitLdint(rn);
+ }
+
+ context.EmitCall(typeof(MemoryManager), name);
+ }
+
+ private static void EmitBarrier(ILEmitterCtx context)
+ {
+ //Note: This barrier is most likely unnecessary, and makes no practical
+ //difference here, since every read or write already goes through the
+ //software MMU emulation anyway.
+ context.EmitCall(typeof(Thread), nameof(Thread.MemoryBarrier));
+ }
+ }
+} \ No newline at end of file
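The exclusive-access emitters above delegate the monitor bookkeeping to MemoryManager.SetExclusive, TestExclusive and ClearExclusiveForStore. As a rough sketch of the protocol those calls stand for (the ExclusiveMonitorSketch type and its members are illustrative assumptions, not MemoryManager's actual API): a load-exclusive reserves the address for the current core, and a store-exclusive only writes when the reservation still holds, reporting the result in Rs.

    using System;
    using System.Collections.Generic;

    //Illustrative model only: the reservation protocol the emitters rely on.
    class ExclusiveMonitorSketch
    {
        private readonly Dictionary<int, ulong> _reservations = new Dictionary<int, ulong>();

        public ulong LoadExclusive(int core, ulong address, Func<ulong, ulong> read)
        {
            _reservations[core] = address; //SetExclusive
            return read(address);
        }

        //Returns 0 on success and 1 on failure, matching the status written to Rs.
        public int StoreExclusive(int core, ulong address, ulong value, Action<ulong, ulong> write)
        {
            if (!_reservations.TryGetValue(core, out ulong reserved) || reserved != address)
            {
                return 1; //TestExclusive failed: nothing is written.
            }

            write(address, value);
            _reservations.Remove(core); //ClearExclusiveForStore

            return 0;
        }
    }

Clrex simply drops the current core's reservation, which is why it only needs the ClearExclusive call with no register operands.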
diff --git a/ChocolArm64/Instructions/InstEmitMemoryHelper.cs b/ChocolArm64/Instructions/InstEmitMemoryHelper.cs
new file mode 100644
index 00000000..f953564c
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitMemoryHelper.cs
@@ -0,0 +1,138 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Memory;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static class InstEmitMemoryHelper
+ {
+ private enum Extension
+ {
+ Zx,
+ Sx32,
+ Sx64
+ }
+
+ public static void EmitReadZxCall(ILEmitterCtx context, int size)
+ {
+ EmitReadCall(context, Extension.Zx, size);
+ }
+
+ public static void EmitReadSx32Call(ILEmitterCtx context, int size)
+ {
+ EmitReadCall(context, Extension.Sx32, size);
+ }
+
+ public static void EmitReadSx64Call(ILEmitterCtx context, int size)
+ {
+ EmitReadCall(context, Extension.Sx64, size);
+ }
+
+ private static void EmitReadCall(ILEmitterCtx context, Extension ext, int size)
+ {
+ bool isSimd = GetIsSimd(context);
+
+ string name = null;
+
+ if (size < 0 || size > (isSimd ? 4 : 3))
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ if (isSimd)
+ {
+ switch (size)
+ {
+ case 0: name = nameof(MemoryManager.ReadVector8); break;
+ case 1: name = nameof(MemoryManager.ReadVector16); break;
+ case 2: name = nameof(MemoryManager.ReadVector32); break;
+ case 3: name = nameof(MemoryManager.ReadVector64); break;
+ case 4: name = nameof(MemoryManager.ReadVector128); break;
+ }
+ }
+ else
+ {
+ switch (size)
+ {
+ case 0: name = nameof(MemoryManager.ReadByte); break;
+ case 1: name = nameof(MemoryManager.ReadUInt16); break;
+ case 2: name = nameof(MemoryManager.ReadUInt32); break;
+ case 3: name = nameof(MemoryManager.ReadUInt64); break;
+ }
+ }
+
+ context.EmitCall(typeof(MemoryManager), name);
+
+ if (!isSimd)
+ {
+ if (ext == Extension.Sx32 ||
+ ext == Extension.Sx64)
+ {
+ switch (size)
+ {
+ case 0: context.Emit(OpCodes.Conv_I1); break;
+ case 1: context.Emit(OpCodes.Conv_I2); break;
+ case 2: context.Emit(OpCodes.Conv_I4); break;
+ }
+ }
+
+ if (size < 3)
+ {
+ context.Emit(ext == Extension.Sx64
+ ? OpCodes.Conv_I8
+ : OpCodes.Conv_U8);
+ }
+ }
+ }
+
+ public static void EmitWriteCall(ILEmitterCtx context, int size)
+ {
+ bool isSimd = GetIsSimd(context);
+
+ string name = null;
+
+ if (size < 0 || size > (isSimd ? 4 : 3))
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ if (size < 3 && !isSimd)
+ {
+ context.Emit(OpCodes.Conv_I4);
+ }
+
+ if (isSimd)
+ {
+ switch (size)
+ {
+ case 0: name = nameof(MemoryManager.WriteVector8); break;
+ case 1: name = nameof(MemoryManager.WriteVector16); break;
+ case 2: name = nameof(MemoryManager.WriteVector32); break;
+ case 3: name = nameof(MemoryManager.WriteVector64); break;
+ case 4: name = nameof(MemoryManager.WriteVector128); break;
+ }
+ }
+ else
+ {
+ switch (size)
+ {
+ case 0: name = nameof(MemoryManager.WriteByte); break;
+ case 1: name = nameof(MemoryManager.WriteUInt16); break;
+ case 2: name = nameof(MemoryManager.WriteUInt32); break;
+ case 3: name = nameof(MemoryManager.WriteUInt64); break;
+ }
+ }
+
+ context.EmitCall(typeof(MemoryManager), name);
+ }
+
+ private static bool GetIsSimd(ILEmitterCtx context)
+ {
+ return context.CurrOp is IOpCodeSimd64 &&
+ !(context.CurrOp is OpCodeSimdMemMs64 ||
+ context.CurrOp is OpCodeSimdMemSs64);
+ }
+ }
+} \ No newline at end of file
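EmitReadCall picks the MemoryManager read method from the access size and then fixes the result up with Conv_* opcodes. A plain C# sketch of what that conversion sequence amounts to for the non-SIMD case (the helper is hypothetical, shown only to spell out the Zx/Sx32/Sx64 rules):

    using System;

    static class ReadExtendSketch
    {
        private enum Extension { Zx, Sx32, Sx64 }

        //Illustrative only: the effect of the Conv_I1/I2/I4 + Conv_I8/Conv_U8
        //sequence emitted above for an integer load of (1 << size) bytes.
        private static ulong ExtendLoaded(ulong raw, int size, Extension ext)
        {
            bool signed = ext != Extension.Zx;

            long v;

            switch (size)
            {
                case 0: v = signed ? (sbyte)raw : (long)(byte)raw;   break; //Conv_I1
                case 1: v = signed ? (short)raw : (long)(ushort)raw; break; //Conv_I2
                case 2: v = signed ? (int)raw   : (long)(uint)raw;   break; //Conv_I4
                case 3: v = (long)raw;                               break;
                default: throw new ArgumentOutOfRangeException(nameof(size));
            }

            //Conv_I8 keeps the sign only for Sx64; Conv_U8 zero-extends for Zx and Sx32.
            return size < 3 && ext != Extension.Sx64 ? (uint)v : (ulong)v;
        }
    }

For example, a sign-extending halfword load of raw value 0x8000 yields 0x00000000FFFF8000 under Sx32 (W destination) and 0xFFFFFFFFFFFF8000 under Sx64 (X destination).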
diff --git a/ChocolArm64/Instructions/InstEmitMove.cs b/ChocolArm64/Instructions/InstEmitMove.cs
new file mode 100644
index 00000000..be3e8e2d
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitMove.cs
@@ -0,0 +1,41 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Translation;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Movk(ILEmitterCtx context)
+ {
+ OpCodeMov64 op = (OpCodeMov64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rd);
+ context.EmitLdc_I(~(0xffffL << op.Pos));
+
+ context.Emit(OpCodes.And);
+
+ context.EmitLdc_I(op.Imm);
+
+ context.Emit(OpCodes.Or);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Movn(ILEmitterCtx context)
+ {
+ OpCodeMov64 op = (OpCodeMov64)context.CurrOp;
+
+ context.EmitLdc_I(~op.Imm);
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Movz(ILEmitterCtx context)
+ {
+ OpCodeMov64 op = (OpCodeMov64)context.CurrOp;
+
+ context.EmitLdc_I(op.Imm);
+ context.EmitStintzr(op.Rd);
+ }
+ }
+} \ No newline at end of file
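Movz, Movn and Movk above cover the whole wide-immediate move family. Since op.Imm already holds the 16-bit field shifted into place (the Movk mask built from op.Pos above implies this), their effect on the destination reduces to three one-liners. A sketch for illustration, mirroring the IL (the helper names are hypothetical):

    static class MoveWideSketch
    {
        //Illustrative only: imm is the 16-bit field already shifted to bit position pos.
        public static ulong Movz(long imm)                    => (ulong)imm;                               //Xd = imm
        public static ulong Movn(long imm)                    => (ulong)~imm;                              //Xd = NOT(imm)
        public static ulong Movk(ulong xd, long imm, int pos) => (xd & ~(0xffffUL << pos)) | (ulong)imm;   //replace one halfword, keep the rest
    }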
diff --git a/ChocolArm64/Instructions/InstEmitMul.cs b/ChocolArm64/Instructions/InstEmitMul.cs
new file mode 100644
index 00000000..b7418e69
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitMul.cs
@@ -0,0 +1,80 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Translation;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Madd(ILEmitterCtx context) => EmitMul(context, OpCodes.Add);
+ public static void Msub(ILEmitterCtx context) => EmitMul(context, OpCodes.Sub);
+
+ private static void EmitMul(ILEmitterCtx context, OpCode ilOp)
+ {
+ OpCodeMul64 op = (OpCodeMul64)context.CurrOp;
+
+ context.EmitLdintzr(op.Ra);
+ context.EmitLdintzr(op.Rn);
+ context.EmitLdintzr(op.Rm);
+
+ context.Emit(OpCodes.Mul);
+ context.Emit(ilOp);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Smaddl(ILEmitterCtx context) => EmitMull(context, OpCodes.Add, true);
+ public static void Smsubl(ILEmitterCtx context) => EmitMull(context, OpCodes.Sub, true);
+ public static void Umaddl(ILEmitterCtx context) => EmitMull(context, OpCodes.Add, false);
+ public static void Umsubl(ILEmitterCtx context) => EmitMull(context, OpCodes.Sub, false);
+
+ private static void EmitMull(ILEmitterCtx context, OpCode addSubOp, bool signed)
+ {
+ OpCodeMul64 op = (OpCodeMul64)context.CurrOp;
+
+ OpCode castOp = signed
+ ? OpCodes.Conv_I8
+ : OpCodes.Conv_U8;
+
+ context.EmitLdintzr(op.Ra);
+ context.EmitLdintzr(op.Rn);
+
+ context.Emit(OpCodes.Conv_I4);
+ context.Emit(castOp);
+
+ context.EmitLdintzr(op.Rm);
+
+ context.Emit(OpCodes.Conv_I4);
+ context.Emit(castOp);
+ context.Emit(OpCodes.Mul);
+
+ context.Emit(addSubOp);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Smulh(ILEmitterCtx context)
+ {
+ OpCodeMul64 op = (OpCodeMul64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+ context.EmitLdintzr(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.SMulHi128));
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Umulh(ILEmitterCtx context)
+ {
+ OpCodeMul64 op = (OpCodeMul64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+ context.EmitLdintzr(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.UMulHi128));
+
+ context.EmitStintzr(op.Rd);
+ }
+ }
+} \ No newline at end of file
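Smulh and Umulh fall back to SoftFallback.SMulHi128/UMulHi128, which live elsewhere in this commit (SoftFallback.cs). For reference, a sketch of the 64x64 -> upper-64 computation such a helper has to perform when no 128-bit multiply is available; this is an illustration, not the SoftFallback implementation:

    static class MulHighSketch
    {
        //Illustrative only: upper 64 bits of an unsigned 64x64 multiply,
        //built from 32-bit halves with explicit carry propagation.
        public static ulong UMulHi(ulong a, ulong b)
        {
            ulong aLo = (uint)a, aHi = a >> 32;
            ulong bLo = (uint)b, bHi = b >> 32;

            ulong lo   = aLo * bLo;
            ulong mid1 = aHi * bLo + (lo >> 32);
            ulong mid2 = aLo * bHi + (uint)mid1;

            return aHi * bHi + (mid1 >> 32) + (mid2 >> 32);
        }

        //The signed high half can be derived from the unsigned one.
        public static long SMulHi(long a, long b)
        {
            ulong high = UMulHi((ulong)a, (ulong)b);

            if (a < 0) high -= (ulong)b;
            if (b < 0) high -= (ulong)a;

            return (long)high;
        }
    }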
diff --git a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
new file mode 100644
index 00000000..9217de5f
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
@@ -0,0 +1,2387 @@
+// https://github.com/intel/ARM_NEON_2_x86_SSE/blob/master/NEON_2_SSE.h
+
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+using static ChocolArm64.Instructions.InstEmitSimdHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Abs_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpSx(context, () => EmitAbs(context));
+ }
+
+ public static void Abs_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpSx(context, () => EmitAbs(context));
+ }
+
+ public static void Add_S(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpZx(context, () => context.Emit(OpCodes.Add));
+ }
+
+ public static void Add_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse2)
+ {
+ EmitSse2Op(context, nameof(Sse2.Add));
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () => context.Emit(OpCodes.Add));
+ }
+ }
+
+ public static void Addhn_V(ILEmitterCtx context)
+ {
+ EmitHighNarrow(context, () => context.Emit(OpCodes.Add), round: false);
+ }
+
+ public static void Addp_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitVectorExtractZx(context, op.Rn, 0, op.Size);
+ EmitVectorExtractZx(context, op.Rn, 1, op.Size);
+
+ context.Emit(OpCodes.Add);
+
+ EmitScalarSet(context, op.Rd, op.Size);
+ }
+
+ public static void Addp_V(ILEmitterCtx context)
+ {
+ EmitVectorPairwiseOpZx(context, () => context.Emit(OpCodes.Add));
+ }
+
+ public static void Addv_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ EmitVectorExtractZx(context, op.Rn, 0, op.Size);
+
+ for (int index = 1; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ EmitScalarSet(context, op.Rd, op.Size);
+ }
+
+ public static void Cls_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ int eSize = 8 << op.Size;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+ context.EmitLdc_I4(eSize);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.CountLeadingSigns));
+
+ EmitVectorInsert(context, op.Rd, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void Clz_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ int eSize = 8 << op.Size;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+ if (Lzcnt.IsSupported && eSize == 32)
+ {
+ context.Emit(OpCodes.Conv_U4);
+
+ context.EmitCall(typeof(Lzcnt).GetMethod(nameof(Lzcnt.LeadingZeroCount), new Type[] { typeof(uint) }));
+
+ context.Emit(OpCodes.Conv_U8);
+ }
+ else
+ {
+ context.EmitLdc_I4(eSize);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.CountLeadingZeros));
+ }
+
+ EmitVectorInsert(context, op.Rd, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void Cnt_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int elems = op.RegisterSize == RegisterSize.Simd128 ? 16 : 8;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, 0);
+
+ if (Popcnt.IsSupported)
+ {
+ context.EmitCall(typeof(Popcnt).GetMethod(nameof(Popcnt.PopCount), new Type[] { typeof(ulong) }));
+ }
+ else
+ {
+ SoftFallback.EmitCall(context, nameof(SoftFallback.CountSetBits8));
+ }
+
+ EmitVectorInsert(context, op.Rd, index, 0);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void Fabd_S(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+
+ EmitUnaryMathCall(context, nameof(Math.Abs));
+ });
+ }
+
+ public static void Fabs_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpF(context, () =>
+ {
+ EmitUnaryMathCall(context, nameof(Math.Abs));
+ });
+ }
+
+ public static void Fabs_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpF(context, () =>
+ {
+ EmitUnaryMathCall(context, nameof(Math.Abs));
+ });
+ }
+
+ public static void Fadd_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.AddScalar));
+ }
+ else
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd));
+ });
+ }
+ }
+
+ public static void Fadd_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.Add));
+ }
+ else
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd));
+ });
+ }
+ }
+
+ public static void Faddp_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ EmitVectorExtractF(context, op.Rn, 0, sizeF);
+ EmitVectorExtractF(context, op.Rn, 1, sizeF);
+
+ context.Emit(OpCodes.Add);
+
+ EmitScalarSetF(context, op.Rd, sizeF);
+ }
+
+ public static void Faddp_V(ILEmitterCtx context)
+ {
+ EmitVectorPairwiseOpF(context, () => context.Emit(OpCodes.Add));
+ }
+
+ public static void Fdiv_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.DivideScalar));
+ }
+ else
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv));
+ });
+ }
+ }
+
+ public static void Fdiv_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.Divide));
+ }
+ else
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv));
+ });
+ }
+ }
+
+ public static void Fmadd_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (op.Size == 0)
+ {
+ Type[] typesMulAdd = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+ context.EmitLdvec(op.Ra);
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MultiplyScalar), typesMulAdd));
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AddScalar), typesMulAdd));
+
+ context.EmitStvec(op.Rd);
+
+ EmitVectorZero32_128(context, op.Rd);
+ }
+ else /* if (op.Size == 1) */
+ {
+ Type[] typesMulAdd = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+ EmitLdvecWithCastToDouble(context, op.Ra);
+ EmitLdvecWithCastToDouble(context, op.Rn);
+ EmitLdvecWithCastToDouble(context, op.Rm);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyScalar), typesMulAdd));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AddScalar), typesMulAdd));
+
+ EmitStvecWithCastFromDouble(context, op.Rd);
+
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitScalarTernaryRaOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd));
+ });
+ }
+ }
+
+ public static void Fmax_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.MaxScalar));
+ }
+ else
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax));
+ });
+ }
+ }
+
+ public static void Fmax_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.Max));
+ }
+ else
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax));
+ });
+ }
+ }
+
+ public static void Fmaxnm_S(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum));
+ });
+ }
+
+ public static void Fmaxnm_V(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum));
+ });
+ }
+
+ public static void Fmaxp_V(ILEmitterCtx context)
+ {
+ EmitVectorPairwiseOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax));
+ });
+ }
+
+ public static void Fmin_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.MinScalar));
+ }
+ else
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin));
+ });
+ }
+ }
+
+ public static void Fmin_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.Min));
+ }
+ else
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin));
+ });
+ }
+ }
+
+ public static void Fminnm_S(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum));
+ });
+ }
+
+ public static void Fminnm_V(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum));
+ });
+ }
+
+ public static void Fminp_V(ILEmitterCtx context)
+ {
+ EmitVectorPairwiseOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin));
+ });
+ }
+
+ public static void Fmla_Se(ILEmitterCtx context)
+ {
+ EmitScalarTernaryOpByElemF(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Fmla_V(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpF(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Fmla_Ve(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpByElemF(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Fmls_Se(ILEmitterCtx context)
+ {
+ EmitScalarTernaryOpByElemF(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Sub);
+ });
+ }
+
+ public static void Fmls_V(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpF(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Sub);
+ });
+ }
+
+ public static void Fmls_Ve(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpByElemF(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Sub);
+ });
+ }
+
+ public static void Fmsub_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (op.Size == 0)
+ {
+ Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+ context.EmitLdvec(op.Ra);
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MultiplyScalar), typesMulSub));
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SubtractScalar), typesMulSub));
+
+ context.EmitStvec(op.Rd);
+
+ EmitVectorZero32_128(context, op.Rd);
+ }
+ else /* if (op.Size == 1) */
+ {
+ Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+ EmitLdvecWithCastToDouble(context, op.Ra);
+ EmitLdvecWithCastToDouble(context, op.Rn);
+ EmitLdvecWithCastToDouble(context, op.Rm);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyScalar), typesMulSub));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SubtractScalar), typesMulSub));
+
+ EmitStvecWithCastFromDouble(context, op.Rd);
+
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitScalarTernaryRaOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub));
+ });
+ }
+ }
+
+ public static void Fmul_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.MultiplyScalar));
+ }
+ else
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul));
+ });
+ }
+ }
+
+ public static void Fmul_Se(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpByElemF(context, () => context.Emit(OpCodes.Mul));
+ }
+
+ public static void Fmul_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.Multiply));
+ }
+ else
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul));
+ });
+ }
+ }
+
+ public static void Fmul_Ve(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpByElemF(context, () => context.Emit(OpCodes.Mul));
+ }
+
+ public static void Fmulx_S(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX));
+ });
+ }
+
+ public static void Fmulx_Se(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpByElemF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX));
+ });
+ }
+
+ public static void Fmulx_V(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX));
+ });
+ }
+
+ public static void Fmulx_Ve(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpByElemF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX));
+ });
+ }
+
+ public static void Fneg_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpF(context, () => context.Emit(OpCodes.Neg));
+ }
+
+ public static void Fneg_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpF(context, () => context.Emit(OpCodes.Neg));
+ }
+
+ public static void Fnmadd_S(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ EmitVectorExtractF(context, op.Rn, 0, sizeF);
+
+ context.Emit(OpCodes.Neg);
+
+ EmitVectorExtractF(context, op.Rm, 0, sizeF);
+
+ context.Emit(OpCodes.Mul);
+
+ EmitVectorExtractF(context, op.Ra, 0, sizeF);
+
+ context.Emit(OpCodes.Sub);
+
+ EmitScalarSetF(context, op.Rd, sizeF);
+ }
+
+ public static void Fnmsub_S(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ EmitVectorExtractF(context, op.Rn, 0, sizeF);
+ EmitVectorExtractF(context, op.Rm, 0, sizeF);
+
+ context.Emit(OpCodes.Mul);
+
+ EmitVectorExtractF(context, op.Ra, 0, sizeF);
+
+ context.Emit(OpCodes.Sub);
+
+ EmitScalarSetF(context, op.Rd, sizeF);
+ }
+
+ public static void Fnmul_S(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Neg);
+ });
+ }
+
+ public static void Frecpe_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpF(context, () =>
+ {
+ EmitUnarySoftFloatCall(context, nameof(SoftFloat.RecipEstimate));
+ });
+ }
+
+ public static void Frecpe_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpF(context, () =>
+ {
+ EmitUnarySoftFloatCall(context, nameof(SoftFloat.RecipEstimate));
+ });
+ }
+
+ public static void Frecps_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ if (sizeF == 0)
+ {
+ Type[] typesSsv = new Type[] { typeof(float) };
+ Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+ context.EmitLdc_R4(2f);
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
+
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MultiplyScalar), typesMulSub));
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SubtractScalar), typesMulSub));
+
+ context.EmitStvec(op.Rd);
+
+ EmitVectorZero32_128(context, op.Rd);
+ }
+ else /* if (sizeF == 1) */
+ {
+ Type[] typesSsv = new Type[] { typeof(double) };
+ Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+ context.EmitLdc_R8(2d);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
+
+ EmitLdvecWithCastToDouble(context, op.Rn);
+ EmitLdvecWithCastToDouble(context, op.Rm);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyScalar), typesMulSub));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SubtractScalar), typesMulSub));
+
+ EmitStvecWithCastFromDouble(context, op.Rd);
+
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStepFused));
+ });
+ }
+ }
+
+ public static void Frecps_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ if (sizeF == 0)
+ {
+ Type[] typesSav = new Type[] { typeof(float) };
+ Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+ context.EmitLdc_R4(2f);
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
+
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulSub));
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesMulSub));
+
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else /* if (sizeF == 1) */
+ {
+ Type[] typesSav = new Type[] { typeof(double) };
+ Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+ context.EmitLdc_R8(2d);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+ EmitLdvecWithCastToDouble(context, op.Rn);
+ EmitLdvecWithCastToDouble(context, op.Rm);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulSub));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
+
+ EmitStvecWithCastFromDouble(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStepFused));
+ });
+ }
+ }
+
+ public static void Frecpx_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecpX));
+ });
+ }
+
+ public static void Frinta_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitVectorExtractF(context, op.Rn, 0, op.Size);
+
+ EmitRoundMathCall(context, MidpointRounding.AwayFromZero);
+
+ EmitScalarSetF(context, op.Rd, op.Size);
+ }
+
+ public static void Frinta_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpF(context, () =>
+ {
+ EmitRoundMathCall(context, MidpointRounding.AwayFromZero);
+ });
+ }
+
+ public static void Frinti_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitScalarUnaryOpF(context, () =>
+ {
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ if (op.Size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.RoundF));
+ }
+ else if (op.Size == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.Round));
+ }
+ else
+ {
+ throw new InvalidOperationException();
+ }
+ });
+ }
+
+ public static void Frinti_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ EmitVectorUnaryOpF(context, () =>
+ {
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ if (sizeF == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.RoundF));
+ }
+ else if (sizeF == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.Round));
+ }
+ else
+ {
+ throw new InvalidOperationException();
+ }
+ });
+ }
+
+ public static void Frintm_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpF(context, () =>
+ {
+ EmitUnaryMathCall(context, nameof(Math.Floor));
+ });
+ }
+
+ public static void Frintm_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpF(context, () =>
+ {
+ EmitUnaryMathCall(context, nameof(Math.Floor));
+ });
+ }
+
+ public static void Frintn_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitVectorExtractF(context, op.Rn, 0, op.Size);
+
+ EmitRoundMathCall(context, MidpointRounding.ToEven);
+
+ EmitScalarSetF(context, op.Rd, op.Size);
+ }
+
+ public static void Frintn_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpF(context, () =>
+ {
+ EmitRoundMathCall(context, MidpointRounding.ToEven);
+ });
+ }
+
+ public static void Frintp_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpF(context, () =>
+ {
+ EmitUnaryMathCall(context, nameof(Math.Ceiling));
+ });
+ }
+
+ public static void Frintp_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpF(context, () =>
+ {
+ EmitUnaryMathCall(context, nameof(Math.Ceiling));
+ });
+ }
+
+ public static void Frintx_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitScalarUnaryOpF(context, () =>
+ {
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ if (op.Size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.RoundF));
+ }
+ else if (op.Size == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.Round));
+ }
+ else
+ {
+ throw new InvalidOperationException();
+ }
+ });
+ }
+
+ public static void Frintx_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitVectorUnaryOpF(context, () =>
+ {
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ if (op.Size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.RoundF));
+ }
+ else if (op.Size == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.Round));
+ }
+ else
+ {
+ throw new InvalidOperationException();
+ }
+ });
+ }
+
+ public static void Frsqrte_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpF(context, () =>
+ {
+ EmitUnarySoftFloatCall(context, nameof(SoftFloat.InvSqrtEstimate));
+ });
+ }
+
+ public static void Frsqrte_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpF(context, () =>
+ {
+ EmitUnarySoftFloatCall(context, nameof(SoftFloat.InvSqrtEstimate));
+ });
+ }
+
+ public static void Frsqrts_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ if (sizeF == 0)
+ {
+ Type[] typesSsv = new Type[] { typeof(float) };
+ Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+ context.EmitLdc_R4(0.5f);
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
+
+ context.EmitLdc_R4(3f);
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
+
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MultiplyScalar), typesMulSub));
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SubtractScalar), typesMulSub));
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MultiplyScalar), typesMulSub));
+
+ context.EmitStvec(op.Rd);
+
+ EmitVectorZero32_128(context, op.Rd);
+ }
+ else /* if (sizeF == 1) */
+ {
+ Type[] typesSsv = new Type[] { typeof(double) };
+ Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+ context.EmitLdc_R8(0.5d);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
+
+ context.EmitLdc_R8(3d);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
+
+ EmitLdvecWithCastToDouble(context, op.Rn);
+ EmitLdvecWithCastToDouble(context, op.Rm);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyScalar), typesMulSub));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SubtractScalar), typesMulSub));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyScalar), typesMulSub));
+
+ EmitStvecWithCastFromDouble(context, op.Rd);
+
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FprSqrtStepFused));
+ });
+ }
+ }
+
+ public static void Frsqrts_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse2)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ if (sizeF == 0)
+ {
+ Type[] typesSav = new Type[] { typeof(float) };
+ Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+ context.EmitLdc_R4(0.5f);
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
+
+ context.EmitLdc_R4(3f);
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
+
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulSub));
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesMulSub));
+ context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulSub));
+
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else /* if (sizeF == 1) */
+ {
+ Type[] typesSav = new Type[] { typeof(double) };
+ Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+ context.EmitLdc_R8(0.5d);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+ context.EmitLdc_R8(3d);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+ EmitLdvecWithCastToDouble(context, op.Rn);
+ EmitLdvecWithCastToDouble(context, op.Rm);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulSub));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulSub));
+
+ EmitStvecWithCastFromDouble(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FprSqrtStepFused));
+ });
+ }
+ }
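The SSE paths of Frecps_* and Frsqrts_* above build the ARM step values directly from constants: 2 - a*b for the reciprocal step and (3 - a*b) / 2 for the reciprocal-square-root step, the Newton-Raphson factors used together with the FRECPE/FRSQRTE estimates. As a plain sketch of just that arithmetic (the real SoftFloat32 routines additionally handle NaNs, infinities and FPCR modes):

    static class FusedStepSketch
    {
        //Illustrative only: the fused step values the SSE sequences compute per element.
        public static double RecipStep(double a, double b) => 2.0 - a * b;          //FRECPS
        public static double RSqrtStep(double a, double b) => (3.0 - a * b) * 0.5;  //FRSQRTS
    }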
+
+ public static void Fsqrt_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.SqrtScalar));
+ }
+ else
+ {
+ EmitScalarUnaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt));
+ });
+ }
+ }
+
+ public static void Fsqrt_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.Sqrt));
+ }
+ else
+ {
+ EmitVectorUnaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt));
+ });
+ }
+ }
+
+ public static void Fsub_S(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.SubtractScalar));
+ }
+ else
+ {
+ EmitScalarBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub));
+ });
+ }
+ }
+
+ public static void Fsub_V(ILEmitterCtx context)
+ {
+ if (Optimizations.FastFP && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.Subtract));
+ }
+ else
+ {
+ EmitVectorBinaryOpF(context, () =>
+ {
+ EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub));
+ });
+ }
+ }
+
+ public static void Mla_V(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Mla_Ve(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpByElemZx(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Mls_V(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Sub);
+ });
+ }
+
+ public static void Mls_Ve(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpByElemZx(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Sub);
+ });
+ }
+
+ public static void Mul_V(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
+ }
+
+ public static void Mul_Ve(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpByElemZx(context, () => context.Emit(OpCodes.Mul));
+ }
+
+ public static void Neg_S(ILEmitterCtx context)
+ {
+ EmitScalarUnaryOpSx(context, () => context.Emit(OpCodes.Neg));
+ }
+
+ public static void Neg_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpSx(context, () => context.Emit(OpCodes.Neg));
+ }
+
+ public static void Raddhn_V(ILEmitterCtx context)
+ {
+ EmitHighNarrow(context, () => context.Emit(OpCodes.Add), round: true);
+ }
+
+ public static void Rsubhn_V(ILEmitterCtx context)
+ {
+ EmitHighNarrow(context, () => context.Emit(OpCodes.Sub), round: true);
+ }
+
+ public static void Saba_V(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+ EmitAbs(context);
+
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Sabal_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRnRmTernaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+ EmitAbs(context);
+
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Sabd_V(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+ EmitAbs(context);
+ });
+ }
+
+ public static void Sabdl_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRnRmBinaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+ EmitAbs(context);
+ });
+ }
+
+ public static void Sadalp_V(ILEmitterCtx context)
+ {
+ EmitAddLongPairwise(context, signed: true, accumulate: true);
+ }
+
+ public static void Saddl_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse41)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ Type[] typesSrl = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] };
+ Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
+ VectorIntTypesPerSizeLog2[op.Size + 1] };
+
+ string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
+ nameof(Sse41.ConvertToVector128Int32),
+ nameof(Sse41.ConvertToVector128Int64) };
+
+ int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size + 1);
+ }
+ else
+ {
+ EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Add));
+ }
+ }
+
+ public static void Saddlp_V(ILEmitterCtx context)
+ {
+ EmitAddLongPairwise(context, signed: true, accumulate: false);
+ }
+
+ public static void Saddw_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Add));
+ }
+
+ public static void Shadd_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0)
+ {
+ Type[] typesSra = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesAndXorAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp();
+
+ EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp2();
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesAndXorAdd));
+
+ context.EmitLdvectmp();
+ context.EmitLdvectmp2();
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesAndXorAdd));
+
+ context.EmitLdc_I4(1);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesSra));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAndXorAdd));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Add);
+
+ context.Emit(OpCodes.Ldc_I4_1);
+ context.Emit(OpCodes.Shr);
+ });
+ }
+ }
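The SSE2 path of Shadd_V avoids widening each element by using the carry-splitting identity a + b = 2*(a & b) + (a ^ b), so the signed halving add becomes (a & b) + ((a ^ b) >> 1); that is exactly what the And/Xor/ShiftRightArithmetic/Add sequence above computes per element (the unsigned form of the identity uses a logical shift instead). A one-element sketch, purely illustrative:

    static class HalvingAddSketch
    {
        //Illustrative only: floor((a + b) / 2) without a wider intermediate,
        //matching the per-element And/Xor/arithmetic-shift sequence above.
        public static short SHAdd(short a, short b)
        {
            return (short)((a & b) + ((a ^ b) >> 1));
        }
    }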
+
+ public static void Shsub_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size < 2)
+ {
+ Type[] typesSav = new Type[] { IntTypesPerSizeLog2[op.Size] };
+ Type[] typesAddSub = new Type[] { VectorIntTypesPerSizeLog2 [op.Size], VectorIntTypesPerSizeLog2 [op.Size] };
+ Type[] typesAvg = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+
+ context.EmitLdc_I4(op.Size == 0 ? sbyte.MinValue : short.MinValue);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+ context.EmitStvectmp();
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+ context.EmitLdvectmp();
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAddSub));
+
+ context.Emit(OpCodes.Dup);
+
+ EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+ context.EmitLdvectmp();
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAddSub));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Average), typesAvg));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesAddSub));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+
+ context.Emit(OpCodes.Ldc_I4_1);
+ context.Emit(OpCodes.Shr);
+ });
+ }
+ }
+
+ public static void Smax_V(ILEmitterCtx context)
+ {
+ Type[] types = new Type[] { typeof(long), typeof(long) };
+
+ MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
+
+ EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
+ }
+
+ public static void Smaxp_V(ILEmitterCtx context)
+ {
+ Type[] types = new Type[] { typeof(long), typeof(long) };
+
+ MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
+
+ EmitVectorPairwiseOpSx(context, () => context.EmitCall(mthdInfo));
+ }
+
+ public static void Smin_V(ILEmitterCtx context)
+ {
+ Type[] types = new Type[] { typeof(long), typeof(long) };
+
+ MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
+
+ EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
+ }
+
+ public static void Sminp_V(ILEmitterCtx context)
+ {
+ Type[] types = new Type[] { typeof(long), typeof(long) };
+
+ MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
+
+ EmitVectorPairwiseOpSx(context, () => context.EmitCall(mthdInfo));
+ }
+
+ public static void Smlal_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse41 && op.Size < 2)
+ {
+ Type[] typesSrl = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] };
+ Type[] typesMulAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
+ VectorIntTypesPerSizeLog2[op.Size + 1] };
+
+ Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+ string nameCvt = op.Size == 0
+ ? nameof(Sse41.ConvertToVector128Int16)
+ : nameof(Sse41.ConvertToVector128Int32);
+
+ int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+ EmitLdvecWithSignedCast(context, op.Rd, op.Size + 1);
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
+
+ EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
+
+ context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesMulAdd));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size + 1);
+ }
+ else
+ {
+ EmitVectorWidenRnRmTernaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Add);
+ });
+ }
+ }
+
+ public static void Smlsl_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse41 && op.Size < 2)
+ {
+ Type[] typesSrl = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] };
+ Type[] typesMulSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
+ VectorIntTypesPerSizeLog2[op.Size + 1] };
+
+ Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+ string nameCvt = op.Size == 0
+ ? nameof(Sse41.ConvertToVector128Int16)
+ : nameof(Sse41.ConvertToVector128Int32);
+
+ int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+ EmitLdvecWithSignedCast(context, op.Rd, op.Size + 1);
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
+
+ EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
+
+ context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size + 1);
+ }
+ else
+ {
+ EmitVectorWidenRnRmTernaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Sub);
+ });
+ }
+ }
+
+ public static void Smull_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Mul));
+ }
+
+ public static void Sqabs_S(ILEmitterCtx context)
+ {
+ EmitScalarSaturatingUnaryOpSx(context, () => EmitAbs(context));
+ }
+
+ public static void Sqabs_V(ILEmitterCtx context)
+ {
+ EmitVectorSaturatingUnaryOpSx(context, () => EmitAbs(context));
+ }
+
+ public static void Sqadd_S(ILEmitterCtx context)
+ {
+ EmitScalarSaturatingBinaryOpSx(context, SaturatingFlags.Add);
+ }
+
+ public static void Sqadd_V(ILEmitterCtx context)
+ {
+ EmitVectorSaturatingBinaryOpSx(context, SaturatingFlags.Add);
+ }
+
+ public static void Sqdmulh_S(ILEmitterCtx context)
+ {
+ EmitSaturatingBinaryOp(context, () => EmitDoublingMultiplyHighHalf(context, round: false), SaturatingFlags.ScalarSx);
+ }
+
+ public static void Sqdmulh_V(ILEmitterCtx context)
+ {
+ EmitSaturatingBinaryOp(context, () => EmitDoublingMultiplyHighHalf(context, round: false), SaturatingFlags.VectorSx);
+ }
+
+ public static void Sqneg_S(ILEmitterCtx context)
+ {
+ EmitScalarSaturatingUnaryOpSx(context, () => context.Emit(OpCodes.Neg));
+ }
+
+ public static void Sqneg_V(ILEmitterCtx context)
+ {
+ EmitVectorSaturatingUnaryOpSx(context, () => context.Emit(OpCodes.Neg));
+ }
+
+ public static void Sqrdmulh_S(ILEmitterCtx context)
+ {
+ EmitSaturatingBinaryOp(context, () => EmitDoublingMultiplyHighHalf(context, round: true), SaturatingFlags.ScalarSx);
+ }
+
+ public static void Sqrdmulh_V(ILEmitterCtx context)
+ {
+ EmitSaturatingBinaryOp(context, () => EmitDoublingMultiplyHighHalf(context, round: true), SaturatingFlags.VectorSx);
+ }
+
+ public static void Sqsub_S(ILEmitterCtx context)
+ {
+ EmitScalarSaturatingBinaryOpSx(context, SaturatingFlags.Sub);
+ }
+
+ public static void Sqsub_V(ILEmitterCtx context)
+ {
+ EmitVectorSaturatingBinaryOpSx(context, SaturatingFlags.Sub);
+ }
+
+ public static void Sqxtn_S(ILEmitterCtx context)
+ {
+ EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarSxSx);
+ }
+
+ public static void Sqxtn_V(ILEmitterCtx context)
+ {
+ EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorSxSx);
+ }
+
+ public static void Sqxtun_S(ILEmitterCtx context)
+ {
+ EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarSxZx);
+ }
+
+ public static void Sqxtun_V(ILEmitterCtx context)
+ {
+ EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorSxZx);
+ }
+
+ public static void Srhadd_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size < 2)
+ {
+ Type[] typesSav = new Type[] { IntTypesPerSizeLog2[op.Size] };
+ Type[] typesSubAdd = new Type[] { VectorIntTypesPerSizeLog2 [op.Size], VectorIntTypesPerSizeLog2 [op.Size] };
+ Type[] typesAvg = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+
+ context.EmitLdc_I4(op.Size == 0 ? sbyte.MinValue : short.MinValue);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp();
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+ context.EmitLdvectmp();
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSubAdd));
+
+ EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+ context.EmitLdvectmp();
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSubAdd));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Average), typesAvg));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesSubAdd));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpSx(context, () =>
+ {
+ context.Emit(OpCodes.Add);
+
+ context.Emit(OpCodes.Ldc_I4_1);
+ context.Emit(OpCodes.Add);
+
+ context.Emit(OpCodes.Ldc_I4_1);
+ context.Emit(OpCodes.Shr);
+ });
+ }
+ }
+
+ public static void Ssubl_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse41)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ Type[] typesSrl = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] };
+ Type[] typesSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
+ VectorIntTypesPerSizeLog2[op.Size + 1] };
+
+ string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
+ nameof(Sse41.ConvertToVector128Int32),
+ nameof(Sse41.ConvertToVector128Int64) };
+
+ int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size + 1);
+ }
+ else
+ {
+ EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Sub));
+ }
+ }
+
+ public static void Ssubw_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Sub));
+ }
+
+ public static void Sub_S(ILEmitterCtx context)
+ {
+ EmitScalarBinaryOpZx(context, () => context.Emit(OpCodes.Sub));
+ }
+
+ public static void Sub_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse2)
+ {
+ EmitSse2Op(context, nameof(Sse2.Subtract));
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () => context.Emit(OpCodes.Sub));
+ }
+ }
+
+ public static void Subhn_V(ILEmitterCtx context)
+ {
+ EmitHighNarrow(context, () => context.Emit(OpCodes.Sub), round: false);
+ }
+
+ public static void Suqadd_S(ILEmitterCtx context)
+ {
+ EmitScalarSaturatingBinaryOpSx(context, SaturatingFlags.Accumulate);
+ }
+
+ public static void Suqadd_V(ILEmitterCtx context)
+ {
+ EmitVectorSaturatingBinaryOpSx(context, SaturatingFlags.Accumulate);
+ }
+
+ public static void Uaba_V(ILEmitterCtx context)
+ {
+ EmitVectorTernaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+ EmitAbs(context);
+
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Uabal_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRnRmTernaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+ EmitAbs(context);
+
+ context.Emit(OpCodes.Add);
+ });
+ }
+
+ public static void Uabd_V(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+ EmitAbs(context);
+ });
+ }
+
+ public static void Uabdl_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRnRmBinaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+ EmitAbs(context);
+ });
+ }
+
+ public static void Uadalp_V(ILEmitterCtx context)
+ {
+ EmitAddLongPairwise(context, signed: false, accumulate: true);
+ }
+
+ public static void Uaddl_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse41)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] };
+ Type[] typesAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1],
+ VectorUIntTypesPerSizeLog2[op.Size + 1] };
+
+ string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
+ nameof(Sse41.ConvertToVector128Int32),
+ nameof(Sse41.ConvertToVector128Int64) };
+
+ int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1);
+ }
+ else
+ {
+ EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Add));
+ }
+ }
+
+ public static void Uaddlp_V(ILEmitterCtx context)
+ {
+ EmitAddLongPairwise(context, signed: false, accumulate: false);
+ }
+
+ public static void Uaddlv_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ EmitVectorExtractZx(context, op.Rn, 0, op.Size);
+
+ for (int index = 1; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ EmitScalarSet(context, op.Rd, op.Size + 1);
+ }
+
+ public static void Uaddw_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Add));
+ }
+
+ public static void Uhadd_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0)
+ {
+ Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesAndXorAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+
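+                //(a + b) >> 1 is computed as (a & b) + ((a ^ b) >> 1) so the sum never overflows the
+                //element; Sse2.ShiftRightLogical has no byte element form, hence the op.Size > 0 check.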
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp();
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp2();
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesAndXorAdd));
+
+ context.EmitLdvectmp();
+ context.EmitLdvectmp2();
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesAndXorAdd));
+
+ context.EmitLdc_I4(1);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAndXorAdd));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Add);
+
+ context.Emit(OpCodes.Ldc_I4_1);
+ context.Emit(OpCodes.Shr_Un);
+ });
+ }
+ }
+
+ public static void Uhsub_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size < 2)
+ {
+ Type[] typesAvgSub = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+
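+                //a - Average(a, b) == (a - b) >> 1, which is exactly the unsigned halving subtract.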
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+ context.Emit(OpCodes.Dup);
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Average), typesAvgSub));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesAvgSub));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Sub);
+
+ context.Emit(OpCodes.Ldc_I4_1);
+ context.Emit(OpCodes.Shr_Un);
+ });
+ }
+ }
+
+ public static void Umax_V(ILEmitterCtx context)
+ {
+ Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
+
+ MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
+
+ EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
+ }
+
+ public static void Umaxp_V(ILEmitterCtx context)
+ {
+ Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
+
+ MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
+
+ EmitVectorPairwiseOpZx(context, () => context.EmitCall(mthdInfo));
+ }
+
+ public static void Umin_V(ILEmitterCtx context)
+ {
+ Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
+
+ MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
+
+ EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
+ }
+
+ public static void Uminp_V(ILEmitterCtx context)
+ {
+ Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
+
+ MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
+
+ EmitVectorPairwiseOpZx(context, () => context.EmitCall(mthdInfo));
+ }
+
+ public static void Umlal_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse41 && op.Size < 2)
+ {
+ Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] };
+ Type[] typesMulAdd = new Type[] { VectorIntTypesPerSizeLog2 [op.Size + 1],
+ VectorIntTypesPerSizeLog2 [op.Size + 1] };
+
+ Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+ string nameCvt = op.Size == 0
+ ? nameof(Sse41.ConvertToVector128Int16)
+ : nameof(Sse41.ConvertToVector128Int32);
+
+ int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+ EmitLdvecWithUnsignedCast(context, op.Rd, op.Size + 1);
+
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
+
+ context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesMulAdd));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1);
+ }
+ else
+ {
+ EmitVectorWidenRnRmTernaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Add);
+ });
+ }
+ }
+
+ public static void Umlsl_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse41 && op.Size < 2)
+ {
+ Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] };
+ Type[] typesMulSub = new Type[] { VectorIntTypesPerSizeLog2 [op.Size + 1],
+ VectorIntTypesPerSizeLog2 [op.Size + 1] };
+
+ Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+ string nameCvt = op.Size == 0
+ ? nameof(Sse41.ConvertToVector128Int16)
+ : nameof(Sse41.ConvertToVector128Int32);
+
+ int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+ EmitLdvecWithUnsignedCast(context, op.Rd, op.Size + 1);
+
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));
+
+ context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1);
+ }
+ else
+ {
+ EmitVectorWidenRnRmTernaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Mul);
+ context.Emit(OpCodes.Sub);
+ });
+ }
+ }
+
+ public static void Umull_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
+ }
+
+ public static void Uqadd_S(ILEmitterCtx context)
+ {
+ EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add);
+ }
+
+ public static void Uqadd_V(ILEmitterCtx context)
+ {
+ EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Add);
+ }
+
+ public static void Uqsub_S(ILEmitterCtx context)
+ {
+ EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Sub);
+ }
+
+ public static void Uqsub_V(ILEmitterCtx context)
+ {
+ EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Sub);
+ }
+
+ public static void Uqxtn_S(ILEmitterCtx context)
+ {
+ EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarZxZx);
+ }
+
+ public static void Uqxtn_V(ILEmitterCtx context)
+ {
+ EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorZxZx);
+ }
+
+ public static void Urhadd_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size < 2)
+ {
+ Type[] typesAvg = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+
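+                //Sse2.Average already computes (a + b + 1) >> 1, i.e. the unsigned rounding halving add.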
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Average), typesAvg));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Add);
+
+ context.Emit(OpCodes.Ldc_I4_1);
+ context.Emit(OpCodes.Add);
+
+ context.Emit(OpCodes.Ldc_I4_1);
+ context.Emit(OpCodes.Shr_Un);
+ });
+ }
+ }
+
+ public static void Usqadd_S(ILEmitterCtx context)
+ {
+ EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Accumulate);
+ }
+
+ public static void Usqadd_V(ILEmitterCtx context)
+ {
+ EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Accumulate);
+ }
+
+ public static void Usubl_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse41)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] };
+ Type[] typesSub = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1],
+ VectorUIntTypesPerSizeLog2[op.Size + 1] };
+
+ string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
+ nameof(Sse41.ConvertToVector128Int32),
+ nameof(Sse41.ConvertToVector128Int64) };
+
+ int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitLdc_I4(numBytes);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+ context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1);
+ }
+ else
+ {
+ EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Sub));
+ }
+ }
+
+ public static void Usubw_V(ILEmitterCtx context)
+ {
+ EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Sub));
+ }
+
+ private static void EmitAbs(ILEmitterCtx context)
+ {
+ ILLabel lblTrue = new ILLabel();
+
+ context.Emit(OpCodes.Dup);
+ context.Emit(OpCodes.Ldc_I4_0);
+ context.Emit(OpCodes.Bge_S, lblTrue);
+
+ context.Emit(OpCodes.Neg);
+
+ context.MarkLabel(lblTrue);
+ }
+
+ private static void EmitAddLongPairwise(ILEmitterCtx context, bool signed, bool accumulate)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int words = op.GetBitsCount() >> 4;
+ int pairs = words >> op.Size;
+
+ for (int index = 0; index < pairs; index++)
+ {
+ int idx = index << 1;
+
+ EmitVectorExtract(context, op.Rn, idx, op.Size, signed);
+ EmitVectorExtract(context, op.Rn, idx + 1, op.Size, signed);
+
+ context.Emit(OpCodes.Add);
+
+ if (accumulate)
+ {
+ EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ EmitVectorInsertTmp(context, index, op.Size + 1);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitDoublingMultiplyHighHalf(ILEmitterCtx context, bool round)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int eSize = 8 << op.Size;
+
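+            //Leaves the high half of the doubled product (2 * a * b) on the stack, rounded when requested.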
+ context.Emit(OpCodes.Mul);
+
+ if (!round)
+ {
+ context.EmitAsr(eSize - 1);
+ }
+ else
+ {
+ long roundConst = 1L << (eSize - 1);
+
+ ILLabel lblTrue = new ILLabel();
+
+ context.EmitLsl(1);
+
+ context.EmitLdc_I8(roundConst);
+
+ context.Emit(OpCodes.Add);
+
+ context.EmitAsr(eSize);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitLdc_I8((long)int.MinValue);
+ context.Emit(OpCodes.Bne_Un_S, lblTrue);
+
+ context.Emit(OpCodes.Neg);
+
+ context.MarkLabel(lblTrue);
+ }
+ }
+
+ private static void EmitHighNarrow(ILEmitterCtx context, Action emit, bool round)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int elems = 8 >> op.Size;
+
+ int eSize = 8 << op.Size;
+
+ int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+ long roundConst = 1L << (eSize - 1);
+
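+            //Second-part variants write the upper elements of Rd, so start from its current value.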
+ if (part != 0)
+ {
+ context.EmitLdvec(op.Rd);
+ context.EmitStvectmp();
+ }
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size + 1);
+ EmitVectorExtractZx(context, op.Rm, index, op.Size + 1);
+
+ emit();
+
+ if (round)
+ {
+ context.EmitLdc_I8(roundConst);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ context.EmitLsr(eSize);
+
+ EmitVectorInsertTmp(context, part + index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (part == 0)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdCmp.cs b/ChocolArm64/Instructions/InstEmitSimdCmp.cs
new file mode 100644
index 00000000..c473c0ae
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdCmp.cs
@@ -0,0 +1,526 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
+
+using static ChocolArm64.Instructions.InstEmitAluHelper;
+using static ChocolArm64.Instructions.InstEmitSimdHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Cmeq_S(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Beq_S, scalar: true);
+ }
+
+ public static void Cmeq_V(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdReg64 op)
+ {
+ if (op.Size < 3 && Optimizations.UseSse2)
+ {
+ EmitSse2Op(context, nameof(Sse2.CompareEqual));
+ }
+ else if (op.Size == 3 && Optimizations.UseSse41)
+ {
+ EmitSse41Op(context, nameof(Sse41.CompareEqual));
+ }
+ else
+ {
+ EmitCmp(context, OpCodes.Beq_S, scalar: false);
+ }
+ }
+ else
+ {
+ EmitCmp(context, OpCodes.Beq_S, scalar: false);
+ }
+ }
+
+ public static void Cmge_S(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Bge_S, scalar: true);
+ }
+
+ public static void Cmge_V(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Bge_S, scalar: false);
+ }
+
+ public static void Cmgt_S(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Bgt_S, scalar: true);
+ }
+
+ public static void Cmgt_V(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdReg64 op)
+ {
+ if (op.Size < 3 && Optimizations.UseSse2)
+ {
+ EmitSse2Op(context, nameof(Sse2.CompareGreaterThan));
+ }
+ else if (op.Size == 3 && Optimizations.UseSse42)
+ {
+ EmitSse42Op(context, nameof(Sse42.CompareGreaterThan));
+ }
+ else
+ {
+ EmitCmp(context, OpCodes.Bgt_S, scalar: false);
+ }
+ }
+ else
+ {
+ EmitCmp(context, OpCodes.Bgt_S, scalar: false);
+ }
+ }
+
+ public static void Cmhi_S(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Bgt_Un_S, scalar: true);
+ }
+
+ public static void Cmhi_V(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Bgt_Un_S, scalar: false);
+ }
+
+ public static void Cmhs_S(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Bge_Un_S, scalar: true);
+ }
+
+ public static void Cmhs_V(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Bge_Un_S, scalar: false);
+ }
+
+ public static void Cmle_S(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Ble_S, scalar: true);
+ }
+
+ public static void Cmle_V(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Ble_S, scalar: false);
+ }
+
+ public static void Cmlt_S(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Blt_S, scalar: true);
+ }
+
+ public static void Cmlt_V(ILEmitterCtx context)
+ {
+ EmitCmp(context, OpCodes.Blt_S, scalar: false);
+ }
+
+ public static void Cmtst_S(ILEmitterCtx context)
+ {
+ EmitCmtst(context, scalar: true);
+ }
+
+ public static void Cmtst_V(ILEmitterCtx context)
+ {
+ EmitCmtst(context, scalar: false);
+ }
+
+ public static void Fccmp_S(ILEmitterCtx context)
+ {
+ OpCodeSimdFcond64 op = (OpCodeSimdFcond64)context.CurrOp;
+
+ ILLabel lblTrue = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ context.EmitCondBranch(lblTrue, op.Cond);
+
+ EmitSetNzcv(context, op.Nzcv);
+
+ context.Emit(OpCodes.Br, lblEnd);
+
+ context.MarkLabel(lblTrue);
+
+ Fcmp_S(context);
+
+ context.MarkLabel(lblEnd);
+ }
+
+ public static void Fccmpe_S(ILEmitterCtx context)
+ {
+ Fccmp_S(context);
+ }
+
+ public static void Fcmeq_S(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar));
+ }
+ else
+ {
+ EmitScalarFcmp(context, OpCodes.Beq_S);
+ }
+ }
+
+ public static void Fcmeq_V(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareEqual));
+ }
+ else
+ {
+ EmitVectorFcmp(context, OpCodes.Beq_S);
+ }
+ }
+
+ public static void Fcmge_S(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar));
+ }
+ else
+ {
+ EmitScalarFcmp(context, OpCodes.Bge_S);
+ }
+ }
+
+ public static void Fcmge_V(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual));
+ }
+ else
+ {
+ EmitVectorFcmp(context, OpCodes.Bge_S);
+ }
+ }
+
+ public static void Fcmgt_S(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar));
+ }
+ else
+ {
+ EmitScalarFcmp(context, OpCodes.Bgt_S);
+ }
+ }
+
+ public static void Fcmgt_V(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
+ && Optimizations.UseSse2)
+ {
+ EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan));
+ }
+ else
+ {
+ EmitVectorFcmp(context, OpCodes.Bgt_S);
+ }
+ }
+
+ public static void Fcmle_S(ILEmitterCtx context)
+ {
+ EmitScalarFcmp(context, OpCodes.Ble_S);
+ }
+
+ public static void Fcmle_V(ILEmitterCtx context)
+ {
+ EmitVectorFcmp(context, OpCodes.Ble_S);
+ }
+
+ public static void Fcmlt_S(ILEmitterCtx context)
+ {
+ EmitScalarFcmp(context, OpCodes.Blt_S);
+ }
+
+ public static void Fcmlt_V(ILEmitterCtx context)
+ {
+ EmitVectorFcmp(context, OpCodes.Blt_S);
+ }
+
+ public static void Fcmp_S(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            bool cmpWithZero = !(op is OpCodeSimdFcond64) && op.Bit3;
+
+ //Handle NaN case.
+ //If any number is NaN, then NZCV = 0011.
+ if (cmpWithZero)
+ {
+ EmitNaNCheck(context, op.Rn);
+ }
+ else
+ {
+ EmitNaNCheck(context, op.Rn);
+ EmitNaNCheck(context, op.Rm);
+
+ context.Emit(OpCodes.Or);
+ }
+
+ ILLabel lblNaN = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ context.Emit(OpCodes.Brtrue_S, lblNaN);
+
+ void EmitLoadOpers()
+ {
+ EmitVectorExtractF(context, op.Rn, 0, op.Size);
+
+ if (cmpWithZero)
+ {
+ if (op.Size == 0)
+ {
+ context.EmitLdc_R4(0f);
+ }
+                    else /* if (op.Size == 1) */
+ {
+ context.EmitLdc_R8(0d);
+ }
+ }
+ else
+ {
+ EmitVectorExtractF(context, op.Rm, 0, op.Size);
+ }
+ }
+
+ //Z = Rn == Rm
+ EmitLoadOpers();
+
+ context.Emit(OpCodes.Ceq);
+ context.Emit(OpCodes.Dup);
+
+ context.EmitStflg((int)PState.ZBit);
+
+ //C = Rn >= Rm
+ EmitLoadOpers();
+
+ context.Emit(OpCodes.Cgt);
+ context.Emit(OpCodes.Or);
+
+ context.EmitStflg((int)PState.CBit);
+
+ //N = Rn < Rm
+ EmitLoadOpers();
+
+ context.Emit(OpCodes.Clt);
+
+ context.EmitStflg((int)PState.NBit);
+
+ //V = 0
+ context.EmitLdc_I4(0);
+
+ context.EmitStflg((int)PState.VBit);
+
+ context.Emit(OpCodes.Br_S, lblEnd);
+
+ context.MarkLabel(lblNaN);
+
+ EmitSetNzcv(context, 0b0011);
+
+ context.MarkLabel(lblEnd);
+ }
+
+ public static void Fcmpe_S(ILEmitterCtx context)
+ {
+ Fcmp_S(context);
+ }
+
+ private static void EmitNaNCheck(ILEmitterCtx context, int reg)
+ {
+ IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp;
+
+ EmitVectorExtractF(context, reg, 0, op.Size);
+
+ if (op.Size == 0)
+ {
+ context.EmitCall(typeof(float), nameof(float.IsNaN));
+ }
+ else if (op.Size == 1)
+ {
+ context.EmitCall(typeof(double), nameof(double.IsNaN));
+ }
+ else
+ {
+ throw new InvalidOperationException();
+ }
+ }
+
+ private static void EmitCmp(ILEmitterCtx context, OpCode ilOp, bool scalar)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = !scalar ? bytes >> op.Size : 1;
+
+ ulong szMask = ulong.MaxValue >> (64 - (8 << op.Size));
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractSx(context, op.Rn, index, op.Size);
+
+ if (op is OpCodeSimdReg64 binOp)
+ {
+ EmitVectorExtractSx(context, binOp.Rm, index, op.Size);
+ }
+ else
+ {
+ context.EmitLdc_I8(0L);
+ }
+
+ ILLabel lblTrue = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ context.Emit(ilOp, lblTrue);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size, 0);
+
+ context.Emit(OpCodes.Br_S, lblEnd);
+
+ context.MarkLabel(lblTrue);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size, (long)szMask);
+
+ context.MarkLabel(lblEnd);
+ }
+
+ if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitCmtst(ILEmitterCtx context, bool scalar)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = !scalar ? bytes >> op.Size : 1;
+
+ ulong szMask = ulong.MaxValue >> (64 - (8 << op.Size));
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size);
+ EmitVectorExtractZx(context, op.Rm, index, op.Size);
+
+ ILLabel lblTrue = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ context.Emit(OpCodes.And);
+
+ context.EmitLdc_I8(0L);
+
+ context.Emit(OpCodes.Bne_Un_S, lblTrue);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size, 0);
+
+ context.Emit(OpCodes.Br_S, lblEnd);
+
+ context.MarkLabel(lblTrue);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size, (long)szMask);
+
+ context.MarkLabel(lblEnd);
+ }
+
+ if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitScalarFcmp(ILEmitterCtx context, OpCode ilOp)
+ {
+ EmitFcmp(context, ilOp, 0, scalar: true);
+ }
+
+ private static void EmitVectorFcmp(ILEmitterCtx context, OpCode ilOp)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> (sizeF + 2);
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitFcmp(context, ilOp, index, scalar: false);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitFcmp(ILEmitterCtx context, OpCode ilOp, int index, bool scalar)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ ulong szMask = ulong.MaxValue >> (64 - (32 << sizeF));
+
+ EmitVectorExtractF(context, op.Rn, index, sizeF);
+
+ if (op is OpCodeSimdReg64 binOp)
+ {
+ EmitVectorExtractF(context, binOp.Rm, index, sizeF);
+ }
+ else if (sizeF == 0)
+ {
+ context.EmitLdc_R4(0f);
+ }
+            else /* if (sizeF == 1) */
+ {
+ context.EmitLdc_R8(0d);
+ }
+
+ ILLabel lblTrue = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ context.Emit(ilOp, lblTrue);
+
+ if (scalar)
+ {
+ EmitVectorZeroAll(context, op.Rd);
+ }
+ else
+ {
+ EmitVectorInsert(context, op.Rd, index, sizeF + 2, 0);
+ }
+
+ context.Emit(OpCodes.Br_S, lblEnd);
+
+ context.MarkLabel(lblTrue);
+
+ if (scalar)
+ {
+ EmitVectorInsert(context, op.Rd, index, 3, (long)szMask);
+
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ else
+ {
+ EmitVectorInsert(context, op.Rd, index, sizeF + 2, (long)szMask);
+ }
+
+ context.MarkLabel(lblEnd);
+ }
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdCrypto.cs b/ChocolArm64/Instructions/InstEmitSimdCrypto.cs
new file mode 100644
index 00000000..33c81aab
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdCrypto.cs
@@ -0,0 +1,54 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Translation;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Aesd_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitLdvec(op.Rn);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.Decrypt));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Aese_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitLdvec(op.Rn);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.Encrypt));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Aesimc_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ context.EmitLdvec(op.Rn);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.InverseMixColumns));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Aesmc_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ context.EmitLdvec(op.Rn);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.MixColumns));
+
+ context.EmitStvec(op.Rd);
+ }
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdCvt.cs b/ChocolArm64/Instructions/InstEmitSimdCvt.cs
new file mode 100644
index 00000000..fa17c09d
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdCvt.cs
@@ -0,0 +1,697 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+using static ChocolArm64.Instructions.InstEmitSimdHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Fcvt_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ if (Optimizations.UseSse2)
+ {
+ if (op.Size == 1 && op.Opc == 0)
+ {
+ //Double -> Single.
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+ EmitLdvecWithCastToDouble(context, op.Rn);
+
+ Type[] types = new Type[] { typeof(Vector128<float>), typeof(Vector128<double>) };
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Single), types));
+
+ context.EmitStvec(op.Rd);
+ }
+ else if (op.Size == 0 && op.Opc == 1)
+ {
+ //Single -> Double.
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
+
+ context.EmitLdvec(op.Rn);
+
+ Type[] types = new Type[] { typeof(Vector128<double>), typeof(Vector128<float>) };
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Double), types));
+
+ EmitStvecWithCastFromDouble(context, op.Rd);
+ }
+ else
+ {
+ //Invalid encoding.
+ throw new InvalidOperationException();
+ }
+ }
+ else
+ {
+ EmitVectorExtractF(context, op.Rn, 0, op.Size);
+
+ EmitFloatCast(context, op.Opc);
+
+ EmitScalarSetF(context, op.Rd, op.Opc);
+ }
+ }
+
+ public static void Fcvtas_Gp(ILEmitterCtx context)
+ {
+ EmitFcvt_s_Gp(context, () => EmitRoundMathCall(context, MidpointRounding.AwayFromZero));
+ }
+
+ public static void Fcvtau_Gp(ILEmitterCtx context)
+ {
+ EmitFcvt_u_Gp(context, () => EmitRoundMathCall(context, MidpointRounding.AwayFromZero));
+ }
+
+ public static void Fcvtl_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ int elems = 4 >> sizeF;
+
+ int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+ for (int index = 0; index < elems; index++)
+ {
+ if (sizeF == 0)
+ {
+ EmitVectorExtractZx(context, op.Rn, part + index, 1);
+ context.Emit(OpCodes.Conv_U2);
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ context.EmitCall(typeof(SoftFloat1632), nameof(SoftFloat1632.FPConvert));
+ }
+                else /* if (sizeF == 1) */
+ {
+ EmitVectorExtractF(context, op.Rn, part + index, 0);
+
+ context.Emit(OpCodes.Conv_R8);
+ }
+
+ EmitVectorInsertTmpF(context, index, sizeF);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Fcvtms_Gp(ILEmitterCtx context)
+ {
+ EmitFcvt_s_Gp(context, () => EmitUnaryMathCall(context, nameof(Math.Floor)));
+ }
+
+ public static void Fcvtmu_Gp(ILEmitterCtx context)
+ {
+ EmitFcvt_u_Gp(context, () => EmitUnaryMathCall(context, nameof(Math.Floor)));
+ }
+
+ public static void Fcvtn_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ int elems = 4 >> sizeF;
+
+ int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+ if (part != 0)
+ {
+ context.EmitLdvec(op.Rd);
+ context.EmitStvectmp();
+ }
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractF(context, op.Rn, index, sizeF);
+
+ if (sizeF == 0)
+ {
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ context.EmitCall(typeof(SoftFloat3216), nameof(SoftFloat3216.FPConvert));
+
+ context.Emit(OpCodes.Conv_U8);
+ EmitVectorInsertTmp(context, part + index, 1);
+ }
+                else /* if (sizeF == 1) */
+ {
+ context.Emit(OpCodes.Conv_R4);
+
+ EmitVectorInsertTmpF(context, part + index, 0);
+ }
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (part == 0)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void Fcvtns_S(ILEmitterCtx context)
+ {
+ EmitFcvtn(context, signed: true, scalar: true);
+ }
+
+ public static void Fcvtns_V(ILEmitterCtx context)
+ {
+ EmitFcvtn(context, signed: true, scalar: false);
+ }
+
+ public static void Fcvtnu_S(ILEmitterCtx context)
+ {
+ EmitFcvtn(context, signed: false, scalar: true);
+ }
+
+ public static void Fcvtnu_V(ILEmitterCtx context)
+ {
+ EmitFcvtn(context, signed: false, scalar: false);
+ }
+
+ public static void Fcvtps_Gp(ILEmitterCtx context)
+ {
+ EmitFcvt_s_Gp(context, () => EmitUnaryMathCall(context, nameof(Math.Ceiling)));
+ }
+
+ public static void Fcvtpu_Gp(ILEmitterCtx context)
+ {
+ EmitFcvt_u_Gp(context, () => EmitUnaryMathCall(context, nameof(Math.Ceiling)));
+ }
+
+ public static void Fcvtzs_Gp(ILEmitterCtx context)
+ {
+ EmitFcvt_s_Gp(context, () => { });
+ }
+
+ public static void Fcvtzs_Gp_Fix(ILEmitterCtx context)
+ {
+ EmitFcvtzs_Gp_Fix(context);
+ }
+
+ public static void Fcvtzs_S(ILEmitterCtx context)
+ {
+ EmitScalarFcvtzs(context);
+ }
+
+ public static void Fcvtzs_V(ILEmitterCtx context)
+ {
+ EmitVectorFcvtzs(context);
+ }
+
+ public static void Fcvtzu_Gp(ILEmitterCtx context)
+ {
+ EmitFcvt_u_Gp(context, () => { });
+ }
+
+ public static void Fcvtzu_Gp_Fix(ILEmitterCtx context)
+ {
+ EmitFcvtzu_Gp_Fix(context);
+ }
+
+ public static void Fcvtzu_S(ILEmitterCtx context)
+ {
+ EmitScalarFcvtzu(context);
+ }
+
+ public static void Fcvtzu_V(ILEmitterCtx context)
+ {
+ EmitVectorFcvtzu(context);
+ }
+
+ public static void Scvtf_Gp(ILEmitterCtx context)
+ {
+ OpCodeSimdCvt64 op = (OpCodeSimdCvt64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ if (context.CurrOp.RegisterSize == RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_U4);
+ }
+
+ EmitFloatCast(context, op.Size);
+
+ EmitScalarSetF(context, op.Rd, op.Size);
+ }
+
+ public static void Scvtf_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitVectorExtractSx(context, op.Rn, 0, op.Size + 2);
+
+ EmitFloatCast(context, op.Size);
+
+ EmitScalarSetF(context, op.Rd, op.Size);
+ }
+
+ public static void Scvtf_V(ILEmitterCtx context)
+ {
+ EmitVectorCvtf(context, signed: true);
+ }
+
+ public static void Ucvtf_Gp(ILEmitterCtx context)
+ {
+ OpCodeSimdCvt64 op = (OpCodeSimdCvt64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ if (context.CurrOp.RegisterSize == RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_U4);
+ }
+
+ context.Emit(OpCodes.Conv_R_Un);
+
+ EmitFloatCast(context, op.Size);
+
+ EmitScalarSetF(context, op.Rd, op.Size);
+ }
+
+ public static void Ucvtf_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitVectorExtractZx(context, op.Rn, 0, op.Size + 2);
+
+ context.Emit(OpCodes.Conv_R_Un);
+
+ EmitFloatCast(context, op.Size);
+
+ EmitScalarSetF(context, op.Rd, op.Size);
+ }
+
+ public static void Ucvtf_V(ILEmitterCtx context)
+ {
+ EmitVectorCvtf(context, signed: false);
+ }
+
+ private static int GetFBits(ILEmitterCtx context)
+ {
+ if (context.CurrOp is OpCodeSimdShImm64 op)
+ {
+ return GetImmShr(op);
+ }
+
+ return 0;
+ }
+
+ private static void EmitFloatCast(ILEmitterCtx context, int size)
+ {
+ if (size == 0)
+ {
+ context.Emit(OpCodes.Conv_R4);
+ }
+ else if (size == 1)
+ {
+ context.Emit(OpCodes.Conv_R8);
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+
+ private static void EmitFcvtn(ILEmitterCtx context, bool signed, bool scalar)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+ int sizeI = sizeF + 2;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = !scalar ? bytes >> sizeI : 1;
+
+ if (scalar && (sizeF == 0))
+ {
+ EmitVectorZeroLowerTmp(context);
+ }
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractF(context, op.Rn, index, sizeF);
+
+ EmitRoundMathCall(context, MidpointRounding.ToEven);
+
+ if (sizeF == 0)
+ {
+ VectorHelper.EmitCall(context, signed
+ ? nameof(VectorHelper.SatF32ToS32)
+ : nameof(VectorHelper.SatF32ToU32));
+
+ context.Emit(OpCodes.Conv_U8);
+ }
+                else /* if (sizeF == 1) */
+ {
+ VectorHelper.EmitCall(context, signed
+ ? nameof(VectorHelper.SatF64ToS64)
+ : nameof(VectorHelper.SatF64ToU64));
+ }
+
+ EmitVectorInsertTmp(context, index, sizeI);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitFcvt_s_Gp(ILEmitterCtx context, Action emit)
+ {
+ EmitFcvt___Gp(context, emit, true);
+ }
+
+ private static void EmitFcvt_u_Gp(ILEmitterCtx context, Action emit)
+ {
+ EmitFcvt___Gp(context, emit, false);
+ }
+
+ private static void EmitFcvt___Gp(ILEmitterCtx context, Action emit, bool signed)
+ {
+ OpCodeSimdCvt64 op = (OpCodeSimdCvt64)context.CurrOp;
+
+ EmitVectorExtractF(context, op.Rn, 0, op.Size);
+
+ emit();
+
+ if (signed)
+ {
+ EmitScalarFcvts(context, op.Size, 0);
+ }
+ else
+ {
+ EmitScalarFcvtu(context, op.Size, 0);
+ }
+
+ if (context.CurrOp.RegisterSize == RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_U8);
+ }
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ private static void EmitFcvtzs_Gp_Fix(ILEmitterCtx context)
+ {
+ EmitFcvtz__Gp_Fix(context, true);
+ }
+
+ private static void EmitFcvtzu_Gp_Fix(ILEmitterCtx context)
+ {
+ EmitFcvtz__Gp_Fix(context, false);
+ }
+
+ private static void EmitFcvtz__Gp_Fix(ILEmitterCtx context, bool signed)
+ {
+ OpCodeSimdCvt64 op = (OpCodeSimdCvt64)context.CurrOp;
+
+ EmitVectorExtractF(context, op.Rn, 0, op.Size);
+
+ if (signed)
+ {
+ EmitScalarFcvts(context, op.Size, op.FBits);
+ }
+ else
+ {
+ EmitScalarFcvtu(context, op.Size, op.FBits);
+ }
+
+ if (context.CurrOp.RegisterSize == RegisterSize.Int32)
+ {
+ context.Emit(OpCodes.Conv_U8);
+ }
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ private static void EmitVectorScvtf(ILEmitterCtx context)
+ {
+ EmitVectorCvtf(context, true);
+ }
+
+ private static void EmitVectorUcvtf(ILEmitterCtx context)
+ {
+ EmitVectorCvtf(context, false);
+ }
+
+ private static void EmitVectorCvtf(ILEmitterCtx context, bool signed)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+ int sizeI = sizeF + 2;
+
+ int fBits = GetFBits(context);
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> sizeI;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, index, sizeI, signed);
+
+ if (!signed)
+ {
+ context.Emit(OpCodes.Conv_R_Un);
+ }
+
+ context.Emit(sizeF == 0
+ ? OpCodes.Conv_R4
+ : OpCodes.Conv_R8);
+
+ EmitI2fFBitsMul(context, sizeF, fBits);
+
+ EmitVectorInsertF(context, op.Rd, index, sizeF);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitScalarFcvtzs(ILEmitterCtx context)
+ {
+ EmitScalarFcvtz(context, true);
+ }
+
+ private static void EmitScalarFcvtzu(ILEmitterCtx context)
+ {
+ EmitScalarFcvtz(context, false);
+ }
+
+ private static void EmitScalarFcvtz(ILEmitterCtx context, bool signed)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+ int sizeI = sizeF + 2;
+
+ int fBits = GetFBits(context);
+
+ EmitVectorExtractF(context, op.Rn, 0, sizeF);
+
+ EmitF2iFBitsMul(context, sizeF, fBits);
+
+ if (sizeF == 0)
+ {
+ VectorHelper.EmitCall(context, signed
+ ? nameof(VectorHelper.SatF32ToS32)
+ : nameof(VectorHelper.SatF32ToU32));
+ }
+            else /* if (sizeF == 1) */
+ {
+ VectorHelper.EmitCall(context, signed
+ ? nameof(VectorHelper.SatF64ToS64)
+ : nameof(VectorHelper.SatF64ToU64));
+ }
+
+ if (sizeF == 0)
+ {
+ context.Emit(OpCodes.Conv_U8);
+ }
+
+ EmitScalarSet(context, op.Rd, sizeI);
+ }
+
+ private static void EmitVectorFcvtzs(ILEmitterCtx context)
+ {
+ EmitVectorFcvtz(context, true);
+ }
+
+ private static void EmitVectorFcvtzu(ILEmitterCtx context)
+ {
+ EmitVectorFcvtz(context, false);
+ }
+
+ private static void EmitVectorFcvtz(ILEmitterCtx context, bool signed)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+ int sizeI = sizeF + 2;
+
+ int fBits = GetFBits(context);
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> sizeI;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractF(context, op.Rn, index, sizeF);
+
+ EmitF2iFBitsMul(context, sizeF, fBits);
+
+ if (sizeF == 0)
+ {
+ VectorHelper.EmitCall(context, signed
+ ? nameof(VectorHelper.SatF32ToS32)
+ : nameof(VectorHelper.SatF32ToU32));
+ }
+                else /* if (sizeF == 1) */
+ {
+ VectorHelper.EmitCall(context, signed
+ ? nameof(VectorHelper.SatF64ToS64)
+ : nameof(VectorHelper.SatF64ToU64));
+ }
+
+ if (sizeF == 0)
+ {
+ context.Emit(OpCodes.Conv_U8);
+ }
+
+ EmitVectorInsert(context, op.Rd, index, sizeI);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitScalarFcvts(ILEmitterCtx context, int size, int fBits)
+ {
+ if (size < 0 || size > 1)
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ EmitF2iFBitsMul(context, size, fBits);
+
+ if (context.CurrOp.RegisterSize == RegisterSize.Int32)
+ {
+ if (size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.SatF32ToS32));
+ }
+                else /* if (size == 1) */
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.SatF64ToS32));
+ }
+ }
+ else
+ {
+ if (size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.SatF32ToS64));
+ }
+                else /* if (size == 1) */
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.SatF64ToS64));
+ }
+ }
+ }
+
+ private static void EmitScalarFcvtu(ILEmitterCtx context, int size, int fBits)
+ {
+ if (size < 0 || size > 1)
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ EmitF2iFBitsMul(context, size, fBits);
+
+ if (context.CurrOp.RegisterSize == RegisterSize.Int32)
+ {
+ if (size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.SatF32ToU32));
+ }
+                else /* if (size == 1) */
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.SatF64ToU32));
+ }
+ }
+ else
+ {
+ if (size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.SatF32ToU64));
+ }
+                else /* if (size == 1) */
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.SatF64ToU64));
+ }
+ }
+ }
+
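+        //Fixed-point conversions scale by 2^fBits: EmitF2iFBitsMul multiplies before the float -> int
+        //conversion, EmitI2fFBitsMul multiplies by the reciprocal after the int -> float conversion.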
+ private static void EmitF2iFBitsMul(ILEmitterCtx context, int size, int fBits)
+ {
+ if (fBits != 0)
+ {
+ if (size == 0)
+ {
+ context.EmitLdc_R4(MathF.Pow(2f, fBits));
+ }
+ else if (size == 1)
+ {
+ context.EmitLdc_R8(Math.Pow(2d, fBits));
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ context.Emit(OpCodes.Mul);
+ }
+ }
+
+ private static void EmitI2fFBitsMul(ILEmitterCtx context, int size, int fBits)
+ {
+ if (fBits != 0)
+ {
+ if (size == 0)
+ {
+ context.EmitLdc_R4(1f / MathF.Pow(2f, fBits));
+ }
+ else if (size == 1)
+ {
+ context.EmitLdc_R8(1d / Math.Pow(2d, fBits));
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ context.Emit(OpCodes.Mul);
+ }
+ }
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdHash.cs b/ChocolArm64/Instructions/InstEmitSimdHash.cs
new file mode 100644
index 00000000..bb767fec
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdHash.cs
@@ -0,0 +1,140 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Translation;
+
+using static ChocolArm64.Instructions.InstEmitSimdHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+#region "Sha1"
+ public static void Sha1c_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ EmitVectorExtractZx(context, op.Rn, 0, 2);
+ context.EmitLdvec(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.HashChoose));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Sha1h_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitVectorExtractZx(context, op.Rn, 0, 2);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.FixedRotate));
+
+ EmitScalarSet(context, op.Rd, 2);
+ }
+
+ public static void Sha1m_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ EmitVectorExtractZx(context, op.Rn, 0, 2);
+ context.EmitLdvec(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.HashMajority));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Sha1p_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ EmitVectorExtractZx(context, op.Rn, 0, 2);
+ context.EmitLdvec(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.HashParity));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Sha1su0_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.Sha1SchedulePart1));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Sha1su1_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitLdvec(op.Rn);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.Sha1SchedulePart2));
+
+ context.EmitStvec(op.Rd);
+ }
+#endregion
+
+#region "Sha256"
+ public static void Sha256h_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.HashLower));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Sha256h2_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.HashUpper));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Sha256su0_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitLdvec(op.Rn);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.Sha256SchedulePart1));
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Sha256su1_V(ILEmitterCtx context)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitLdvec(op.Rn);
+ context.EmitLdvec(op.Rm);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.Sha256SchedulePart2));
+
+ context.EmitStvec(op.Rd);
+ }
+#endregion
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdHelper.cs b/ChocolArm64/Instructions/InstEmitSimdHelper.cs
new file mode 100644
index 00000000..fad51510
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdHelper.cs
@@ -0,0 +1,1495 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace ChocolArm64.Instructions
+{
+ static class InstEmitSimdHelper
+ {
+ public static readonly Type[] IntTypesPerSizeLog2 = new Type[]
+ {
+ typeof(sbyte),
+ typeof(short),
+ typeof(int),
+ typeof(long)
+ };
+
+ public static readonly Type[] UIntTypesPerSizeLog2 = new Type[]
+ {
+ typeof(byte),
+ typeof(ushort),
+ typeof(uint),
+ typeof(ulong)
+ };
+
+ public static readonly Type[] VectorIntTypesPerSizeLog2 = new Type[]
+ {
+ typeof(Vector128<sbyte>),
+ typeof(Vector128<short>),
+ typeof(Vector128<int>),
+ typeof(Vector128<long>)
+ };
+
+ public static readonly Type[] VectorUIntTypesPerSizeLog2 = new Type[]
+ {
+ typeof(Vector128<byte>),
+ typeof(Vector128<ushort>),
+ typeof(Vector128<uint>),
+ typeof(Vector128<ulong>)
+ };
+
+ [Flags]
+ public enum OperFlags
+ {
+ Rd = 1 << 0,
+ Rn = 1 << 1,
+ Rm = 1 << 2,
+ Ra = 1 << 3,
+
+ RnRm = Rn | Rm,
+ RdRn = Rd | Rn,
+ RaRnRm = Ra | Rn | Rm,
+ RdRnRm = Rd | Rn | Rm
+ }
+
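+        //Shift immediates are encoded relative to the element size:
+        //left shifts as imm - esize, right shifts as 2 * esize - imm.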
+ public static int GetImmShl(OpCodeSimdShImm64 op)
+ {
+ return op.Imm - (8 << op.Size);
+ }
+
+ public static int GetImmShr(OpCodeSimdShImm64 op)
+ {
+ return (8 << (op.Size + 1)) - op.Imm;
+ }
+
+ public static void EmitSse2Op(ILEmitterCtx context, string name)
+ {
+ EmitSseOp(context, name, typeof(Sse2));
+ }
+
+ public static void EmitSse41Op(ILEmitterCtx context, string name)
+ {
+ EmitSseOp(context, name, typeof(Sse41));
+ }
+
+ public static void EmitSse42Op(ILEmitterCtx context, string name)
+ {
+ EmitSseOp(context, name, typeof(Sse42));
+ }
+
+ private static void EmitSseOp(ILEmitterCtx context, string name, Type type)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ Type baseType = VectorIntTypesPerSizeLog2[op.Size];
+
+ if (op is OpCodeSimdReg64 binOp)
+ {
+ EmitLdvecWithSignedCast(context, binOp.Rm, op.Size);
+
+ context.EmitCall(type.GetMethod(name, new Type[] { baseType, baseType }));
+ }
+ else
+ {
+ context.EmitCall(type.GetMethod(name, new Type[] { baseType }));
+ }
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
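+        //Vector registers are stored as Vector128<float>; these helpers reinterpret them to and from
+        //the element types expected by the SSE intrinsics.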
+ public static void EmitLdvecWithSignedCast(ILEmitterCtx context, int reg, int size)
+ {
+ context.EmitLdvec(reg);
+
+ switch (size)
+ {
+ case 0: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToSByte)); break;
+ case 1: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToInt16)); break;
+ case 2: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToInt32)); break;
+ case 3: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToInt64)); break;
+
+ default: throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+
+ public static void EmitLdvecWithCastToDouble(ILEmitterCtx context, int reg)
+ {
+ context.EmitLdvec(reg);
+
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToDouble));
+ }
+
+ public static void EmitStvecWithCastFromDouble(ILEmitterCtx context, int reg)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleToSingle));
+
+ context.EmitStvec(reg);
+ }
+
+ public static void EmitLdvecWithUnsignedCast(ILEmitterCtx context, int reg, int size)
+ {
+ context.EmitLdvec(reg);
+
+ switch (size)
+ {
+ case 0: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToByte)); break;
+ case 1: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToUInt16)); break;
+ case 2: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToUInt32)); break;
+ case 3: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToUInt64)); break;
+
+ default: throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+
+ public static void EmitStvecWithSignedCast(ILEmitterCtx context, int reg, int size)
+ {
+ switch (size)
+ {
+ case 0: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSByteToSingle)); break;
+ case 1: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt16ToSingle)); break;
+ case 2: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt32ToSingle)); break;
+ case 3: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64ToSingle)); break;
+
+ default: throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ context.EmitStvec(reg);
+ }
+
+ public static void EmitStvecWithUnsignedCast(ILEmitterCtx context, int reg, int size)
+ {
+ switch (size)
+ {
+ case 0: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorByteToSingle)); break;
+ case 1: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorUInt16ToSingle)); break;
+ case 2: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorUInt32ToSingle)); break;
+ case 3: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorUInt64ToSingle)); break;
+
+ default: throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ context.EmitStvec(reg);
+ }
+
+ public static void EmitScalarSseOrSse2OpF(ILEmitterCtx context, string name)
+ {
+ EmitSseOrSse2OpF(context, name, true);
+ }
+
+ public static void EmitVectorSseOrSse2OpF(ILEmitterCtx context, string name)
+ {
+ EmitSseOrSse2OpF(context, name, false);
+ }
+
+ public static void EmitSseOrSse2OpF(ILEmitterCtx context, string name, bool scalar)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ void Ldvec(int reg)
+ {
+ context.EmitLdvec(reg);
+
+ if (sizeF == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleToDouble));
+ }
+ }
+
+ Ldvec(op.Rn);
+
+ Type type;
+ Type baseType;
+
+ if (sizeF == 0)
+ {
+ type = typeof(Sse);
+ baseType = typeof(Vector128<float>);
+ }
+            else /* if (sizeF == 1) */
+ {
+ type = typeof(Sse2);
+ baseType = typeof(Vector128<double>);
+ }
+
+ if (op is OpCodeSimdReg64 binOp)
+ {
+ Ldvec(binOp.Rm);
+
+ context.EmitCall(type.GetMethod(name, new Type[] { baseType, baseType }));
+ }
+ else
+ {
+ context.EmitCall(type.GetMethod(name, new Type[] { baseType }));
+ }
+
+ if (sizeF == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleToSingle));
+ }
+
+ context.EmitStvec(op.Rd);
+
+ if (scalar)
+ {
+ if (sizeF == 0)
+ {
+ EmitVectorZero32_128(context, op.Rd);
+ }
+                else /* if (sizeF == 1) */
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void EmitUnaryMathCall(ILEmitterCtx context, string name)
+ {
+ IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ MethodInfo mthdInfo;
+
+ if (sizeF == 0)
+ {
+ mthdInfo = typeof(MathF).GetMethod(name, new Type[] { typeof(float) });
+ }
+            else /* if (sizeF == 1) */
+ {
+ mthdInfo = typeof(Math).GetMethod(name, new Type[] { typeof(double) });
+ }
+
+ context.EmitCall(mthdInfo);
+ }
+
+ public static void EmitBinaryMathCall(ILEmitterCtx context, string name)
+ {
+ IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ MethodInfo mthdInfo;
+
+ if (sizeF == 0)
+ {
+ mthdInfo = typeof(MathF).GetMethod(name, new Type[] { typeof(float), typeof(float) });
+ }
+            else /* if (sizeF == 1) */
+ {
+ mthdInfo = typeof(Math).GetMethod(name, new Type[] { typeof(double), typeof(double) });
+ }
+
+ context.EmitCall(mthdInfo);
+ }
+
+ public static void EmitRoundMathCall(ILEmitterCtx context, MidpointRounding roundMode)
+ {
+ IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ MethodInfo mthdInfo;
+
+ if (sizeF == 0)
+ {
+ mthdInfo = typeof(MathF).GetMethod(nameof(MathF.Round), new Type[] { typeof(float), typeof(MidpointRounding) });
+ }
+            else /* if (sizeF == 1) */
+ {
+ mthdInfo = typeof(Math).GetMethod(nameof(Math.Round), new Type[] { typeof(double), typeof(MidpointRounding) });
+ }
+
+ context.EmitLdc_I4((int)roundMode);
+
+ context.EmitCall(mthdInfo);
+ }
+
+ public static void EmitUnarySoftFloatCall(ILEmitterCtx context, string name)
+ {
+ IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ MethodInfo mthdInfo;
+
+ if (sizeF == 0)
+ {
+ mthdInfo = typeof(SoftFloat).GetMethod(name, new Type[] { typeof(float) });
+ }
+            else /* if (sizeF == 1) */
+ {
+ mthdInfo = typeof(SoftFloat).GetMethod(name, new Type[] { typeof(double) });
+ }
+
+ context.EmitCall(mthdInfo);
+ }
+
+ public static void EmitSoftFloatCall(ILEmitterCtx context, string name)
+ {
+ IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp;
+
+ Type type = (op.Size & 1) == 0
+ ? typeof(SoftFloat32)
+ : typeof(SoftFloat64);
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ context.EmitCall(type, name);
+ }
+
+ public static void EmitScalarBinaryOpByElemF(ILEmitterCtx context, Action emit)
+ {
+ OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
+
+ EmitScalarOpByElemF(context, emit, op.Index, ternary: false);
+ }
+
+ public static void EmitScalarTernaryOpByElemF(ILEmitterCtx context, Action emit)
+ {
+ OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
+
+ EmitScalarOpByElemF(context, emit, op.Index, ternary: true);
+ }
+
+ public static void EmitScalarOpByElemF(ILEmitterCtx context, Action emit, int elem, bool ternary)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ if (ternary)
+ {
+ EmitVectorExtractF(context, op.Rd, 0, sizeF);
+ }
+
+ EmitVectorExtractF(context, op.Rn, 0, sizeF);
+ EmitVectorExtractF(context, op.Rm, elem, sizeF);
+
+ emit();
+
+ EmitScalarSetF(context, op.Rd, sizeF);
+ }
+
+ public static void EmitScalarUnaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitScalarOp(context, emit, OperFlags.Rn, true);
+ }
+
+ public static void EmitScalarBinaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitScalarOp(context, emit, OperFlags.RnRm, true);
+ }
+
+ public static void EmitScalarUnaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitScalarOp(context, emit, OperFlags.Rn, false);
+ }
+
+ public static void EmitScalarBinaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitScalarOp(context, emit, OperFlags.RnRm, false);
+ }
+
+ public static void EmitScalarTernaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitScalarOp(context, emit, OperFlags.RdRnRm, false);
+ }
+
+ public static void EmitScalarOp(ILEmitterCtx context, Action emit, OperFlags opers, bool signed)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ bool rd = (opers & OperFlags.Rd) != 0;
+ bool rn = (opers & OperFlags.Rn) != 0;
+ bool rm = (opers & OperFlags.Rm) != 0;
+
+ if (rd)
+ {
+ EmitVectorExtract(context, op.Rd, 0, op.Size, signed);
+ }
+
+ if (rn)
+ {
+ EmitVectorExtract(context, op.Rn, 0, op.Size, signed);
+ }
+
+ if (rm)
+ {
+ EmitVectorExtract(context, ((OpCodeSimdReg64)op).Rm, 0, op.Size, signed);
+ }
+
+ emit();
+
+ EmitScalarSet(context, op.Rd, op.Size);
+ }
+
+ public static void EmitScalarUnaryOpF(ILEmitterCtx context, Action emit)
+ {
+ EmitScalarOpF(context, emit, OperFlags.Rn);
+ }
+
+ public static void EmitScalarBinaryOpF(ILEmitterCtx context, Action emit)
+ {
+ EmitScalarOpF(context, emit, OperFlags.RnRm);
+ }
+
+ public static void EmitScalarTernaryRaOpF(ILEmitterCtx context, Action emit)
+ {
+ EmitScalarOpF(context, emit, OperFlags.RaRnRm);
+ }
+
+ public static void EmitScalarOpF(ILEmitterCtx context, Action emit, OperFlags opers)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ bool ra = (opers & OperFlags.Ra) != 0;
+ bool rn = (opers & OperFlags.Rn) != 0;
+ bool rm = (opers & OperFlags.Rm) != 0;
+
+ if (ra)
+ {
+ EmitVectorExtractF(context, ((OpCodeSimdReg64)op).Ra, 0, sizeF);
+ }
+
+ if (rn)
+ {
+ EmitVectorExtractF(context, op.Rn, 0, sizeF);
+ }
+
+ if (rm)
+ {
+ EmitVectorExtractF(context, ((OpCodeSimdReg64)op).Rm, 0, sizeF);
+ }
+
+ emit();
+
+ EmitScalarSetF(context, op.Rd, sizeF);
+ }
+
+ public static void EmitVectorUnaryOpF(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOpF(context, emit, OperFlags.Rn);
+ }
+
+ public static void EmitVectorBinaryOpF(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOpF(context, emit, OperFlags.RnRm);
+ }
+
+ public static void EmitVectorTernaryOpF(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOpF(context, emit, OperFlags.RdRnRm);
+ }
+
+ public static void EmitVectorOpF(ILEmitterCtx context, Action emit, OperFlags opers)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> sizeF + 2;
+
+ bool rd = (opers & OperFlags.Rd) != 0;
+ bool rn = (opers & OperFlags.Rn) != 0;
+ bool rm = (opers & OperFlags.Rm) != 0;
+
+ for (int index = 0; index < elems; index++)
+ {
+ if (rd)
+ {
+ EmitVectorExtractF(context, op.Rd, index, sizeF);
+ }
+
+ if (rn)
+ {
+ EmitVectorExtractF(context, op.Rn, index, sizeF);
+ }
+
+ if (rm)
+ {
+ EmitVectorExtractF(context, ((OpCodeSimdReg64)op).Rm, index, sizeF);
+ }
+
+ emit();
+
+ EmitVectorInsertF(context, op.Rd, index, sizeF);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void EmitVectorBinaryOpByElemF(ILEmitterCtx context, Action emit)
+ {
+ OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
+
+ EmitVectorOpByElemF(context, emit, op.Index, ternary: false);
+ }
+
+ public static void EmitVectorTernaryOpByElemF(ILEmitterCtx context, Action emit)
+ {
+ OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
+
+ EmitVectorOpByElemF(context, emit, op.Index, ternary: true);
+ }
+
+ public static void EmitVectorOpByElemF(ILEmitterCtx context, Action emit, int elem, bool ternary)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> sizeF + 2;
+
+ for (int index = 0; index < elems; index++)
+ {
+ if (ternary)
+ {
+ EmitVectorExtractF(context, op.Rd, index, sizeF);
+ }
+
+ EmitVectorExtractF(context, op.Rn, index, sizeF);
+ EmitVectorExtractF(context, op.Rm, elem, sizeF);
+
+ emit();
+
+ EmitVectorInsertTmpF(context, index, sizeF);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void EmitVectorUnaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOp(context, emit, OperFlags.Rn, true);
+ }
+
+ public static void EmitVectorBinaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOp(context, emit, OperFlags.RnRm, true);
+ }
+
+ public static void EmitVectorTernaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOp(context, emit, OperFlags.RdRnRm, true);
+ }
+
+ public static void EmitVectorUnaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOp(context, emit, OperFlags.Rn, false);
+ }
+
+ public static void EmitVectorBinaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOp(context, emit, OperFlags.RnRm, false);
+ }
+
+ public static void EmitVectorTernaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorOp(context, emit, OperFlags.RdRnRm, false);
+ }
+
+ public static void EmitVectorOp(ILEmitterCtx context, Action emit, OperFlags opers, bool signed)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ bool rd = (opers & OperFlags.Rd) != 0;
+ bool rn = (opers & OperFlags.Rn) != 0;
+ bool rm = (opers & OperFlags.Rm) != 0;
+
+ for (int index = 0; index < elems; index++)
+ {
+ if (rd)
+ {
+ EmitVectorExtract(context, op.Rd, index, op.Size, signed);
+ }
+
+ if (rn)
+ {
+ EmitVectorExtract(context, op.Rn, index, op.Size, signed);
+ }
+
+ if (rm)
+ {
+ EmitVectorExtract(context, ((OpCodeSimdReg64)op).Rm, index, op.Size, signed);
+ }
+
+ emit();
+
+ EmitVectorInsert(context, op.Rd, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void EmitVectorBinaryOpByElemSx(ILEmitterCtx context, Action emit)
+ {
+ OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+ EmitVectorOpByElem(context, emit, op.Index, false, true);
+ }
+
+ public static void EmitVectorBinaryOpByElemZx(ILEmitterCtx context, Action emit)
+ {
+ OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+ EmitVectorOpByElem(context, emit, op.Index, false, false);
+ }
+
+ public static void EmitVectorTernaryOpByElemZx(ILEmitterCtx context, Action emit)
+ {
+ OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
+
+ EmitVectorOpByElem(context, emit, op.Index, true, false);
+ }
+
+ public static void EmitVectorOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ EmitVectorExtract(context, op.Rm, elem, op.Size, signed);
+ context.EmitSttmp();
+
+ for (int index = 0; index < elems; index++)
+ {
+ if (ternary)
+ {
+ EmitVectorExtract(context, op.Rd, index, op.Size, signed);
+ }
+
+ EmitVectorExtract(context, op.Rn, index, op.Size, signed);
+ context.EmitLdtmp();
+
+ emit();
+
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void EmitVectorImmUnaryOp(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorImmOp(context, emit, false);
+ }
+
+ public static void EmitVectorImmBinaryOp(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorImmOp(context, emit, true);
+ }
+
+ public static void EmitVectorImmOp(ILEmitterCtx context, Action emit, bool binary)
+ {
+ OpCodeSimdImm64 op = (OpCodeSimdImm64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ for (int index = 0; index < elems; index++)
+ {
+ if (binary)
+ {
+ EmitVectorExtractZx(context, op.Rd, index, op.Size);
+ }
+
+ context.EmitLdc_I8(op.Imm);
+
+ emit();
+
+ EmitVectorInsert(context, op.Rd, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void EmitVectorWidenRmBinaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorWidenRmBinaryOp(context, emit, true);
+ }
+
+ public static void EmitVectorWidenRmBinaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorWidenRmBinaryOp(context, emit, false);
+ }
+
+ public static void EmitVectorWidenRmBinaryOp(ILEmitterCtx context, Action emit, bool signed)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int elems = 8 >> op.Size;
+
+ int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, index, op.Size + 1, signed);
+ EmitVectorExtract(context, op.Rm, part + index, op.Size, signed);
+
+ emit();
+
+ EmitVectorInsertTmp(context, index, op.Size + 1);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void EmitVectorWidenRnRmBinaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorWidenRnRmOp(context, emit, false, true);
+ }
+
+ public static void EmitVectorWidenRnRmBinaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorWidenRnRmOp(context, emit, false, false);
+ }
+
+ public static void EmitVectorWidenRnRmTernaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorWidenRnRmOp(context, emit, true, true);
+ }
+
+ public static void EmitVectorWidenRnRmTernaryOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorWidenRnRmOp(context, emit, true, false);
+ }
+
+ public static void EmitVectorWidenRnRmOp(ILEmitterCtx context, Action emit, bool ternary, bool signed)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int elems = 8 >> op.Size;
+
+ int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+ for (int index = 0; index < elems; index++)
+ {
+ if (ternary)
+ {
+ EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed);
+ }
+
+ EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);
+ EmitVectorExtract(context, op.Rm, part + index, op.Size, signed);
+
+ emit();
+
+ EmitVectorInsertTmp(context, index, op.Size + 1);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+ }
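// Illustrative sketch, not part of this patch: the widening "long" forms read one
// half of the narrow source registers and produce double-width results; the
// 128-bit encodings (the "2" variants) take the upper half, which is what 'part'
// selects in the loop above. Shown here as a plain widening add over arrays.
static int[] WidenAddSketch(short[] rn, short[] rm, bool upperHalf)
{
    int elems = rn.Length / 2;
    int part  = upperHalf ? elems : 0;

    int[] rd = new int[elems];

    for (int index = 0; index < elems; index++)
    {
        rd[index] = rn[part + index] + rm[part + index]; // shorts widen to int
    }

    return rd;
}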
+
+ public static void EmitVectorPairwiseOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorPairwiseOp(context, emit, true);
+ }
+
+ public static void EmitVectorPairwiseOpZx(ILEmitterCtx context, Action emit)
+ {
+ EmitVectorPairwiseOp(context, emit, false);
+ }
+
+ public static void EmitVectorPairwiseOp(ILEmitterCtx context, Action emit, bool signed)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int words = op.GetBitsCount() >> 4;
+ int pairs = words >> op.Size;
+
+ for (int index = 0; index < pairs; index++)
+ {
+ int idx = index << 1;
+
+ EmitVectorExtract(context, op.Rn, idx, op.Size, signed);
+ EmitVectorExtract(context, op.Rn, idx + 1, op.Size, signed);
+
+ emit();
+
+ EmitVectorExtract(context, op.Rm, idx, op.Size, signed);
+ EmitVectorExtract(context, op.Rm, idx + 1, op.Size, signed);
+
+ emit();
+
+ EmitVectorInsertTmp(context, pairs + index, op.Size);
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
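// Illustrative sketch, not part of this patch: pairwise ops reduce adjacent
// element pairs, with the Rn results filling the lower half of Rd and the Rm
// results filling the upper half, matching the InsertTmp indices used above.
// Shown here as a pairwise add over arrays.
static long[] PairwiseAddSketch(long[] rn, long[] rm)
{
    int pairs = rn.Length / 2;

    long[] rd = new long[rn.Length];

    for (int index = 0; index < pairs; index++)
    {
        rd[index]         = rn[2 * index] + rn[2 * index + 1];
        rd[pairs + index] = rm[2 * index] + rm[2 * index + 1];
    }

    return rd;
}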
+
+ public static void EmitVectorPairwiseOpF(ILEmitterCtx context, Action emit)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int sizeF = op.Size & 1;
+
+ int words = op.GetBitsCount() >> 4;
+ int pairs = words >> sizeF + 2;
+
+ for (int index = 0; index < pairs; index++)
+ {
+ int idx = index << 1;
+
+ EmitVectorExtractF(context, op.Rn, idx, sizeF);
+ EmitVectorExtractF(context, op.Rn, idx + 1, sizeF);
+
+ emit();
+
+ EmitVectorExtractF(context, op.Rm, idx, sizeF);
+ EmitVectorExtractF(context, op.Rm, idx + 1, sizeF);
+
+ emit();
+
+ EmitVectorInsertTmpF(context, pairs + index, sizeF);
+ EmitVectorInsertTmpF(context, index, sizeF);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ [Flags]
+ public enum SaturatingFlags
+ {
+ Scalar = 1 << 0,
+ Signed = 1 << 1,
+
+ Add = 1 << 2,
+ Sub = 1 << 3,
+
+ Accumulate = 1 << 4,
+
+ ScalarSx = Scalar | Signed,
+ ScalarZx = Scalar,
+
+ VectorSx = Signed,
+ VectorZx = 0
+ }
+
+ public static void EmitScalarSaturatingUnaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitSaturatingUnaryOpSx(context, emit, SaturatingFlags.ScalarSx);
+ }
+
+ public static void EmitVectorSaturatingUnaryOpSx(ILEmitterCtx context, Action emit)
+ {
+ EmitSaturatingUnaryOpSx(context, emit, SaturatingFlags.VectorSx);
+ }
+
+ public static void EmitSaturatingUnaryOpSx(ILEmitterCtx context, Action emit, SaturatingFlags flags)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ bool scalar = (flags & SaturatingFlags.Scalar) != 0;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = !scalar ? bytes >> op.Size : 1;
+
+ if (scalar)
+ {
+ EmitVectorZeroLowerTmp(context);
+ }
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractSx(context, op.Rn, index, op.Size);
+
+ emit();
+
+ if (op.Size <= 2)
+ {
+ EmitSatQ(context, op.Size, true, true);
+ }
+                else /* if (op.Size == 3) */
+ {
+ EmitUnarySignedSatQAbsOrNeg(context);
+ }
+
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void EmitScalarSaturatingBinaryOpSx(ILEmitterCtx context, SaturatingFlags flags)
+ {
+ EmitSaturatingBinaryOp(context, () => { }, SaturatingFlags.ScalarSx | flags);
+ }
+
+ public static void EmitScalarSaturatingBinaryOpZx(ILEmitterCtx context, SaturatingFlags flags)
+ {
+ EmitSaturatingBinaryOp(context, () => { }, SaturatingFlags.ScalarZx | flags);
+ }
+
+ public static void EmitVectorSaturatingBinaryOpSx(ILEmitterCtx context, SaturatingFlags flags)
+ {
+ EmitSaturatingBinaryOp(context, () => { }, SaturatingFlags.VectorSx | flags);
+ }
+
+ public static void EmitVectorSaturatingBinaryOpZx(ILEmitterCtx context, SaturatingFlags flags)
+ {
+ EmitSaturatingBinaryOp(context, () => { }, SaturatingFlags.VectorZx | flags);
+ }
+
+ public static void EmitSaturatingBinaryOp(ILEmitterCtx context, Action emit, SaturatingFlags flags)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ bool scalar = (flags & SaturatingFlags.Scalar) != 0;
+ bool signed = (flags & SaturatingFlags.Signed) != 0;
+
+ bool add = (flags & SaturatingFlags.Add) != 0;
+ bool sub = (flags & SaturatingFlags.Sub) != 0;
+
+ bool accumulate = (flags & SaturatingFlags.Accumulate) != 0;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = !scalar ? bytes >> op.Size : 1;
+
+ if (scalar)
+ {
+ EmitVectorZeroLowerTmp(context);
+ }
+
+ if (add || sub)
+ {
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, index, op.Size, signed);
+ EmitVectorExtract(context, ((OpCodeSimdReg64)op).Rm, index, op.Size, signed);
+
+ if (op.Size <= 2)
+ {
+ context.Emit(add ? OpCodes.Add : OpCodes.Sub);
+
+ EmitSatQ(context, op.Size, true, signed);
+ }
+                    else /* if (op.Size == 3) */
+ {
+ if (add)
+ {
+ EmitBinarySatQAdd(context, signed);
+ }
+                        else /* if (sub) */
+ {
+ EmitBinarySatQSub(context, signed);
+ }
+ }
+
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+ }
+ else if (accumulate)
+ {
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, index, op.Size, !signed);
+ EmitVectorExtract(context, op.Rd, index, op.Size, signed);
+
+ if (op.Size <= 2)
+ {
+ context.Emit(OpCodes.Add);
+
+ EmitSatQ(context, op.Size, true, signed);
+ }
+                    else /* if (op.Size == 3) */
+ {
+ EmitBinarySatQAccumulate(context, signed);
+ }
+
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+ }
+ else
+ {
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, index, op.Size, signed);
+ EmitVectorExtract(context, ((OpCodeSimdReg64)op).Rm, index, op.Size, signed);
+
+ emit();
+
+ EmitSatQ(context, op.Size, true, signed);
+
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ [Flags]
+ public enum SaturatingNarrowFlags
+ {
+ Scalar = 1 << 0,
+ SignedSrc = 1 << 1,
+ SignedDst = 1 << 2,
+
+ ScalarSxSx = Scalar | SignedSrc | SignedDst,
+ ScalarSxZx = Scalar | SignedSrc,
+ ScalarZxZx = Scalar,
+
+ VectorSxSx = SignedSrc | SignedDst,
+ VectorSxZx = SignedSrc,
+ VectorZxZx = 0
+ }
+
+ public static void EmitSaturatingNarrowOp(ILEmitterCtx context, SaturatingNarrowFlags flags)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ bool scalar = (flags & SaturatingNarrowFlags.Scalar) != 0;
+ bool signedSrc = (flags & SaturatingNarrowFlags.SignedSrc) != 0;
+ bool signedDst = (flags & SaturatingNarrowFlags.SignedDst) != 0;
+
+ int elems = !scalar ? 8 >> op.Size : 1;
+
+ int part = !scalar && (op.RegisterSize == RegisterSize.Simd128) ? elems : 0;
+
+ if (scalar)
+ {
+ EmitVectorZeroLowerTmp(context);
+ }
+
+ if (part != 0)
+ {
+ context.EmitLdvec(op.Rd);
+ context.EmitStvectmp();
+ }
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, index, op.Size + 1, signedSrc);
+
+ EmitSatQ(context, op.Size, signedSrc, signedDst);
+
+ EmitVectorInsertTmp(context, part + index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (part == 0)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ // TSrc (16bit, 32bit, 64bit; signed, unsigned) > TDst (8bit, 16bit, 32bit; signed, unsigned).
+ public static void EmitSatQ(
+ ILEmitterCtx context,
+ int sizeDst,
+ bool signedSrc,
+ bool signedDst)
+ {
+ if (sizeDst > 2)
+ {
+ throw new ArgumentOutOfRangeException(nameof(sizeDst));
+ }
+
+ context.EmitLdc_I4(sizeDst);
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ if (signedSrc)
+ {
+ SoftFallback.EmitCall(context, signedDst
+ ? nameof(SoftFallback.SignedSrcSignedDstSatQ)
+ : nameof(SoftFallback.SignedSrcUnsignedDstSatQ));
+ }
+ else
+ {
+ SoftFallback.EmitCall(context, signedDst
+ ? nameof(SoftFallback.UnsignedSrcSignedDstSatQ)
+ : nameof(SoftFallback.UnsignedSrcUnsignedDstSatQ));
+ }
+ }
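// Illustrative sketch, not part of this patch (names are hypothetical): the
// SoftFallback SatQ helpers called above conceptually clamp the wide value into
// the range of the destination element size and record saturation in FPSR.QC.
static long SignedSatQSketch(long value, int sizeDst, ref bool qcFlag)
{
    long max =  (1L << ((8 << sizeDst) - 1)) - 1; // e.g. 127 when sizeDst == 0
    long min = -(1L << ((8 << sizeDst) - 1));     // e.g. -128 when sizeDst == 0

    if (value > max) { qcFlag = true; return max; }
    if (value < min) { qcFlag = true; return min; }

    return value;
}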
+
+ // TSrc (64bit) == TDst (64bit); signed.
+ public static void EmitUnarySignedSatQAbsOrNeg(ILEmitterCtx context)
+ {
+ if (((OpCodeSimd64)context.CurrOp).Size < 3)
+ {
+ throw new InvalidOperationException();
+ }
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.UnarySignedSatQAbsOrNeg));
+ }
+
+ // TSrcs (64bit) == TDst (64bit); signed, unsigned.
+ public static void EmitBinarySatQAdd(ILEmitterCtx context, bool signed)
+ {
+ if (((OpCodeSimdReg64)context.CurrOp).Size < 3)
+ {
+ throw new InvalidOperationException();
+ }
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ SoftFallback.EmitCall(context, signed
+ ? nameof(SoftFallback.BinarySignedSatQAdd)
+ : nameof(SoftFallback.BinaryUnsignedSatQAdd));
+ }
+
+ // TSrcs (64bit) == TDst (64bit); signed, unsigned.
+ public static void EmitBinarySatQSub(ILEmitterCtx context, bool signed)
+ {
+ if (((OpCodeSimdReg64)context.CurrOp).Size < 3)
+ {
+ throw new InvalidOperationException();
+ }
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ SoftFallback.EmitCall(context, signed
+ ? nameof(SoftFallback.BinarySignedSatQSub)
+ : nameof(SoftFallback.BinaryUnsignedSatQSub));
+ }
+
+ // TSrcs (64bit) == TDst (64bit); signed, unsigned.
+ public static void EmitBinarySatQAccumulate(ILEmitterCtx context, bool signed)
+ {
+ if (((OpCodeSimd64)context.CurrOp).Size < 3)
+ {
+ throw new InvalidOperationException();
+ }
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ SoftFallback.EmitCall(context, signed
+ ? nameof(SoftFallback.BinarySignedSatQAcc)
+ : nameof(SoftFallback.BinaryUnsignedSatQAcc));
+ }
+
+ public static void EmitScalarSet(ILEmitterCtx context, int reg, int size)
+ {
+ EmitVectorZeroAll(context, reg);
+ EmitVectorInsert(context, reg, 0, size);
+ }
+
+ public static void EmitScalarSetF(ILEmitterCtx context, int reg, int size)
+ {
+ if (Optimizations.UseSse41 && size == 0)
+ {
+                //If the type is float, we can perform the insertion and
+                //zero the upper bits with a single instruction (INSERTPS).
+ context.EmitLdvec(reg);
+
+ VectorHelper.EmitCall(context, nameof(VectorHelper.Sse41VectorInsertScalarSingle));
+
+ context.EmitStvec(reg);
+ }
+ else
+ {
+ EmitVectorZeroAll(context, reg);
+ EmitVectorInsertF(context, reg, 0, size);
+ }
+ }
+
+ public static void EmitVectorExtractSx(ILEmitterCtx context, int reg, int index, int size)
+ {
+ EmitVectorExtract(context, reg, index, size, true);
+ }
+
+ public static void EmitVectorExtractZx(ILEmitterCtx context, int reg, int index, int size)
+ {
+ EmitVectorExtract(context, reg, index, size, false);
+ }
+
+ public static void EmitVectorExtract(ILEmitterCtx context, int reg, int index, int size, bool signed)
+ {
+ ThrowIfInvalid(index, size);
+
+ context.EmitLdvec(reg);
+ context.EmitLdc_I4(index);
+ context.EmitLdc_I4(size);
+
+ VectorHelper.EmitCall(context, signed
+ ? nameof(VectorHelper.VectorExtractIntSx)
+ : nameof(VectorHelper.VectorExtractIntZx));
+ }
+
+ public static void EmitVectorExtractF(ILEmitterCtx context, int reg, int index, int size)
+ {
+ ThrowIfInvalidF(index, size);
+
+ context.EmitLdvec(reg);
+ context.EmitLdc_I4(index);
+
+ if (size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorExtractSingle));
+ }
+ else if (size == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorExtractDouble));
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+
+ public static void EmitVectorZeroAll(ILEmitterCtx context, int rd)
+ {
+ if (Optimizations.UseSse2)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+ context.EmitStvec(rd);
+ }
+ else
+ {
+ EmitVectorZeroLower(context, rd);
+ EmitVectorZeroUpper(context, rd);
+ }
+ }
+
+ public static void EmitVectorZeroLower(ILEmitterCtx context, int rd)
+ {
+ EmitVectorInsert(context, rd, 0, 3, 0);
+ }
+
+ public static void EmitVectorZeroLowerTmp(ILEmitterCtx context)
+ {
+ EmitVectorInsertTmp(context, 0, 3, 0);
+ }
+
+ public static void EmitVectorZeroUpper(ILEmitterCtx context, int reg)
+ {
+ if (Optimizations.UseSse2)
+ {
+                //TODO: Use MoveScalar once it is fixed; as of the
+                //time of writing it just crashes the JIT.
+ EmitLdvecWithUnsignedCast(context, reg, 3);
+
+ Type[] types = new Type[] { typeof(Vector128<ulong>), typeof(byte) };
+
+                //context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MoveScalar), types));
+
+ context.EmitLdc_I4(8);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical128BitLane), types));
+
+ context.EmitLdc_I4(8);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), types));
+
+ EmitStvecWithUnsignedCast(context, reg, 3);
+ }
+ else
+ {
+ EmitVectorInsert(context, reg, 1, 3, 0);
+ }
+ }
+
+ public static void EmitVectorZero32_128(ILEmitterCtx context, int reg)
+ {
+ context.EmitLdvec(reg);
+
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorZero32_128));
+
+ context.EmitStvec(reg);
+ }
+
+ public static void EmitVectorInsert(ILEmitterCtx context, int reg, int index, int size)
+ {
+ ThrowIfInvalid(index, size);
+
+ context.EmitLdvec(reg);
+ context.EmitLdc_I4(index);
+ context.EmitLdc_I4(size);
+
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInsertInt));
+
+ context.EmitStvec(reg);
+ }
+
+ public static void EmitVectorInsertTmp(ILEmitterCtx context, int index, int size)
+ {
+ ThrowIfInvalid(index, size);
+
+ context.EmitLdvectmp();
+ context.EmitLdc_I4(index);
+ context.EmitLdc_I4(size);
+
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInsertInt));
+
+ context.EmitStvectmp();
+ }
+
+ public static void EmitVectorInsert(ILEmitterCtx context, int reg, int index, int size, long value)
+ {
+ ThrowIfInvalid(index, size);
+
+ context.EmitLdc_I8(value);
+ context.EmitLdvec(reg);
+ context.EmitLdc_I4(index);
+ context.EmitLdc_I4(size);
+
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInsertInt));
+
+ context.EmitStvec(reg);
+ }
+
+ public static void EmitVectorInsertTmp(ILEmitterCtx context, int index, int size, long value)
+ {
+ ThrowIfInvalid(index, size);
+
+ context.EmitLdc_I8(value);
+ context.EmitLdvectmp();
+ context.EmitLdc_I4(index);
+ context.EmitLdc_I4(size);
+
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInsertInt));
+
+ context.EmitStvectmp();
+ }
+
+ public static void EmitVectorInsertF(ILEmitterCtx context, int reg, int index, int size)
+ {
+ ThrowIfInvalidF(index, size);
+
+ context.EmitLdvec(reg);
+ context.EmitLdc_I4(index);
+
+ if (size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInsertSingle));
+ }
+ else if (size == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInsertDouble));
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ context.EmitStvec(reg);
+ }
+
+ public static void EmitVectorInsertTmpF(ILEmitterCtx context, int index, int size)
+ {
+ ThrowIfInvalidF(index, size);
+
+ context.EmitLdvectmp();
+ context.EmitLdc_I4(index);
+
+ if (size == 0)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInsertSingle));
+ }
+ else if (size == 1)
+ {
+ VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInsertDouble));
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ context.EmitStvectmp();
+ }
+
+ private static void ThrowIfInvalid(int index, int size)
+ {
+ if ((uint)size > 3u)
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ if ((uint)index >= 16u >> size)
+ {
+ throw new ArgumentOutOfRangeException(nameof(index));
+ }
+ }
+
+ private static void ThrowIfInvalidF(int index, int size)
+ {
+ if ((uint)size > 1u)
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+
+ if ((uint)index >= 4u >> size)
+ {
+ throw new ArgumentOutOfRangeException(nameof(index));
+ }
+ }
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdLogical.cs b/ChocolArm64/Instructions/InstEmitSimdLogical.cs
new file mode 100644
index 00000000..f51568eb
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdLogical.cs
@@ -0,0 +1,311 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
+
+using static ChocolArm64.Instructions.InstEmitSimdHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void And_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse2)
+ {
+ EmitSse2Op(context, nameof(Sse2.And));
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () => context.Emit(OpCodes.And));
+ }
+ }
+
+ public static void Bic_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse2)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ Type[] types = new Type[]
+ {
+ VectorUIntTypesPerSizeLog2[op.Size],
+ VectorUIntTypesPerSizeLog2[op.Size]
+ };
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), types));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Not);
+ context.Emit(OpCodes.And);
+ });
+ }
+ }
+
+ public static void Bic_Vi(ILEmitterCtx context)
+ {
+ EmitVectorImmBinaryOp(context, () =>
+ {
+ context.Emit(OpCodes.Not);
+ context.Emit(OpCodes.And);
+ });
+ }
+
+ public static void Bif_V(ILEmitterCtx context)
+ {
+ EmitBitBif(context, true);
+ }
+
+ public static void Bit_V(ILEmitterCtx context)
+ {
+ EmitBitBif(context, false);
+ }
+
+ private static void EmitBitBif(ILEmitterCtx context, bool notRm)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse2)
+ {
+ Type[] types = new Type[]
+ {
+ VectorUIntTypesPerSizeLog2[op.Size],
+ VectorUIntTypesPerSizeLog2[op.Size]
+ };
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+ EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types));
+
+ string name = notRm ? nameof(Sse2.AndNot) : nameof(Sse2.And);
+
+ context.EmitCall(typeof(Sse2).GetMethod(name, types));
+
+ EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rd, index, op.Size);
+ EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+ context.Emit(OpCodes.Xor);
+
+ EmitVectorExtractZx(context, op.Rm, index, op.Size);
+
+ if (notRm)
+ {
+ context.Emit(OpCodes.Not);
+ }
+
+ context.Emit(OpCodes.And);
+
+ EmitVectorExtractZx(context, op.Rd, index, op.Size);
+
+ context.Emit(OpCodes.Xor);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ }
+
+ public static void Bsl_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse2)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ Type[] types = new Type[]
+ {
+ VectorUIntTypesPerSizeLog2[op.Size],
+ VectorUIntTypesPerSizeLog2[op.Size]
+ };
+
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types));
+
+ EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), types));
+
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorTernaryOpZx(context, () =>
+ {
+ context.EmitSttmp();
+ context.EmitLdtmp();
+
+ context.Emit(OpCodes.Xor);
+ context.Emit(OpCodes.And);
+
+ context.EmitLdtmp();
+
+ context.Emit(OpCodes.Xor);
+ });
+ }
+ }
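// Illustrative sketch, not part of this patch: BSL, BIT and BIF are all the same
// bitwise select, result = b ^ ((b ^ a) & mask), which the xor/and/xor sequences
// above compute without branching. BSL uses Rd as the mask, BIT uses Rm and BIF
// uses ~Rm.
static ulong BitwiseSelectSketch(ulong mask, ulong a, ulong b)
{
    // Bit i of the result comes from 'a' where the mask bit is set,
    // otherwise from 'b'.
    return b ^ ((b ^ a) & mask);
}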
+
+ public static void Eor_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse2)
+ {
+ EmitSse2Op(context, nameof(Sse2.Xor));
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () => context.Emit(OpCodes.Xor));
+ }
+ }
+
+ public static void Not_V(ILEmitterCtx context)
+ {
+ EmitVectorUnaryOpZx(context, () => context.Emit(OpCodes.Not));
+ }
+
+ public static void Orn_V(ILEmitterCtx context)
+ {
+ EmitVectorBinaryOpZx(context, () =>
+ {
+ context.Emit(OpCodes.Not);
+ context.Emit(OpCodes.Or);
+ });
+ }
+
+ public static void Orr_V(ILEmitterCtx context)
+ {
+ if (Optimizations.UseSse2)
+ {
+ EmitSse2Op(context, nameof(Sse2.Or));
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, () => context.Emit(OpCodes.Or));
+ }
+ }
+
+ public static void Orr_Vi(ILEmitterCtx context)
+ {
+ EmitVectorImmBinaryOp(context, () => context.Emit(OpCodes.Or));
+ }
+
+ public static void Rbit_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int elems = op.RegisterSize == RegisterSize.Simd128 ? 16 : 8;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, 0);
+
+ context.Emit(OpCodes.Conv_U4);
+
+ SoftFallback.EmitCall(context, nameof(SoftFallback.ReverseBits8));
+
+ context.Emit(OpCodes.Conv_U8);
+
+ EmitVectorInsert(context, op.Rd, index, 0);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void Rev16_V(ILEmitterCtx context)
+ {
+ EmitRev_V(context, containerSize: 1);
+ }
+
+ public static void Rev32_V(ILEmitterCtx context)
+ {
+ EmitRev_V(context, containerSize: 2);
+ }
+
+ public static void Rev64_V(ILEmitterCtx context)
+ {
+ EmitRev_V(context, containerSize: 3);
+ }
+
+ private static void EmitRev_V(ILEmitterCtx context, int containerSize)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ if (op.Size >= containerSize)
+ {
+ throw new InvalidOperationException();
+ }
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ int containerMask = (1 << (containerSize - op.Size)) - 1;
+
+ for (int index = 0; index < elems; index++)
+ {
+ int revIndex = index ^ containerMask;
+
+ EmitVectorExtractZx(context, op.Rn, revIndex, op.Size);
+
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
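// Illustrative sketch, not part of this patch: reversing the elements inside each
// container is just an XOR on the element index, which is what the revIndex
// computation above relies on. For example, REV32 on bytes (size 0, container
// size 2) maps indices 0,1,2,3 to 3,2,1,0 within every 4-byte container.
static int RevElemIndexSketch(int index, int size, int containerSize)
{
    int containerMask = (1 << (containerSize - size)) - 1;

    return index ^ containerMask;
}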
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdMemory.cs b/ChocolArm64/Instructions/InstEmitSimdMemory.cs
new file mode 100644
index 00000000..eb053257
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdMemory.cs
@@ -0,0 +1,185 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+
+using static ChocolArm64.Instructions.InstEmitMemoryHelper;
+using static ChocolArm64.Instructions.InstEmitSimdHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Ld__Vms(ILEmitterCtx context)
+ {
+ EmitSimdMemMs(context, isLoad: true);
+ }
+
+ public static void Ld__Vss(ILEmitterCtx context)
+ {
+ EmitSimdMemSs(context, isLoad: true);
+ }
+
+ public static void St__Vms(ILEmitterCtx context)
+ {
+ EmitSimdMemMs(context, isLoad: false);
+ }
+
+ public static void St__Vss(ILEmitterCtx context)
+ {
+ EmitSimdMemSs(context, isLoad: false);
+ }
+
+ private static void EmitSimdMemMs(ILEmitterCtx context, bool isLoad)
+ {
+ OpCodeSimdMemMs64 op = (OpCodeSimdMemMs64)context.CurrOp;
+
+ int offset = 0;
+
+ for (int rep = 0; rep < op.Reps; rep++)
+ for (int elem = 0; elem < op.Elems; elem++)
+ for (int sElem = 0; sElem < op.SElems; sElem++)
+ {
+ int rtt = (op.Rt + rep + sElem) & 0x1f;
+
+ if (isLoad)
+ {
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdint(op.Rn);
+ context.EmitLdc_I8(offset);
+
+ context.Emit(OpCodes.Add);
+
+ EmitReadZxCall(context, op.Size);
+
+ EmitVectorInsert(context, rtt, elem, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64 && elem == op.Elems - 1)
+ {
+ EmitVectorZeroUpper(context, rtt);
+ }
+ }
+ else
+ {
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdint(op.Rn);
+ context.EmitLdc_I8(offset);
+
+ context.Emit(OpCodes.Add);
+
+ EmitVectorExtractZx(context, rtt, elem, op.Size);
+
+ EmitWriteCall(context, op.Size);
+ }
+
+ offset += 1 << op.Size;
+ }
+
+ if (op.WBack)
+ {
+ EmitSimdMemWBack(context, offset);
+ }
+ }
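// Illustrative sketch, not part of this patch: the multi-structure form walks
// memory linearly while rotating through registers (Rt + rep + sElem) mod 32, so
// for example LD2 interleaves v(Rt)[0], v(Rt+1)[0], v(Rt)[1], v(Rt+1)[1], ...
// The register/element/offset sequence can be enumerated like this.
static (int Reg, int Elem, int Offset)[] MultiStructLayoutSketch(int rt, int reps, int elems, int sElems, int size)
{
    var layout = new (int Reg, int Elem, int Offset)[reps * elems * sElems];

    int n      = 0;
    int offset = 0;

    for (int rep = 0; rep < reps; rep++)
    for (int elem = 0; elem < elems; elem++)
    for (int sElem = 0; sElem < sElems; sElem++)
    {
        layout[n++] = ((rt + rep + sElem) & 0x1f, elem, offset);

        offset += 1 << size;
    }

    return layout;
}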
+
+ private static void EmitSimdMemSs(ILEmitterCtx context, bool isLoad)
+ {
+ OpCodeSimdMemSs64 op = (OpCodeSimdMemSs64)context.CurrOp;
+
+ int offset = 0;
+
+ void EmitMemAddress()
+ {
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdint(op.Rn);
+ context.EmitLdc_I8(offset);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ if (op.Replicate)
+ {
+                //Only loads use the replicate mode.
+ if (!isLoad)
+ {
+ throw new InvalidOperationException();
+ }
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ for (int sElem = 0; sElem < op.SElems; sElem++)
+ {
+ int rt = (op.Rt + sElem) & 0x1f;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitMemAddress();
+
+ EmitReadZxCall(context, op.Size);
+
+ EmitVectorInsert(context, rt, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, rt);
+ }
+
+ offset += 1 << op.Size;
+ }
+ }
+ else
+ {
+ for (int sElem = 0; sElem < op.SElems; sElem++)
+ {
+ int rt = (op.Rt + sElem) & 0x1f;
+
+ if (isLoad)
+ {
+ EmitMemAddress();
+
+ EmitReadZxCall(context, op.Size);
+
+ EmitVectorInsert(context, rt, op.Index, op.Size);
+ }
+ else
+ {
+ EmitMemAddress();
+
+ EmitVectorExtractZx(context, rt, op.Index, op.Size);
+
+ EmitWriteCall(context, op.Size);
+ }
+
+ offset += 1 << op.Size;
+ }
+ }
+
+ if (op.WBack)
+ {
+ EmitSimdMemWBack(context, offset);
+ }
+ }
+
+ private static void EmitSimdMemWBack(ILEmitterCtx context, int offset)
+ {
+ OpCodeMemReg64 op = (OpCodeMemReg64)context.CurrOp;
+
+ context.EmitLdint(op.Rn);
+
+ if (op.Rm != CpuThreadState.ZrIndex)
+ {
+ context.EmitLdint(op.Rm);
+ }
+ else
+ {
+ context.EmitLdc_I8(offset);
+ }
+
+ context.Emit(OpCodes.Add);
+
+ context.EmitStint(op.Rn);
+ }
+ }
+} \ No newline at end of file
diff --git a/ChocolArm64/Instructions/InstEmitSimdMove.cs b/ChocolArm64/Instructions/InstEmitSimdMove.cs
new file mode 100644
index 00000000..3f539b8a
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdMove.cs
@@ -0,0 +1,562 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
+
+using static ChocolArm64.Instructions.InstEmitSimdHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Dup_Gp(ILEmitterCtx context)
+ {
+ OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
+
+ if (Optimizations.UseSse2)
+ {
+ context.EmitLdintzr(op.Rn);
+
+ switch (op.Size)
+ {
+ case 0: context.Emit(OpCodes.Conv_U1); break;
+ case 1: context.Emit(OpCodes.Conv_U2); break;
+ case 2: context.Emit(OpCodes.Conv_U4); break;
+ }
+
+ Type[] types = new Type[] { UIntTypesPerSizeLog2[op.Size] };
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), types));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ for (int index = 0; index < elems; index++)
+ {
+ context.EmitLdintzr(op.Rn);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ }
+
+ public static void Dup_S(ILEmitterCtx context)
+ {
+ OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
+
+ EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size);
+
+ EmitScalarSet(context, op.Rd, op.Size);
+ }
+
+ public static void Dup_V(ILEmitterCtx context)
+ {
+ OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void Ext_V(ILEmitterCtx context)
+ {
+ OpCodeSimdExt64 op = (OpCodeSimdExt64)context.CurrOp;
+
+ context.EmitLdvec(op.Rd);
+ context.EmitStvectmp();
+
+ int bytes = op.GetBitsCount() >> 3;
+
+ int position = op.Imm4;
+
+ for (int index = 0; index < bytes; index++)
+ {
+ int reg = op.Imm4 + index < bytes ? op.Rn : op.Rm;
+
+ if (position == bytes)
+ {
+ position = 0;
+ }
+
+ EmitVectorExtractZx(context, reg, position++, 0);
+ EmitVectorInsertTmp(context, index, 0);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
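// Illustrative sketch, not part of this patch: EXT reads a register-wide window
// of bytes out of the concatenation Rn:Rm starting at byte Imm4, which is what
// the wrap-around handling of 'position' above implements.
static byte[] ExtWindowSketch(byte[] rn, byte[] rm, int imm4)
{
    int bytes = rn.Length;

    byte[] rd = new byte[bytes];

    for (int index = 0; index < bytes; index++)
    {
        int src = imm4 + index;

        rd[index] = src < bytes ? rn[src] : rm[src - bytes];
    }

    return rd;
}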
+
+ public static void Fcsel_S(ILEmitterCtx context)
+ {
+ OpCodeSimdFcond64 op = (OpCodeSimdFcond64)context.CurrOp;
+
+ ILLabel lblTrue = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ context.EmitCondBranch(lblTrue, op.Cond);
+
+ EmitVectorExtractF(context, op.Rm, 0, op.Size);
+
+ context.Emit(OpCodes.Br_S, lblEnd);
+
+ context.MarkLabel(lblTrue);
+
+ EmitVectorExtractF(context, op.Rn, 0, op.Size);
+
+ context.MarkLabel(lblEnd);
+
+ EmitScalarSetF(context, op.Rd, op.Size);
+ }
+
+ public static void Fmov_Ftoi(ILEmitterCtx context)
+ {
+ OpCodeSimdCvt64 op = (OpCodeSimdCvt64)context.CurrOp;
+
+ EmitVectorExtractZx(context, op.Rn, 0, 3);
+
+ EmitIntZeroUpperIfNeeded(context);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Fmov_Ftoi1(ILEmitterCtx context)
+ {
+ OpCodeSimdCvt64 op = (OpCodeSimdCvt64)context.CurrOp;
+
+ EmitVectorExtractZx(context, op.Rn, 1, 3);
+
+ EmitIntZeroUpperIfNeeded(context);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Fmov_Itof(ILEmitterCtx context)
+ {
+ OpCodeSimdCvt64 op = (OpCodeSimdCvt64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ EmitIntZeroUpperIfNeeded(context);
+
+ EmitScalarSet(context, op.Rd, 3);
+ }
+
+ public static void Fmov_Itof1(ILEmitterCtx context)
+ {
+ OpCodeSimdCvt64 op = (OpCodeSimdCvt64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ EmitIntZeroUpperIfNeeded(context);
+
+ EmitVectorInsert(context, op.Rd, 1, 3);
+ }
+
+ public static void Fmov_S(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ EmitVectorExtractF(context, op.Rn, 0, op.Size);
+
+ EmitScalarSetF(context, op.Rd, op.Size);
+ }
+
+ public static void Fmov_Si(ILEmitterCtx context)
+ {
+ OpCodeSimdFmov64 op = (OpCodeSimdFmov64)context.CurrOp;
+
+ context.EmitLdc_I8(op.Imm);
+
+ EmitScalarSet(context, op.Rd, op.Size + 2);
+ }
+
+ public static void Fmov_V(ILEmitterCtx context)
+ {
+ OpCodeSimdImm64 op = (OpCodeSimdImm64)context.CurrOp;
+
+ int elems = op.RegisterSize == RegisterSize.Simd128 ? 4 : 2;
+
+ for (int index = 0; index < (elems >> op.Size); index++)
+ {
+ context.EmitLdc_I8(op.Imm);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size + 2);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ public static void Ins_Gp(ILEmitterCtx context)
+ {
+ OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
+
+ context.EmitLdintzr(op.Rn);
+
+ EmitVectorInsert(context, op.Rd, op.DstIndex, op.Size);
+ }
+
+ public static void Ins_V(ILEmitterCtx context)
+ {
+ OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
+
+ EmitVectorExtractZx(context, op.Rn, op.SrcIndex, op.Size);
+
+ EmitVectorInsert(context, op.Rd, op.DstIndex, op.Size);
+ }
+
+ public static void Movi_V(ILEmitterCtx context)
+ {
+ EmitVectorImmUnaryOp(context, () => { });
+ }
+
+ public static void Mvni_V(ILEmitterCtx context)
+ {
+ EmitVectorImmUnaryOp(context, () => context.Emit(OpCodes.Not));
+ }
+
+ public static void Smov_S(ILEmitterCtx context)
+ {
+ OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
+
+ EmitVectorExtractSx(context, op.Rn, op.DstIndex, op.Size);
+
+ EmitIntZeroUpperIfNeeded(context);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Tbl_V(ILEmitterCtx context)
+ {
+ OpCodeSimdTbl64 op = (OpCodeSimdTbl64)context.CurrOp;
+
+ context.EmitLdvec(op.Rm);
+
+ for (int index = 0; index < op.Size; index++)
+ {
+ context.EmitLdvec((op.Rn + index) & 0x1f);
+ }
+
+ switch (op.Size)
+ {
+ case 1: VectorHelper.EmitCall(context,
+ nameof(VectorHelper.Tbl1_V64),
+ nameof(VectorHelper.Tbl1_V128)); break;
+
+ case 2: VectorHelper.EmitCall(context,
+ nameof(VectorHelper.Tbl2_V64),
+ nameof(VectorHelper.Tbl2_V128)); break;
+
+ case 3: VectorHelper.EmitCall(context,
+ nameof(VectorHelper.Tbl3_V64),
+ nameof(VectorHelper.Tbl3_V128)); break;
+
+ case 4: VectorHelper.EmitCall(context,
+ nameof(VectorHelper.Tbl4_V64),
+ nameof(VectorHelper.Tbl4_V128)); break;
+
+ default: throw new InvalidOperationException();
+ }
+
+ context.EmitStvec(op.Rd);
+ }
+
+ public static void Trn1_V(ILEmitterCtx context)
+ {
+ EmitVectorTranspose(context, part: 0);
+ }
+
+ public static void Trn2_V(ILEmitterCtx context)
+ {
+ EmitVectorTranspose(context, part: 1);
+ }
+
+ public static void Umov_S(ILEmitterCtx context)
+ {
+ OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
+
+ EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size);
+
+ context.EmitStintzr(op.Rd);
+ }
+
+ public static void Uzp1_V(ILEmitterCtx context)
+ {
+ EmitVectorUnzip(context, part: 0);
+ }
+
+ public static void Uzp2_V(ILEmitterCtx context)
+ {
+ EmitVectorUnzip(context, part: 1);
+ }
+
+ public static void Xtn_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int elems = 8 >> op.Size;
+
+ int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+ if (Optimizations.UseSse41 && op.Size < 2)
+ {
+ void EmitZeroVector()
+ {
+ switch (op.Size)
+ {
+ case 0: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt16Zero)); break;
+ case 1: VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt32Zero)); break;
+ }
+ }
+
+ //For XTN, first operand is source, second operand is 0.
+ //For XTN2, first operand is 0, second operand is source.
+ if (part != 0)
+ {
+ EmitZeroVector();
+ }
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size + 1);
+
+ //Set mask to discard the upper half of the wide elements.
+ switch (op.Size)
+ {
+ case 0: context.EmitLdc_I4(0x00ff); break;
+ case 1: context.EmitLdc_I4(0x0000ffff); break;
+ }
+
+ Type wideType = IntTypesPerSizeLog2[op.Size + 1];
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), new Type[] { wideType }));
+
+ wideType = VectorIntTypesPerSizeLog2[op.Size + 1];
+
+ Type[] wideTypes = new Type[] { wideType, wideType };
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), wideTypes));
+
+ if (part == 0)
+ {
+ EmitZeroVector();
+ }
+
+                //Pack values with unsigned saturation; the saturation shouldn't
+                //actually saturate anything since the upper bits were masked off.
+ Type sseType = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+
+ context.EmitCall(sseType.GetMethod(nameof(Sse2.PackUnsignedSaturate), wideTypes));
+
+ if (part != 0)
+ {
+ //For XTN2, we additionally need to discard the upper bits
+ //of the target register and OR the result with it.
+ EmitVectorZeroUpper(context, op.Rd);
+
+ EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ Type narrowType = VectorUIntTypesPerSizeLog2[op.Size];
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), new Type[] { narrowType, narrowType }));
+ }
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+ }
+ else
+ {
+ if (part != 0)
+ {
+ context.EmitLdvec(op.Rd);
+ context.EmitStvectmp();
+ }
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size + 1);
+
+ EmitVectorInsertTmp(context, part + index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (part == 0)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ }
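// Illustrative sketch, not part of this patch: XTN keeps the low half of each
// wide element and zeroes the upper half of Rd, while XTN2 writes the narrowed
// values into the upper half of Rd and preserves its lower half; that is the
// role of 'part' in both paths above. Shown for 32-bit to 16-bit elements.
static ushort[] XtnSketch(uint[] rn, ushort[] rd, bool upperHalf)
{
    int elems = rn.Length;
    int part  = upperHalf ? elems : 0;

    ushort[] result = new ushort[2 * elems];

    if (upperHalf)
    {
        for (int index = 0; index < elems; index++)
        {
            result[index] = rd[index]; // XTN2 keeps the existing lower half
        }
    }

    for (int index = 0; index < elems; index++)
    {
        result[part + index] = (ushort)rn[index]; // keep the low 16 bits only
    }

    return result;
}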
+
+ public static void Zip1_V(ILEmitterCtx context)
+ {
+ EmitVectorZip(context, part: 0);
+ }
+
+ public static void Zip2_V(ILEmitterCtx context)
+ {
+ EmitVectorZip(context, part: 1);
+ }
+
+ private static void EmitIntZeroUpperIfNeeded(ILEmitterCtx context)
+ {
+ if (context.CurrOp.RegisterSize == RegisterSize.Int32 ||
+ context.CurrOp.RegisterSize == RegisterSize.Simd64)
+ {
+ context.Emit(OpCodes.Conv_U4);
+ context.Emit(OpCodes.Conv_U8);
+ }
+ }
+
+ private static void EmitVectorTranspose(ILEmitterCtx context, int part)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int words = op.GetBitsCount() >> 4;
+ int pairs = words >> op.Size;
+
+ for (int index = 0; index < pairs; index++)
+ {
+ int idx = index << 1;
+
+ EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
+ EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+
+ EmitVectorInsertTmp(context, idx + 1, op.Size);
+ EmitVectorInsertTmp(context, idx, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitVectorUnzip(ILEmitterCtx context, int part)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ int words = op.GetBitsCount() >> 4;
+ int pairs = words >> op.Size;
+
+ for (int index = 0; index < pairs; index++)
+ {
+ int idx = index << 1;
+
+ EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
+ EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
+
+ EmitVectorInsertTmp(context, pairs + index, op.Size);
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitVectorZip(ILEmitterCtx context, int part)
+ {
+ OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+ if (Optimizations.UseSse2)
+ {
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+ EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+ Type[] types = new Type[]
+ {
+ VectorUIntTypesPerSizeLog2[op.Size],
+ VectorUIntTypesPerSizeLog2[op.Size]
+ };
+
+ string name = part == 0 || (part != 0 && op.RegisterSize == RegisterSize.Simd64)
+ ? nameof(Sse2.UnpackLow)
+ : nameof(Sse2.UnpackHigh);
+
+ context.EmitCall(typeof(Sse2).GetMethod(name, types));
+
+ if (op.RegisterSize == RegisterSize.Simd64 && part != 0)
+ {
+ context.EmitLdc_I4(8);
+
+ Type[] shTypes = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), shTypes));
+ }
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64 && part == 0)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ int words = op.GetBitsCount() >> 4;
+ int pairs = words >> op.Size;
+
+ int Base = part != 0 ? pairs : 0;
+
+ for (int index = 0; index < pairs; index++)
+ {
+ int idx = index << 1;
+
+ EmitVectorExtractZx(context, op.Rn, Base + index, op.Size);
+ EmitVectorExtractZx(context, op.Rm, Base + index, op.Size);
+
+ EmitVectorInsertTmp(context, idx + 1, op.Size);
+ EmitVectorInsertTmp(context, idx, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ }
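// Illustrative sketch, not part of this patch: ZIP interleaves one half of Rn
// with the same half of Rm; ZIP1 takes the lower halves and ZIP2 the upper ones,
// which is what the scalar loop above computes directly and the SSE path maps
// onto UnpackLow/UnpackHigh.
static long[] ZipSketch(long[] rn, long[] rm, bool upperHalf)
{
    int pairs = rn.Length / 2;
    int start = upperHalf ? pairs : 0;

    long[] rd = new long[rn.Length];

    for (int index = 0; index < pairs; index++)
    {
        rd[2 * index]     = rn[start + index];
        rd[2 * index + 1] = rm[start + index];
    }

    return rd;
}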
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdShift.cs b/ChocolArm64/Instructions/InstEmitSimdShift.cs
new file mode 100644
index 00000000..3c24ff23
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSimdShift.cs
@@ -0,0 +1,865 @@
+// https://github.com/intel/ARM_NEON_2_x86_SSE/blob/master/NEON_2_SSE.h
+
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
+
+using static ChocolArm64.Instructions.InstEmitSimdHelper;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Rshrn_V(ILEmitterCtx context)
+ {
+ EmitVectorShrImmNarrowOpZx(context, round: true);
+ }
+
+ public static void Shl_S(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ EmitScalarUnaryOpZx(context, () =>
+ {
+ context.EmitLdc_I4(GetImmShl(op));
+
+ context.Emit(OpCodes.Shl);
+ });
+ }
+
+ public static void Shl_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0)
+ {
+ Type[] typesSll = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(GetImmShl(op));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorUnaryOpZx(context, () =>
+ {
+ context.EmitLdc_I4(GetImmShl(op));
+
+ context.Emit(OpCodes.Shl);
+ });
+ }
+ }
+
+ public static void Shll_V(ILEmitterCtx context)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int shift = 8 << op.Size;
+
+ EmitVectorShImmWidenBinaryZx(context, () => context.Emit(OpCodes.Shl), shift);
+ }
+
+ public static void Shrn_V(ILEmitterCtx context)
+ {
+ EmitVectorShrImmNarrowOpZx(context, round: false);
+ }
+
+ public static void Sli_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = bytes >> op.Size;
+
+ int shift = GetImmShl(op);
+
+ ulong mask = shift != 0 ? ulong.MaxValue >> (64 - shift) : 0;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+ context.EmitLdc_I4(shift);
+
+ context.Emit(OpCodes.Shl);
+
+ EmitVectorExtractZx(context, op.Rd, index, op.Size);
+
+ context.EmitLdc_I8((long)mask);
+
+ context.Emit(OpCodes.And);
+ context.Emit(OpCodes.Or);
+
+ EmitVectorInsert(context, op.Rd, index, op.Size);
+ }
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
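// Illustrative sketch, not part of this patch: SLI shifts Rn left and keeps only
// the low <shift> bits of the old Rd, which is exactly the mask built above.
static ulong ShiftLeftInsertSketch(ulong rd, ulong rn, int shift)
{
    ulong mask = shift != 0 ? ulong.MaxValue >> (64 - shift) : 0;

    return (rn << shift) | (rd & mask);
}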
+
+ public static void Sqrshrn_S(ILEmitterCtx context)
+ {
+ EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx);
+ }
+
+ public static void Sqrshrn_V(ILEmitterCtx context)
+ {
+ EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxSx);
+ }
+
+ public static void Sqrshrun_S(ILEmitterCtx context)
+ {
+ EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxZx);
+ }
+
+ public static void Sqrshrun_V(ILEmitterCtx context)
+ {
+ EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx);
+ }
+
+ public static void Sqshrn_S(ILEmitterCtx context)
+ {
+ EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx);
+ }
+
+ public static void Sqshrn_V(ILEmitterCtx context)
+ {
+ EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxSx);
+ }
+
+ public static void Sqshrun_S(ILEmitterCtx context)
+ {
+ EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxZx);
+ }
+
+ public static void Sqshrun_V(ILEmitterCtx context)
+ {
+ EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx);
+ }
+
+ public static void Srshr_S(ILEmitterCtx context)
+ {
+ EmitScalarShrImmOpSx(context, ShrImmFlags.Round);
+ }
+
+ public static void Srshr_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0
+ && op.Size < 3)
+ {
+ Type[] typesShs = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+ int shift = GetImmShr(op);
+ int eSize = 8 << op.Size;
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp();
+
+ context.EmitLdc_I4(eSize - shift);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
+
+ context.EmitLdc_I4(eSize - 1);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
+
+ context.EmitLdvectmp();
+
+ context.EmitLdc_I4(shift);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesShs));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorShrImmOpSx(context, ShrImmFlags.Round);
+ }
+ }
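// Illustrative sketch, not part of this patch: a rounding shift right adds back
// the last bit shifted out, which is what the SSE shift pair above reconstructs
// for every element before the arithmetic shift and add. Valid for shift
// amounts of 1 to 63 on a 64-bit value.
static long RoundingShiftRightSketch(long value, int shift)
{
    long roundBit = (value >> (shift - 1)) & 1; // the last bit shifted out

    return (value >> shift) + roundBit;
}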
+
+ public static void Srsra_S(ILEmitterCtx context)
+ {
+ EmitScalarShrImmOpSx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate);
+ }
+
+ public static void Srsra_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0
+ && op.Size < 3)
+ {
+ Type[] typesShs = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+ int shift = GetImmShr(op);
+ int eSize = 8 << op.Size;
+
+ EmitLdvecWithSignedCast(context, op.Rd, op.Size);
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp();
+
+ context.EmitLdc_I4(eSize - shift);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
+
+ context.EmitLdc_I4(eSize - 1);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
+
+ context.EmitLdvectmp();
+
+ context.EmitLdc_I4(shift);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesShs));
+
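+ //The first Add applies the rounding bit computed above; the second Add accumulates
+ //the rounded result into the destination vector (Rd) loaded first.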
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorShrImmOpSx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate);
+ }
+ }
+
+ public static void Sshl_V(ILEmitterCtx context)
+ {
+ EmitVectorShl(context, signed: true);
+ }
+
+ public static void Sshll_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ EmitVectorShImmWidenBinarySx(context, () => context.Emit(OpCodes.Shl), GetImmShl(op));
+ }
+
+ public static void Sshr_S(ILEmitterCtx context)
+ {
+ EmitShrImmOp(context, ShrImmFlags.ScalarSx);
+ }
+
+ public static void Sshr_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0
+ && op.Size < 3)
+ {
+ Type[] typesSra = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(GetImmShr(op));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesSra));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitShrImmOp(context, ShrImmFlags.VectorSx);
+ }
+ }
+
+ public static void Ssra_S(ILEmitterCtx context)
+ {
+ EmitScalarShrImmOpSx(context, ShrImmFlags.Accumulate);
+ }
+
+ public static void Ssra_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0
+ && op.Size < 3)
+ {
+ Type[] typesSra = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+ EmitLdvecWithSignedCast(context, op.Rd, op.Size);
+ EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(GetImmShr(op));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesSra));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+ EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorShrImmOpSx(context, ShrImmFlags.Accumulate);
+ }
+ }
+
+ public static void Uqrshrn_S(ILEmitterCtx context)
+ {
+ EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx);
+ }
+
+ public static void Uqrshrn_V(ILEmitterCtx context)
+ {
+ EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx);
+ }
+
+ public static void Uqshrn_S(ILEmitterCtx context)
+ {
+ EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx);
+ }
+
+ public static void Uqshrn_V(ILEmitterCtx context)
+ {
+ EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx);
+ }
+
+ public static void Urshr_S(ILEmitterCtx context)
+ {
+ EmitScalarShrImmOpZx(context, ShrImmFlags.Round);
+ }
+
+ public static void Urshr_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0)
+ {
+ Type[] typesShs = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+
+ int shift = GetImmShr(op);
+ int eSize = 8 << op.Size;
+
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp();
+
+ context.EmitLdc_I4(eSize - shift);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
+
+ context.EmitLdc_I4(eSize - 1);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
+
+ context.EmitLdvectmp();
+
+ context.EmitLdc_I4(shift);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorShrImmOpZx(context, ShrImmFlags.Round);
+ }
+ }
+
+ public static void Ursra_S(ILEmitterCtx context)
+ {
+ EmitScalarShrImmOpZx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate);
+ }
+
+ public static void Ursra_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0)
+ {
+ Type[] typesShs = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+
+ int shift = GetImmShr(op);
+ int eSize = 8 << op.Size;
+
+ EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.Emit(OpCodes.Dup);
+ context.EmitStvectmp();
+
+ context.EmitLdc_I4(eSize - shift);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
+
+ context.EmitLdc_I4(eSize - 1);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
+
+ context.EmitLdvectmp();
+
+ context.EmitLdc_I4(shift);
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorShrImmOpZx(context, ShrImmFlags.Round | ShrImmFlags.Accumulate);
+ }
+ }
+
+ public static void Ushl_V(ILEmitterCtx context)
+ {
+ EmitVectorShl(context, signed: false);
+ }
+
+ public static void Ushll_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ EmitVectorShImmWidenBinaryZx(context, () => context.Emit(OpCodes.Shl), GetImmShl(op));
+ }
+
+ public static void Ushr_S(ILEmitterCtx context)
+ {
+ EmitShrImmOp(context, ShrImmFlags.ScalarZx);
+ }
+
+ public static void Ushr_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0)
+ {
+ Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(GetImmShr(op));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitShrImmOp(context, ShrImmFlags.VectorZx);
+ }
+ }
+
+ public static void Usra_S(ILEmitterCtx context)
+ {
+ EmitScalarShrImmOpZx(context, ShrImmFlags.Accumulate);
+ }
+
+ public static void Usra_V(ILEmitterCtx context)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ if (Optimizations.UseSse2 && op.Size > 0)
+ {
+ Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+ Type[] typesAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };
+
+ EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
+ EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+
+ context.EmitLdc_I4(GetImmShr(op));
+
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl));
+ context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+ EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+ if (op.RegisterSize == RegisterSize.Simd64)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+ else
+ {
+ EmitVectorShrImmOpZx(context, ShrImmFlags.Accumulate);
+ }
+ }
+
+ private static void EmitVectorShl(ILEmitterCtx context, bool signed)
+ {
+ //This instruction shifts each lane of vector A by the amount given in the signed,
+ //lower 8 bits of the corresponding lane of vector B. If the shift value is greater
+ //than or equal to the lane size, the result is zero.
+ //Additionally, negative shifts produce right shifts by the negated shift value.
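+ //For example, with 8-bit lanes a shift value of 3 yields lane << 3, a value of -3
+ //yields lane >> 3 (arithmetic when signed, logical when unsigned), and any shift
+ //magnitude of 8 or more takes the zeroing path in the IL emitted below.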
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int maxShift = 8 << op.Size;
+
+ Action emit = () =>
+ {
+ ILLabel lblShl = new ILLabel();
+ ILLabel lblZero = new ILLabel();
+ ILLabel lblEnd = new ILLabel();
+
+ void EmitShift(OpCode ilOp)
+ {
+ context.Emit(OpCodes.Dup);
+
+ context.EmitLdc_I4(maxShift);
+
+ context.Emit(OpCodes.Bge_S, lblZero);
+ context.Emit(ilOp);
+ context.Emit(OpCodes.Br_S, lblEnd);
+ }
+
+ context.Emit(OpCodes.Conv_I1);
+ context.Emit(OpCodes.Dup);
+
+ context.EmitLdc_I4(0);
+
+ context.Emit(OpCodes.Bge_S, lblShl);
+ context.Emit(OpCodes.Neg);
+
+ EmitShift(signed
+ ? OpCodes.Shr
+ : OpCodes.Shr_Un);
+
+ context.MarkLabel(lblShl);
+
+ EmitShift(OpCodes.Shl);
+
+ context.MarkLabel(lblZero);
+
+ context.Emit(OpCodes.Pop);
+ context.Emit(OpCodes.Pop);
+
+ context.EmitLdc_I8(0);
+
+ context.MarkLabel(lblEnd);
+ };
+
+ if (signed)
+ {
+ EmitVectorBinaryOpSx(context, emit);
+ }
+ else
+ {
+ EmitVectorBinaryOpZx(context, emit);
+ }
+ }
+
+ [Flags]
+ private enum ShrImmFlags
+ {
+ Scalar = 1 << 0,
+ Signed = 1 << 1,
+
+ Round = 1 << 2,
+ Accumulate = 1 << 3,
+
+ ScalarSx = Scalar | Signed,
+ ScalarZx = Scalar,
+
+ VectorSx = Signed,
+ VectorZx = 0
+ }
+
+ private static void EmitScalarShrImmOpSx(ILEmitterCtx context, ShrImmFlags flags)
+ {
+ EmitShrImmOp(context, ShrImmFlags.ScalarSx | flags);
+ }
+
+ private static void EmitScalarShrImmOpZx(ILEmitterCtx context, ShrImmFlags flags)
+ {
+ EmitShrImmOp(context, ShrImmFlags.ScalarZx | flags);
+ }
+
+ private static void EmitVectorShrImmOpSx(ILEmitterCtx context, ShrImmFlags flags)
+ {
+ EmitShrImmOp(context, ShrImmFlags.VectorSx | flags);
+ }
+
+ private static void EmitVectorShrImmOpZx(ILEmitterCtx context, ShrImmFlags flags)
+ {
+ EmitShrImmOp(context, ShrImmFlags.VectorZx | flags);
+ }
+
+ private static void EmitShrImmOp(ILEmitterCtx context, ShrImmFlags flags)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ bool scalar = (flags & ShrImmFlags.Scalar) != 0;
+ bool signed = (flags & ShrImmFlags.Signed) != 0;
+ bool round = (flags & ShrImmFlags.Round) != 0;
+ bool accumulate = (flags & ShrImmFlags.Accumulate) != 0;
+
+ int shift = GetImmShr(op);
+
+ long roundConst = 1L << (shift - 1);
+
+ int bytes = op.GetBitsCount() >> 3;
+ int elems = !scalar ? bytes >> op.Size : 1;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, index, op.Size, signed);
+
+ if (op.Size <= 2)
+ {
+ if (round)
+ {
+ context.EmitLdc_I8(roundConst);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ context.EmitLdc_I4(shift);
+
+ context.Emit(signed ? OpCodes.Shr : OpCodes.Shr_Un);
+ }
+ else /* if (op.Size == 3) */
+ {
+ EmitShrImm_64(context, signed, round ? roundConst : 0L, shift);
+ }
+
+ if (accumulate)
+ {
+ EmitVectorExtract(context, op.Rd, index, op.Size, signed);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ EmitVectorInsertTmp(context, index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ private static void EmitVectorShrImmNarrowOpZx(ILEmitterCtx context, bool round)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ int shift = GetImmShr(op);
+
+ long roundConst = 1L << (shift - 1);
+
+ int elems = 8 >> op.Size;
+
+ int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+ if (part != 0)
+ {
+ context.EmitLdvec(op.Rd);
+ context.EmitStvectmp();
+ }
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtractZx(context, op.Rn, index, op.Size + 1);
+
+ if (round)
+ {
+ context.EmitLdc_I8(roundConst);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ context.EmitLdc_I4(shift);
+
+ context.Emit(OpCodes.Shr_Un);
+
+ EmitVectorInsertTmp(context, part + index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (part == 0)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ [Flags]
+ private enum ShrImmSaturatingNarrowFlags
+ {
+ Scalar = 1 << 0,
+ SignedSrc = 1 << 1,
+ SignedDst = 1 << 2,
+
+ Round = 1 << 3,
+
+ ScalarSxSx = Scalar | SignedSrc | SignedDst,
+ ScalarSxZx = Scalar | SignedSrc,
+ ScalarZxZx = Scalar,
+
+ VectorSxSx = SignedSrc | SignedDst,
+ VectorSxZx = SignedSrc,
+ VectorZxZx = 0
+ }
+
+ private static void EmitRoundShrImmSaturatingNarrowOp(ILEmitterCtx context, ShrImmSaturatingNarrowFlags flags)
+ {
+ EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.Round | flags);
+ }
+
+ private static void EmitShrImmSaturatingNarrowOp(ILEmitterCtx context, ShrImmSaturatingNarrowFlags flags)
+ {
+ OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+ bool scalar = (flags & ShrImmSaturatingNarrowFlags.Scalar) != 0;
+ bool signedSrc = (flags & ShrImmSaturatingNarrowFlags.SignedSrc) != 0;
+ bool signedDst = (flags & ShrImmSaturatingNarrowFlags.SignedDst) != 0;
+ bool round = (flags & ShrImmSaturatingNarrowFlags.Round) != 0;
+
+ int shift = GetImmShr(op);
+
+ long roundConst = 1L << (shift - 1);
+
+ int elems = !scalar ? 8 >> op.Size : 1;
+
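+ //For the wide-register ("2") variants the narrowed elements go to the upper half
+ //of Rd, so the current value of Rd is loaded into the temp to preserve its lower half.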
+ int part = !scalar && (op.RegisterSize == RegisterSize.Simd128) ? elems : 0;
+
+ if (scalar)
+ {
+ EmitVectorZeroLowerTmp(context);
+ }
+
+ if (part != 0)
+ {
+ context.EmitLdvec(op.Rd);
+ context.EmitStvectmp();
+ }
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, index, op.Size + 1, signedSrc);
+
+ if (op.Size <= 1 || !round)
+ {
+ if (round)
+ {
+ context.EmitLdc_I8(roundConst);
+
+ context.Emit(OpCodes.Add);
+ }
+
+ context.EmitLdc_I4(shift);
+
+ context.Emit(signedSrc ? OpCodes.Shr : OpCodes.Shr_Un);
+ }
+ else /* if (op.Size == 2 && round) */
+ {
+ EmitShrImm_64(context, signedSrc, roundConst, shift); // Shift <= 32
+ }
+
+ EmitSatQ(context, op.Size, signedSrc, signedDst);
+
+ EmitVectorInsertTmp(context, part + index, op.Size);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+
+ if (part == 0)
+ {
+ EmitVectorZeroUpper(context, op.Rd);
+ }
+ }
+
+ // Dst_64 = (Int(Src_64, Signed) + RoundConst) >> Shift;
+ private static void EmitShrImm_64(
+ ILEmitterCtx context,
+ bool signed,
+ long roundConst,
+ int shift)
+ {
+ context.EmitLdc_I8(roundConst);
+ context.EmitLdc_I4(shift);
+
+ SoftFallback.EmitCall(context, signed
+ ? nameof(SoftFallback.SignedShrImm_64)
+ : nameof(SoftFallback.UnsignedShrImm_64));
+ }
+
+ private static void EmitVectorShImmWidenBinarySx(ILEmitterCtx context, Action emit, int imm)
+ {
+ EmitVectorShImmWidenBinaryOp(context, emit, imm, true);
+ }
+
+ private static void EmitVectorShImmWidenBinaryZx(ILEmitterCtx context, Action emit, int imm)
+ {
+ EmitVectorShImmWidenBinaryOp(context, emit, imm, false);
+ }
+
+ private static void EmitVectorShImmWidenBinaryOp(ILEmitterCtx context, Action emit, int imm, bool signed)
+ {
+ OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+ int elems = 8 >> op.Size;
+
+ int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+ for (int index = 0; index < elems; index++)
+ {
+ EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);
+
+ context.EmitLdc_I4(imm);
+
+ emit();
+
+ EmitVectorInsertTmp(context, index, op.Size + 1);
+ }
+
+ context.EmitLdvectmp();
+ context.EmitStvec(op.Rd);
+ }
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitSystem.cs b/ChocolArm64/Instructions/InstEmitSystem.cs
new file mode 100644
index 00000000..0e61d5bd
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitSystem.cs
@@ -0,0 +1,138 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Reflection;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Instructions
+{
+ static partial class InstEmit
+ {
+ public static void Hint(ILEmitterCtx context)
+ {
+ //Execute as a no-op.
+ }
+
+ public static void Isb(ILEmitterCtx context)
+ {
+ //Execute as a no-op.
+ }
+
+ public static void Mrs(ILEmitterCtx context)
+ {
+ OpCodeSystem64 op = (OpCodeSystem64)context.CurrOp;
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+ string propName;
+
+ switch (GetPackedId(op))
+ {
+ case 0b11_011_0000_0000_001: propName = nameof(CpuThreadState.CtrEl0); break;
+ case 0b11_011_0000_0000_111: propName = nameof(CpuThreadState.DczidEl0); break;
+ case 0b11_011_0100_0100_000: propName = nameof(CpuThreadState.Fpcr); break;
+ case 0b11_011_0100_0100_001: propName = nameof(CpuThreadState.Fpsr); break;
+ case 0b11_011_1101_0000_010: propName = nameof(CpuThreadState.TpidrEl0); break;
+ case 0b11_011_1101_0000_011: propName = nameof(CpuThreadState.Tpidr); break;
+ case 0b11_011_1110_0000_000: propName = nameof(CpuThreadState.CntfrqEl0); break;
+ case 0b11_011_1110_0000_001: propName = nameof(CpuThreadState.CntpctEl0); break;
+
+ default: throw new NotImplementedException($"Unknown MRS at {op.Position:x16}");
+ }
+
+ context.EmitCallPropGet(typeof(CpuThreadState), propName);
+
+ PropertyInfo propInfo = typeof(CpuThreadState).GetProperty(propName);
+
+ if (propInfo.PropertyType != typeof(long) &&
+ propInfo.PropertyType != typeof(ulong))
+ {
+ context.Emit(OpCodes.Conv_U8);
+ }
+
+ context.EmitStintzr(op.Rt);
+ }
+
+ public static void Msr(ILEmitterCtx context)
+ {
+ OpCodeSystem64 op = (OpCodeSystem64)context.CurrOp;
+
+ context.EmitLdarg(TranslatedSub.StateArgIdx);
+ context.EmitLdintzr(op.Rt);
+
+ string propName;
+
+ switch (GetPackedId(op))
+ {
+ case 0b11_011_0100_0100_000: propName = nameof(CpuThreadState.Fpcr); break;
+ case 0b11_011_0100_0100_001: propName = nameof(CpuThreadState.Fpsr); break;
+ case 0b11_011_1101_0000_010: propName = nameof(CpuThreadState.TpidrEl0); break;
+
+ default: throw new NotImplementedException($"Unknown MSR at {op.Position:x16}");
+ }
+
+ PropertyInfo propInfo = typeof(CpuThreadState).GetProperty(propName);
+
+ if (propInfo.PropertyType != typeof(long) &&
+ propInfo.PropertyType != typeof(ulong))
+ {
+ context.Emit(OpCodes.Conv_U4);
+ }
+
+ context.EmitCallPropSet(typeof(CpuThreadState), propName);
+ }
+
+ public static void Nop(ILEmitterCtx context)
+ {
+ //Do nothing.
+ }
+
+ public static void Sys(ILEmitterCtx context)
+ {
+ //This instruction performs system operations such as cache maintenance and
+ //address translation.
+ //Most encodings are treated as no-ops here since no caches are emulated;
+ //DC ZVA is the exception, as it has an architecturally visible effect.
+ OpCodeSystem64 op = (OpCodeSystem64)context.CurrOp;
+
+ switch (GetPackedId(op))
+ {
+ case 0b11_011_0111_0100_001:
+ {
+ //DC ZVA
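+ //Zeroes an entire data cache zero block (4 << DczSizeLog2 bytes),
+ //eight bytes at a time.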
+ for (int offs = 0; offs < (4 << CpuThreadState.DczSizeLog2); offs += 8)
+ {
+ context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+ context.EmitLdintzr(op.Rt);
+ context.EmitLdc_I(offs);
+
+ context.Emit(OpCodes.Add);
+
+ context.EmitLdc_I8(0);
+
+ InstEmitMemoryHelper.EmitWriteCall(context, 3);
+ }
+
+ break;
+ }
+
+ //No-op
+ case 0b11_011_0111_1110_001: //DC CIVAC
+ break;
+ }
+ }
+
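+ //Packs the system register operands into a single value laid out as
+ //op0[15:14] op1[13:11] CRn[10:7] CRm[6:3] op2[2:0], matching the
+ //0bAA_BBB_CCCC_DDDD_EEE grouping of the case labels above.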
+ private static int GetPackedId(OpCodeSystem64 op)
+ {
+ int id;
+
+ id = op.Op2 << 0;
+ id |= op.CRm << 3;
+ id |= op.CRn << 7;
+ id |= op.Op1 << 11;
+ id |= op.Op0 << 14;
+
+ return id;
+ }
+ }
+}
diff --git a/ChocolArm64/Instructions/InstEmitter.cs b/ChocolArm64/Instructions/InstEmitter.cs
new file mode 100644
index 00000000..db6e8604
--- /dev/null
+++ b/ChocolArm64/Instructions/InstEmitter.cs
@@ -0,0 +1,6 @@
+using ChocolArm64.Translation;
+
+namespace ChocolArm64.Instructions
+{
+ delegate void InstEmitter(ILEmitterCtx context);
+}
\ No newline at end of file
diff --git a/ChocolArm64/Instructions/InstInterpreter.cs b/ChocolArm64/Instructions/InstInterpreter.cs
new file mode 100644
index 00000000..e6354fd5
--- /dev/null
+++ b/ChocolArm64/Instructions/InstInterpreter.cs
@@ -0,0 +1,8 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Memory;
+using ChocolArm64.State;
+
+namespace ChocolArm64.Instructions
+{
+ delegate void InstInterpreter(CpuThreadState state, MemoryManager memory, OpCode64 opCode);
+}
\ No newline at end of file
diff --git a/ChocolArm64/Instructions/SoftFallback.cs b/ChocolArm64/Instructions/SoftFallback.cs
new file mode 100644
index 00000000..a31aa34c
--- /dev/null
+++ b/ChocolArm64/Instructions/SoftFallback.cs
@@ -0,0 +1,922 @@
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace ChocolArm64.Instructions
+{
+ using static VectorHelper;
+
+ static class SoftFallback
+ {
+ public static void EmitCall(ILEmitterCtx context, string mthdName)
+ {
+ context.EmitCall(typeof(SoftFallback), mthdName);
+ }
+
+#region "ShrImm_64"
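+ // Rounded right shifts on 64-bit elements: the rounding addition cannot be widened,
+ // so the carry/overflow of (value + roundConst) is detected and compensated for
+ // explicitly instead.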
+ public static long SignedShrImm_64(long value, long roundConst, int shift)
+ {
+ if (roundConst == 0L)
+ {
+ if (shift <= 63)
+ {
+ return value >> shift;
+ }
+ else /* if (shift == 64) */
+ {
+ if (value < 0L)
+ {
+ return -1L;
+ }
+ else
+ {
+ return 0L;
+ }
+ }
+ }
+ else /* if (roundConst == 1L << (shift - 1)) */
+ {
+ if (shift <= 63)
+ {
+ long add = value + roundConst;
+
+ if ((~value & (value ^ add)) < 0L)
+ {
+ return (long)((ulong)add >> shift);
+ }
+ else
+ {
+ return add >> shift;
+ }
+ }
+ else /* if (shift == 64) */
+ {
+ return 0L;
+ }
+ }
+ }
+
+ public static ulong UnsignedShrImm_64(ulong value, long roundConst, int shift)
+ {
+ if (roundConst == 0L)
+ {
+ if (shift <= 63)
+ {
+ return value >> shift;
+ }
+ else /* if (shift == 64) */
+ {
+ return 0UL;
+ }
+ }
+ else /* if (roundConst == 1L << (shift - 1)) */
+ {
+ ulong add = value + (ulong)roundConst;
+
+ if ((add < value) && (add < (ulong)roundConst))
+ {
+ if (shift <= 63)
+ {
+ return (add >> shift) | (0x8000000000000000UL >> (shift - 1));
+ }
+ else /* if (shift == 64) */
+ {
+ return 1UL;
+ }
+ }
+ else
+ {
+ if (shift <= 63)
+ {
+ return add >> shift;
+ }
+ else /* if (shift == 64) */
+ {
+ return 0UL;
+ }
+ }
+ }
+ }
+#endregion
+
+#region "Saturating"
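+ // Clamps a widened result to the destination element range and sets FPSR.QC when it
+ // saturates; e.g. for size 0 (bytes) the signed range is [-128, 127] and the unsigned
+ // range is [0, 255].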
+ public static long SignedSrcSignedDstSatQ(long op, int size, CpuThreadState state)
+ {
+ int eSize = 8 << size;
+
+ long tMaxValue = (1L << (eSize - 1)) - 1L;
+ long tMinValue = -(1L << (eSize - 1));
+
+ if (op > tMaxValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return tMaxValue;
+ }
+ else if (op < tMinValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return tMinValue;
+ }
+ else
+ {
+ return op;
+ }
+ }
+
+ public static ulong SignedSrcUnsignedDstSatQ(long op, int size, CpuThreadState state)
+ {
+ int eSize = 8 << size;
+
+ ulong tMaxValue = (1UL << eSize) - 1UL;
+ ulong tMinValue = 0UL;
+
+ if (op > (long)tMaxValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return tMaxValue;
+ }
+ else if (op < (long)tMinValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return tMinValue;
+ }
+ else
+ {
+ return (ulong)op;
+ }
+ }
+
+ public static long UnsignedSrcSignedDstSatQ(ulong op, int size, CpuThreadState state)
+ {
+ int eSize = 8 << size;
+
+ long tMaxValue = (1L << (eSize - 1)) - 1L;
+
+ if (op > (ulong)tMaxValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return tMaxValue;
+ }
+ else
+ {
+ return (long)op;
+ }
+ }
+
+ public static ulong UnsignedSrcUnsignedDstSatQ(ulong op, int size, CpuThreadState state)
+ {
+ int eSize = 8 << size;
+
+ ulong tMaxValue = (1UL << eSize) - 1UL;
+
+ if (op > tMaxValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return tMaxValue;
+ }
+ else
+ {
+ return op;
+ }
+ }
+
+ public static long UnarySignedSatQAbsOrNeg(long op, CpuThreadState state)
+ {
+ if (op == long.MinValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return long.MaxValue;
+ }
+ else
+ {
+ return op;
+ }
+ }
+
+ public static long BinarySignedSatQAdd(long op1, long op2, CpuThreadState state)
+ {
+ long add = op1 + op2;
+
+ if ((~(op1 ^ op2) & (op1 ^ add)) < 0L)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ if (op1 < 0L)
+ {
+ return long.MinValue;
+ }
+ else
+ {
+ return long.MaxValue;
+ }
+ }
+ else
+ {
+ return add;
+ }
+ }
+
+ public static ulong BinaryUnsignedSatQAdd(ulong op1, ulong op2, CpuThreadState state)
+ {
+ ulong add = op1 + op2;
+
+ if ((add < op1) && (add < op2))
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return ulong.MaxValue;
+ }
+ else
+ {
+ return add;
+ }
+ }
+
+ public static long BinarySignedSatQSub(long op1, long op2, CpuThreadState state)
+ {
+ long sub = op1 - op2;
+
+ if (((op1 ^ op2) & (op1 ^ sub)) < 0L)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ if (op1 < 0L)
+ {
+ return long.MinValue;
+ }
+ else
+ {
+ return long.MaxValue;
+ }
+ }
+ else
+ {
+ return sub;
+ }
+ }
+
+ public static ulong BinaryUnsignedSatQSub(ulong op1, ulong op2, CpuThreadState state)
+ {
+ ulong sub = op1 - op2;
+
+ if (op1 < op2)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return ulong.MinValue;
+ }
+ else
+ {
+ return sub;
+ }
+ }
+
+ public static long BinarySignedSatQAcc(ulong op1, long op2, CpuThreadState state)
+ {
+ if (op1 <= (ulong)long.MaxValue)
+ {
+ // Op1 from ulong.MinValue to (ulong)long.MaxValue
+ // Op2 from long.MinValue to long.MaxValue
+
+ long add = (long)op1 + op2;
+
+ if ((~op2 & add) < 0L)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return long.MaxValue;
+ }
+ else
+ {
+ return add;
+ }
+ }
+ else if (op2 >= 0L)
+ {
+ // Op1 from (ulong)long.MaxValue + 1UL to ulong.MaxValue
+ // Op2 from (long)ulong.MinValue to long.MaxValue
+
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return long.MaxValue;
+ }
+ else
+ {
+ // Op1 from (ulong)long.MaxValue + 1UL to ulong.MaxValue
+ // Op2 from long.MinValue to (long)ulong.MinValue - 1L
+
+ ulong add = op1 + (ulong)op2;
+
+ if (add > (ulong)long.MaxValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return long.MaxValue;
+ }
+ else
+ {
+ return (long)add;
+ }
+ }
+ }
+
+ public static ulong BinaryUnsignedSatQAcc(long op1, ulong op2, CpuThreadState state)
+ {
+ if (op1 >= 0L)
+ {
+ // Op1 from (long)ulong.MinValue to long.MaxValue
+ // Op2 from ulong.MinValue to ulong.MaxValue
+
+ ulong add = (ulong)op1 + op2;
+
+ if ((add < (ulong)op1) && (add < op2))
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return ulong.MaxValue;
+ }
+ else
+ {
+ return add;
+ }
+ }
+ else if (op2 > (ulong)long.MaxValue)
+ {
+ // Op1 from long.MinValue to (long)ulong.MinValue - 1L
+ // Op2 from (ulong)long.MaxValue + 1UL to ulong.MaxValue
+
+ return (ulong)op1 + op2;
+ }
+ else
+ {
+ // Op1 from long.MinValue to (long)ulong.MinValue - 1L
+ // Op2 from ulong.MinValue to (ulong)long.MaxValue
+
+ long add = op1 + (long)op2;
+
+ if (add < (long)ulong.MinValue)
+ {
+ state.SetFpsrFlag(Fpsr.Qc);
+
+ return ulong.MinValue;
+ }
+ else
+ {
+ return (ulong)add;
+ }
+ }
+ }
+#endregion
+
+#region "Count"
+ public static ulong CountLeadingSigns(ulong value, int size) // Size is 8, 16, 32 or 64 (SIMD&FP or Base Inst.).
+ {
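+ // value ^ (value >> 1) sets a bit exactly where two adjacent bits of the original
+ // differ, so the first set bit below the sign bit marks the end of the run of
+ // sign-bit copies being counted.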
+ value ^= value >> 1;
+
+ int highBit = size - 2;
+
+ for (int bit = highBit; bit >= 0; bit--)
+ {
+ if (((value >> bit) & 0b1) != 0)
+ {
+ return (ulong)(highBit - bit);
+ }
+ }
+
+ return (ulong)(size - 1);
+ }
+
+ private static readonly byte[] ClzNibbleTbl = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ public static ulong CountLeadingZeros(ulong value, int size) // Size is 8, 16, 32 or 64 (SIMD&FP or Base Inst.).
+ {
+ if (value == 0ul)
+ {
+ return (ulong)size;
+ }
+
+ int nibbleIdx = size;
+ int preCount, count = 0;
+
+ do
+ {
+ nibbleIdx -= 4;
+ preCount = ClzNibbleTbl[(value >> nibbleIdx) & 0b1111];
+ count += preCount;
+ }
+ while (preCount == 4);
+
+ return (ulong)count;
+ }
+
+ public static ulong CountSetBits8(ulong value) // "Size" is 8 (SIMD&FP Inst.).
+ {
+ if (value == 0xfful)
+ {
+ return 8ul;
+ }
+
+ value = ((value >> 1) & 0x55ul) + (value & 0x55ul);
+ value = ((value >> 2) & 0x33ul) + (value & 0x33ul);
+
+ return (value >> 4) + (value & 0x0ful);
+ }
+#endregion
+
+#region "Crc32"
+ private const uint Crc32RevPoly = 0xedb88320;
+ private const uint Crc32CRevPoly = 0x82f63b78;
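+ // Bit-reversed forms of the CRC-32 polynomial 0x04c11db7 and the CRC-32C (Castagnoli)
+ // polynomial 0x1edc6f41, used by the LSB-first, branchless update in Crc32() below.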
+
+ public static uint Crc32B(uint crc, byte val) => Crc32 (crc, Crc32RevPoly, val);
+ public static uint Crc32H(uint crc, ushort val) => Crc32H(crc, Crc32RevPoly, val);
+ public static uint Crc32W(uint crc, uint val) => Crc32W(crc, Crc32RevPoly, val);
+ public static uint Crc32X(uint crc, ulong val) => Crc32X(crc, Crc32RevPoly, val);
+
+ public static uint Crc32Cb(uint crc, byte val) => Crc32 (crc, Crc32CRevPoly, val);
+ public static uint Crc32Ch(uint crc, ushort val) => Crc32H(crc, Crc32CRevPoly, val);
+ public static uint Crc32Cw(uint crc, uint val) => Crc32W(crc, Crc32CRevPoly, val);
+ public static uint Crc32Cx(uint crc, ulong val) => Crc32X(crc, Crc32CRevPoly, val);
+
+ private static uint Crc32H(uint crc, uint poly, ushort val)
+ {
+ crc = Crc32(crc, poly, (byte)(val >> 0));
+ crc = Crc32(crc, poly, (byte)(val >> 8));
+
+ return crc;
+ }
+
+ private static uint Crc32W(uint crc, uint poly, uint val)
+ {
+ crc = Crc32(crc, poly, (byte)(val >> 0 ));
+ crc = Crc32(crc, poly, (byte)(val >> 8 ));
+ crc = Crc32(crc, poly, (byte)(val >> 16));
+ crc = Crc32(crc, poly, (byte)(val >> 24));
+
+ return crc;
+ }
+
+ private static uint Crc32X(uint crc, uint poly, ulong val)
+ {
+ crc = Crc32(crc, poly, (byte)(val >> 0 ));
+ crc = Crc32(crc, poly, (byte)(val >> 8 ));
+ crc = Crc32(crc, poly, (byte)(val >> 16));
+ crc = Crc32(crc, poly, (byte)(val >> 24));
+ crc = Crc32(crc, poly, (byte)(val >> 32));
+ crc = Crc32(crc, poly, (byte)(val >> 40));
+ crc = Crc32(crc, poly, (byte)(val >> 48));
+ crc = Crc32(crc, poly, (byte)(val >> 56));
+
+ return crc;
+ }
+
+ private static uint Crc32(uint crc, uint poly, byte val)
+ {
+ crc ^= val;
+
+ for (int bit = 7; bit >= 0; bit--)
+ {
+ uint mask = (uint)(-(int)(crc & 1));
+
+ crc = (crc >> 1) ^ (poly & mask);
+ }
+
+ return crc;
+ }
+#endregion
+
+#region "Aes"
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> Decrypt(Vector128<float> value, Vector128<float> roundKey)
+ {
+ if (!Sse.IsSupported)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
+ return CryptoHelper.AesInvSubBytes(CryptoHelper.AesInvShiftRows(Sse.Xor(value, roundKey)));
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> Encrypt(Vector128<float> value, Vector128<float> roundKey)
+ {
+ if (!Sse.IsSupported)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
+ return CryptoHelper.AesSubBytes(CryptoHelper.AesShiftRows(Sse.Xor(value, roundKey)));
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> InverseMixColumns(Vector128<float> value)
+ {
+ return CryptoHelper.AesInvMixColumns(value);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> MixColumns(Vector128<float> value)
+ {
+ return CryptoHelper.AesMixColumns(value);
+ }
+#endregion
+
+#region "Sha1"
+ public static Vector128<float> HashChoose(Vector128<float> hashAbcd, uint hashE, Vector128<float> wk)
+ {
+ for (int e = 0; e <= 3; e++)
+ {
+ uint t = ShaChoose((uint)VectorExtractIntZx(hashAbcd, (byte)1, 2),
+ (uint)VectorExtractIntZx(hashAbcd, (byte)2, 2),
+ (uint)VectorExtractIntZx(hashAbcd, (byte)3, 2));
+
+ hashE += Rol((uint)VectorExtractIntZx(hashAbcd, (byte)0, 2), 5) + t;
+ hashE += (uint)VectorExtractIntZx(wk, (byte)e, 2);
+
+ t = Rol((uint)VectorExtractIntZx(hashAbcd, (byte)1, 2), 30);
+ hashAbcd = VectorInsertInt((ulong)t, hashAbcd, (byte)1, 2);
+
+ Rol32_160(ref hashE, ref hashAbcd);
+ }
+
+ return hashAbcd;
+ }
+
+ public static uint FixedRotate(uint hashE)
+ {
+ return hashE.Rol(30);
+ }
+
+ public static Vector128<float> HashMajority(Vector128<float> hashAbcd, uint hashE, Vector128<float> wk)
+ {
+ for (int e = 0; e <= 3; e++)
+ {
+ uint t = ShaMajority((uint)VectorExtractIntZx(hashAbcd, (byte)1, 2),
+ (uint)VectorExtractIntZx(hashAbcd, (byte)2, 2),
+ (uint)VectorExtractIntZx(hashAbcd, (byte)3, 2));
+
+ hashE += Rol((uint)VectorExtractIntZx(hashAbcd, (byte)0, 2), 5) + t;
+ hashE += (uint)VectorExtractIntZx(wk, (byte)e, 2);
+
+ t = Rol((uint)VectorExtractIntZx(hashAbcd, (byte)1, 2), 30);
+ hashAbcd = VectorInsertInt((ulong)t, hashAbcd, (byte)1, 2);
+
+ Rol32_160(ref hashE, ref hashAbcd);
+ }
+
+ return hashAbcd;
+ }
+
+ public static Vector128<float> HashParity(Vector128<float> hashAbcd, uint hashE, Vector128<float> wk)
+ {
+ for (int e = 0; e <= 3; e++)
+ {
+ uint t = ShaParity((uint)VectorExtractIntZx(hashAbcd, (byte)1, 2),
+ (uint)VectorExtractIntZx(hashAbcd, (byte)2, 2),
+ (uint)VectorExtractIntZx(hashAbcd, (byte)3, 2));
+
+ hashE += Rol((uint)VectorExtractIntZx(hashAbcd, (byte)0, 2), 5) + t;
+ hashE += (uint)VectorExtractIntZx(wk, (byte)e, 2);
+
+ t = Rol((uint)VectorExtractIntZx(hashAbcd, (byte)1, 2), 30);
+ hashAbcd = VectorInsertInt((ulong)t, hashAbcd, (byte)1, 2);
+
+ Rol32_160(ref hashE, ref hashAbcd);
+ }
+
+ return hashAbcd;
+ }
+
+ public static Vector128<float> Sha1SchedulePart1(Vector128<float> w03, Vector128<float> w47, Vector128<float> w811)
+ {
+ if (!Sse.IsSupported)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
+ Vector128<float> result = new Vector128<float>();
+
+ ulong t2 = VectorExtractIntZx(w47, (byte)0, 3);
+ ulong t1 = VectorExtractIntZx(w03, (byte)1, 3);
+
+ result = VectorInsertInt((ulong)t1, result, (byte)0, 3);
+ result = VectorInsertInt((ulong)t2, result, (byte)1, 3);
+
+ return Sse.Xor(result, Sse.Xor(w03, w811));
+ }
+
+ public static Vector128<float> Sha1SchedulePart2(Vector128<float> tw03, Vector128<float> w1215)
+ {
+ if (!Sse2.IsSupported)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
+ Vector128<float> result = new Vector128<float>();
+
+ Vector128<float> t = Sse.Xor(tw03, Sse.StaticCast<uint, float>(
+ Sse2.ShiftRightLogical128BitLane(Sse.StaticCast<float, uint>(w1215), (byte)4)));
+
+ uint tE0 = (uint)VectorExtractIntZx(t, (byte)0, 2);
+ uint tE1 = (uint)VectorExtractIntZx(t, (byte)1, 2);
+ uint tE2 = (uint)VectorExtractIntZx(t, (byte)2, 2);
+ uint tE3 = (uint)VectorExtractIntZx(t, (byte)3, 2);
+
+ result = VectorInsertInt((ulong)tE0.Rol(1), result, (byte)0, 2);
+ result = VectorInsertInt((ulong)tE1.Rol(1), result, (byte)1, 2);
+ result = VectorInsertInt((ulong)tE2.Rol(1), result, (byte)2, 2);
+
+ return VectorInsertInt((ulong)(tE3.Rol(1) ^ tE0.Rol(2)), result, (byte)3, 2);
+ }
+
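+ // Rotates the packed 160-bit SHA-1 state by one 32-bit word: ABCD becomes { E, A, B, C }
+ // and E receives the old D.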
+ private static void Rol32_160(ref uint y, ref Vector128<float> x)
+ {
+ if (!Sse2.IsSupported)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
+ uint xE3 = (uint)VectorExtractIntZx(x, (byte)3, 2);
+
+ x = Sse.StaticCast<uint, float>(Sse2.ShiftLeftLogical128BitLane(Sse.StaticCast<float, uint>(x), (byte)4));
+ x = VectorInsertInt((ulong)y, x, (byte)0, 2);
+
+ y = xE3;
+ }
+
+ private static uint ShaChoose(uint x, uint y, uint z)
+ {
+ return ((y ^ z) & x) ^ z;
+ }
+
+ private static uint ShaMajority(uint x, uint y, uint z)
+ {
+ return (x & y) | ((x | y) & z);
+ }
+
+ private static uint ShaParity(uint x, uint y, uint z)
+ {
+ return x ^ y ^ z;
+ }
+
+ private static uint Rol(this uint value, int count)
+ {
+ return (value << count) | (value >> (32 - count));
+ }
+#endregion
+
+#region "Sha256"
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> HashLower(Vector128<float> hashAbcd, Vector128<float> hashEfgh, Vector128<float> wk)
+ {
+ return Sha256Hash(hashAbcd, hashEfgh, wk, true);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> HashUpper(Vector128<float> hashEfgh, Vector128<float> hashAbcd, Vector128<float> wk)
+ {
+ return Sha256Hash(hashAbcd, hashEfgh, wk, false);
+ }
+
+ public static Vector128<float> Sha256SchedulePart1(Vector128<float> w03, Vector128<float> w47)
+ {
+ Vector128<float> result = new Vector128<float>();
+
+ for (int e = 0; e <= 3; e++)
+ {
+ uint elt = (uint)VectorExtractIntZx(e <= 2 ? w03 : w47, (byte)(e <= 2 ? e + 1 : 0), 2);
+
+ elt = elt.Ror(7) ^ elt.Ror(18) ^ elt.Lsr(3);
+
+ elt += (uint)VectorExtractIntZx(w03, (byte)e, 2);
+
+ result = VectorInsertInt((ulong)elt, result, (byte)e, 2);
+ }
+
+ return result;
+ }
+
+ public static Vector128<float> Sha256SchedulePart2(Vector128<float> w03, Vector128<float> w811, Vector128<float> w1215)
+ {
+ Vector128<float> result = new Vector128<float>();
+
+ ulong t1 = VectorExtractIntZx(w1215, (byte)1, 3);
+
+ for (int e = 0; e <= 1; e++)
+ {
+ uint elt = t1.ULongPart(e);
+
+ elt = elt.Ror(17) ^ elt.Ror(19) ^ elt.Lsr(10);
+
+ elt += (uint)VectorExtractIntZx(w03, (byte)e, 2);
+ elt += (uint)VectorExtractIntZx(w811, (byte)(e + 1), 2);
+
+ result = VectorInsertInt((ulong)elt, result, (byte)e, 2);
+ }
+
+ t1 = VectorExtractIntZx(result, (byte)0, 3);
+
+ for (int e = 2; e <= 3; e++)
+ {
+ uint elt = t1.ULongPart(e - 2);
+
+ elt = elt.Ror(17) ^ elt.Ror(19) ^ elt.Lsr(10);
+
+ elt += (uint)VectorExtractIntZx(w03, (byte)e, 2);
+ elt += (uint)VectorExtractIntZx(e == 2 ? w811 : w1215, (byte)(e == 2 ? 3 : 0), 2);
+
+ result = VectorInsertInt((ulong)elt, result, (byte)e, 2);
+ }
+
+ return result;
+ }
+
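+ // Performs four SHA-256 rounds; x holds { a, b, c, d } and y holds { e, f, g, h }, and
+ // Rol32_256 rotates both halves one word per round so each iteration reads the current
+ // working variables from the same lane positions.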
+ private static Vector128<float> Sha256Hash(Vector128<float> x, Vector128<float> y, Vector128<float> w, bool part1)
+ {
+ for (int e = 0; e <= 3; e++)
+ {
+ uint chs = ShaChoose((uint)VectorExtractIntZx(y, (byte)0, 2),
+ (uint)VectorExtractIntZx(y, (byte)1, 2),
+ (uint)VectorExtractIntZx(y, (byte)2, 2));
+
+ uint maj = ShaMajority((uint)VectorExtractIntZx(x, (byte)0, 2),
+ (uint)VectorExtractIntZx(x, (byte)1, 2),
+ (uint)VectorExtractIntZx(x, (byte)2, 2));
+
+ uint t1 = (uint)VectorExtractIntZx(y, (byte)3, 2);
+ t1 += ShaHashSigma1((uint)VectorExtractIntZx(y, (byte)0, 2)) + chs;
+ t1 += (uint)VectorExtractIntZx(w, (byte)e, 2);
+
+ uint t2 = t1 + (uint)VectorExtractIntZx(x, (byte)3, 2);
+ x = VectorInsertInt((ulong)t2, x, (byte)3, 2);
+ t2 = t1 + ShaHashSigma0((uint)VectorExtractIntZx(x, (byte)0, 2)) + maj;
+ y = VectorInsertInt((ulong)t2, y, (byte)3, 2);
+
+ Rol32_256(ref y, ref x);
+ }
+
+ return part1 ? x : y;
+ }
+
+ private static void Rol32_256(ref Vector128<float> y, ref Vector128<float> x)
+ {
+ if (!Sse2.IsSupported)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
+ uint yE3 = (uint)VectorExtractIntZx(y, (byte)3, 2);
+ uint xE3 = (uint)VectorExtractIntZx(x, (byte)3, 2);
+
+ y = Sse.StaticCast<uint, float>(Sse2.ShiftLeftLogical128BitLane(Sse.StaticCast<float, uint>(y), (byte)4));
+ x = Sse.StaticCast<uint, float>(Sse2.ShiftLeftLogical128BitLane(Sse.StaticCast<float, uint>(x), (byte)4));
+
+ y = VectorInsertInt((ulong)xE3, y, (byte)0, 2);
+ x = VectorInsertInt((ulong)yE3, x, (byte)0, 2);
+ }
+
+ private static uint ShaHashSigma0(uint x)
+ {
+ return x.Ror(2) ^ x.Ror(13) ^ x.Ror(22);
+ }
+
+ private static uint ShaHashSigma1(uint x)
+ {
+ return x.Ror(6) ^ x.Ror(11) ^ x.Ror(25);
+ }
+
+ private static uint Ror(this uint value, int count)
+ {
+ return (value >> count) | (value << (32 - count));
+ }
+
+ private static uint Lsr(this uint value, int count)
+ {
+ return value >> count;
+ }
+
+ private static uint ULongPart(this ulong value, int part)
+ {
+ return part == 0
+ ? (uint)(value & 0xFFFFFFFFUL)
+ : (uint)(value >> 32);
+ }
+#endregion
+
+#region "Reverse"
+ public static uint ReverseBits8(uint value)
+ {
+ value = ((value & 0xaa) >> 1) | ((value & 0x55) << 1);
+ value = ((value & 0xcc) >> 2) | ((value & 0x33) << 2);
+
+ return (value >> 4) | ((value & 0x0f) << 4);
+ }
+
+ public static uint ReverseBits32(uint value)
+ {
+ value = ((value & 0xaaaaaaaa) >> 1) | ((value & 0x55555555) << 1);
+ value = ((value & 0xcccccccc) >> 2) | ((value & 0x33333333) << 2);
+ value = ((value & 0xf0f0f0f0) >> 4) | ((value & 0x0f0f0f0f) << 4);
+ value = ((value & 0xff00ff00) >> 8) | ((value & 0x00ff00ff) << 8);
+
+ return (value >> 16) | (value << 16);
+ }
+
+ public static ulong ReverseBits64(ulong value)
+ {
+ value = ((value & 0xaaaaaaaaaaaaaaaa) >> 1 ) | ((value & 0x5555555555555555) << 1 );
+ value = ((value & 0xcccccccccccccccc) >> 2 ) | ((value & 0x3333333333333333) << 2 );
+ value = ((value & 0xf0f0f0f0f0f0f0f0) >> 4 ) | ((value & 0x0f0f0f0f0f0f0f0f) << 4 );
+ value = ((value & 0xff00ff00ff00ff00) >> 8 ) | ((value & 0x00ff00ff00ff00ff) << 8 );
+ value = ((value & 0xffff0000ffff0000) >> 16) | ((value & 0x0000ffff0000ffff) << 16);
+
+ return (value >> 32) | (value << 32);
+ }
+
+ public static uint ReverseBytes16_32(uint value) => (uint)ReverseBytes16_64(value);
+ public static uint ReverseBytes32_32(uint value) => (uint)ReverseBytes32_64(value);
+
+ public static ulong ReverseBytes16_64(ulong value) => ReverseBytes(value, RevSize.Rev16);
+ public static ulong ReverseBytes32_64(ulong value) => ReverseBytes(value, RevSize.Rev32);
+ public static ulong ReverseBytes64(ulong value) => ReverseBytes(value, RevSize.Rev64);
+
+ private enum RevSize
+ {
+ Rev16,
+ Rev32,
+ Rev64
+ }
+
+ private static ulong ReverseBytes(ulong value, RevSize size)
+ {
+ value = ((value & 0xff00ff00ff00ff00) >> 8) | ((value & 0x00ff00ff00ff00ff) << 8);
+
+ if (size == RevSize.Rev16)
+ {
+ return value;
+ }
+
+ value = ((value & 0xffff0000ffff0000) >> 16) | ((value & 0x0000ffff0000ffff) << 16);
+
+ if (size == RevSize.Rev32)
+ {
+ return value;
+ }
+
+ value = ((value & 0xffffffff00000000) >> 32) | ((value & 0x00000000ffffffff) << 32);
+
+ if (size == RevSize.Rev64)
+ {
+ return value;
+ }
+
+ throw new ArgumentException("Invalid size.", nameof(size));
+ }
+#endregion
+
+#region "MultiplyHigh"
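+ // High 64 bits of a 128-bit product. The signed version reuses the unsigned one:
+ // interpreting a negative operand as unsigned adds (other operand) << 64 to the
+ // product, so that term is subtracted from the high half afterwards.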
+ public static long SMulHi128(long left, long right)
+ {
+ long result = (long)UMulHi128((ulong)left, (ulong)right);
+
+ if (left < 0)
+ {
+ result -= right;
+ }
+
+ if (right < 0)
+ {
+ result -= left;
+ }
+
+ return result;
+ }
+
+ public static ulong UMulHi128(ulong left, ulong right)
+ {
+ ulong lHigh = left >> 32;
+ ulong lLow = left & 0xFFFFFFFF;
+ ulong rHigh = right >> 32;
+ ulong rLow = right & 0xFFFFFFFF;
+
+ ulong z2 = lLow * rLow;
+ ulong t = lHigh * rLow + (z2 >> 32);
+ ulong z1 = t & 0xFFFFFFFF;
+ ulong z0 = t >> 32;
+
+ z1 += lLow * rHigh;
+
+ return lHigh * rHigh + z0 + (z1 >> 32);
+ }
+#endregion
+ }
+}
diff --git a/ChocolArm64/Instructions/SoftFloat.cs b/ChocolArm64/Instructions/SoftFloat.cs
new file mode 100644
index 00000000..79dbe954
--- /dev/null
+++ b/ChocolArm64/Instructions/SoftFloat.cs
@@ -0,0 +1,2127 @@
+using ChocolArm64.State;
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+
+namespace ChocolArm64.Instructions
+{
+ static class SoftFloat
+ {
+ static SoftFloat()
+ {
+ RecipEstimateTable = BuildRecipEstimateTable();
+ InvSqrtEstimateTable = BuildInvSqrtEstimateTable();
+ }
+
+ private static readonly byte[] RecipEstimateTable;
+ private static readonly byte[] InvSqrtEstimateTable;
+
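+ // Builds the 256-entry reciprocal estimate table: for each 9-bit mantissa prefix a
+ // rounded fixed-point reciprocal is computed and its low 8 fraction bits are stored
+ // (the leading one is implicit). This mirrors the estimate defined for FRECPE.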
+ private static byte[] BuildRecipEstimateTable()
+ {
+ byte[] table = new byte[256];
+ for (ulong index = 0; index < 256; index++)
+ {
+ ulong a = index | 0x100;
+
+ a = (a << 1) + 1;
+ ulong b = 0x80000 / a;
+ b = (b + 1) >> 1;
+
+ table[index] = (byte)(b & 0xFF);
+ }
+ return table;
+ }
+
+ private static byte[] BuildInvSqrtEstimateTable()
+ {
+ byte[] table = new byte[512];
+ for (ulong index = 128; index < 512; index++)
+ {
+ ulong a = index;
+ if (a < 256)
+ {
+ a = (a << 1) + 1;
+ }
+ else
+ {
+ a = (a | 1) << 1;
+ }
+
+ ulong b = 256;
+ while (a * (b + 1) * (b + 1) < (1ul << 28))
+ {
+ b++;
+ }
+ b = (b + 1) >> 1;
+
+ table[index] = (byte)(b & 0xFF);
+ }
+ return table;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static float RecipEstimate(float x)
+ {
+ return (float)RecipEstimate((double)x);
+ }
+
+ public static double RecipEstimate(double x)
+ {
+ ulong xBits = (ulong)BitConverter.DoubleToInt64Bits(x);
+ ulong xSign = xBits & 0x8000000000000000;
+ ulong xExp = (xBits >> 52) & 0x7FF;
+ ulong scaled = xBits & ((1ul << 52) - 1);
+
+ if (xExp >= 2045)
+ {
+ if (xExp == 0x7ff && scaled != 0)
+ {
+ // NaN
+ return BitConverter.Int64BitsToDouble((long)(xBits | 0x0008000000000000));
+ }
+
+ // Infinity, or Out of range -> Zero
+ return BitConverter.Int64BitsToDouble((long)xSign);
+ }
+
+ if (xExp == 0)
+ {
+ if (scaled == 0)
+ {
+ // Zero -> Infinity
+ return BitConverter.Int64BitsToDouble((long)(xSign | 0x7FF0000000000000));
+ }
+
+ // Denormal
+ if ((scaled & (1ul << 51)) == 0)
+ {
+ xExp = ~0ul;
+ scaled <<= 2;
+ }
+ else
+ {
+ scaled <<= 1;
+ }
+ }
+
+ scaled >>= 44;
+ scaled &= 0xFF;
+
+ ulong resultExp = (2045 - xExp) & 0x7FF;
+ ulong estimate = (ulong)RecipEstimateTable[scaled];
+ ulong fraction = estimate << 44;
+
+ if (resultExp == 0)
+ {
+ fraction >>= 1;
+ fraction |= 1ul << 51;
+ }
+ else if (resultExp == 0x7FF)
+ {
+ resultExp = 0;
+ fraction >>= 2;
+ fraction |= 1ul << 50;
+ }
+
+ ulong result = xSign | (resultExp << 52) | fraction;
+ return BitConverter.Int64BitsToDouble((long)result);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static float InvSqrtEstimate(float x)
+ {
+ return (float)InvSqrtEstimate((double)x);
+ }
+
+ public static double InvSqrtEstimate(double x)
+ {
+ ulong xBits = (ulong)BitConverter.DoubleToInt64Bits(x);
+ ulong xSign = xBits & 0x8000000000000000;
+ long xExp = (long)((xBits >> 52) & 0x7FF);
+ ulong scaled = xBits & ((1ul << 52) - 1);
+
+ if (xExp == 0x7FF && scaled != 0)
+ {
+ // NaN
+ return BitConverter.Int64BitsToDouble((long)(xBits | 0x0008000000000000));
+ }
+
+ if (xExp == 0)
+ {
+ if (scaled == 0)
+ {
+ // Zero -> Infinity
+ return BitConverter.Int64BitsToDouble((long)(xSign | 0x7FF0000000000000));
+ }
+
+ // Denormal
+ while ((scaled & (1ul << 51)) == 0)
+ {
+ scaled <<= 1;
+ xExp--;
+ }
+ scaled <<= 1;
+ }
+
+ if (xSign != 0)
+ {
+ // Negative -> NaN
+ return BitConverter.Int64BitsToDouble((long)0x7FF8000000000000);
+ }
+
+ if (xExp == 0x7ff && scaled == 0)
+ {
+ // Infinity -> Zero
+ return BitConverter.Int64BitsToDouble((long)xSign);
+ }
+
+ if (((ulong)xExp & 1) == 1)
+ {
+ scaled >>= 45;
+ scaled &= 0xFF;
+ scaled |= 0x80;
+ }
+ else
+ {
+ scaled >>= 44;
+ scaled &= 0xFF;
+ scaled |= 0x100;
+ }
+
+ ulong resultExp = ((ulong)(3068 - xExp) / 2) & 0x7FF;
+ ulong estimate = (ulong)InvSqrtEstimateTable[scaled];
+ ulong fraction = estimate << 44;
+
+ ulong result = xSign | (resultExp << 52) | fraction;
+ return BitConverter.Int64BitsToDouble((long)result);
+ }
+ }
+
+ static class SoftFloat1632
+ {
+ public static float FPConvert(ushort valueBits, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat1632.FPConvert: state.Fpcr = 0x{state.Fpcr:X8}");
+
+ double real = valueBits.FPUnpackCv(out FpType type, out bool sign, state);
+
+ float result;
+
+ if (type == FpType.SNaN || type == FpType.QNaN)
+ {
+ if (state.GetFpcrFlag(Fpcr.Dn))
+ {
+ result = FPDefaultNaN();
+ }
+ else
+ {
+ result = FPConvertNaN(valueBits);
+ }
+
+ if (type == FpType.SNaN)
+ {
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ }
+ else if (type == FpType.Infinity)
+ {
+ result = FPInfinity(sign);
+ }
+ else if (type == FpType.Zero)
+ {
+ result = FPZero(sign);
+ }
+ else
+ {
+ result = FPRoundCv(real, state);
+ }
+
+ return result;
+ }
+
+ private static float FPDefaultNaN()
+ {
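+ // float.NaN has its sign bit set (0xFFC00000); negating it is intended to yield the
+ // positive quiet NaN 0x7FC00000, which is the ARM default NaN.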
+ return -float.NaN;
+ }
+
+ private static float FPInfinity(bool sign)
+ {
+ return sign ? float.NegativeInfinity : float.PositiveInfinity;
+ }
+
+ private static float FPZero(bool sign)
+ {
+ return sign ? -0f : +0f;
+ }
+
+ private static float FPMaxNormal(bool sign)
+ {
+ return sign ? float.MinValue : float.MaxValue;
+ }
+
+ private static double FPUnpackCv(this ushort valueBits, out FpType type, out bool sign, CpuThreadState state)
+ {
+ sign = (~(uint)valueBits & 0x8000u) == 0u;
+
+ uint exp16 = ((uint)valueBits & 0x7C00u) >> 10;
+ uint frac16 = (uint)valueBits & 0x03FFu;
+
+ double real;
+
+ if (exp16 == 0u)
+ {
+ if (frac16 == 0u)
+ {
+ type = FpType.Zero;
+ real = 0d;
+ }
+ else
+ {
+ type = FpType.Nonzero; // Subnormal.
+ real = Math.Pow(2d, -14) * ((double)frac16 * Math.Pow(2d, -10));
+ }
+ }
+ else if (exp16 == 0x1Fu && !state.GetFpcrFlag(Fpcr.Ahp))
+ {
+ if (frac16 == 0u)
+ {
+ type = FpType.Infinity;
+ real = Math.Pow(2d, 1000);
+ }
+ else
+ {
+ type = (~frac16 & 0x0200u) == 0u ? FpType.QNaN : FpType.SNaN;
+ real = 0d;
+ }
+ }
+ else
+ {
+ type = FpType.Nonzero; // Normal.
+ real = Math.Pow(2d, (int)exp16 - 15) * (1d + (double)frac16 * Math.Pow(2d, -10));
+ }
+
+ return sign ? -real : real;
+ }
+
+ private static float FPRoundCv(double real, CpuThreadState state)
+ {
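+ // minimumExp, e and f are the IEEE 754 single-precision parameters: the smallest
+ // normal exponent, the exponent width and the fraction width.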
+ const int minimumExp = -126;
+
+ const int e = 8;
+ const int f = 23;
+
+ bool sign;
+ double mantissa;
+
+ if (real < 0d)
+ {
+ sign = true;
+ mantissa = -real;
+ }
+ else
+ {
+ sign = false;
+ mantissa = real;
+ }
+
+ int exponent = 0;
+
+ while (mantissa < 1d)
+ {
+ mantissa *= 2d;
+ exponent--;
+ }
+
+ while (mantissa >= 2d)
+ {
+ mantissa /= 2d;
+ exponent++;
+ }
+
+ if (state.GetFpcrFlag(Fpcr.Fz) && exponent < minimumExp)
+ {
+ state.SetFpsrFlag(Fpsr.Ufc);
+
+ return FPZero(sign);
+ }
+
+ uint biasedExp = (uint)Math.Max(exponent - minimumExp + 1, 0);
+
+ if (biasedExp == 0u)
+ {
+ mantissa /= Math.Pow(2d, minimumExp - exponent);
+ }
+
+ uint intMant = (uint)Math.Floor(mantissa * Math.Pow(2d, f));
+ double error = mantissa * Math.Pow(2d, f) - (double)intMant;
+
+ if (biasedExp == 0u && (error != 0d || state.GetFpcrFlag(Fpcr.Ufe)))
+ {
+ FPProcessException(FpExc.Underflow, state);
+ }
+
+ bool overflowToInf;
+ bool roundUp;
+
+ switch (state.FPRoundingMode())
+ {
+ default:
+ case RoundMode.ToNearest:
+ roundUp = (error > 0.5d || (error == 0.5d && (intMant & 1u) == 1u));
+ overflowToInf = true;
+ break;
+
+ case RoundMode.TowardsPlusInfinity:
+ roundUp = (error != 0d && !sign);
+ overflowToInf = !sign;
+ break;
+
+ case RoundMode.TowardsMinusInfinity:
+ roundUp = (error != 0d && sign);
+ overflowToInf = sign;
+ break;
+
+ case RoundMode.TowardsZero:
+ roundUp = false;
+ overflowToInf = false;
+ break;
+ }
+
+ if (roundUp)
+ {
+ intMant++;
+
+ if (intMant == (uint)Math.Pow(2d, f))
+ {
+ biasedExp = 1u;
+ }
+
+ if (intMant == (uint)Math.Pow(2d, f + 1))
+ {
+ biasedExp++;
+ intMant >>= 1;
+ }
+ }
+
+ float result;
+
+ if (biasedExp >= (uint)Math.Pow(2d, e) - 1u)
+ {
+ result = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign);
+
+ FPProcessException(FpExc.Overflow, state);
+
+ error = 1d;
+ }
+ else
+ {
+ result = BitConverter.Int32BitsToSingle(
+ (int)((sign ? 1u : 0u) << 31 | (biasedExp & 0xFFu) << 23 | (intMant & 0x007FFFFFu)));
+ }
+
+ if (error != 0d)
+ {
+ FPProcessException(FpExc.Inexact, state);
+ }
+
+ return result;
+ }
+
+ private static float FPConvertNaN(ushort valueBits)
+ {
+ return BitConverter.Int32BitsToSingle(
+ (int)(((uint)valueBits & 0x8000u) << 16 | 0x7FC00000u | ((uint)valueBits & 0x01FFu) << 13));
+ }
+
+ private static void FPProcessException(FpExc exc, CpuThreadState state)
+ {
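+ // Each FPCR trap enable bit sits 8 bits above the corresponding FPSR cumulative
+ // status bit, hence the +8 offset.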
+ int enable = (int)exc + 8;
+
+ if ((state.Fpcr & (1 << enable)) != 0)
+ {
+ throw new NotImplementedException("floating-point trap handling");
+ }
+ else
+ {
+ state.Fpsr |= 1 << (int)exc;
+ }
+ }
+ }
+
+ static class SoftFloat3216
+ {
+ public static ushort FPConvert(float value, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat3216.FPConvert: state.Fpcr = 0x{state.Fpcr:X8}");
+
+ double real = value.FPUnpackCv(out FpType type, out bool sign, state, out uint valueBits);
+
+ bool altHp = state.GetFpcrFlag(Fpcr.Ahp);
+
+ ushort resultBits;
+
+ if (type == FpType.SNaN || type == FpType.QNaN)
+ {
+ if (altHp)
+ {
+ resultBits = FPZero(sign);
+ }
+ else if (state.GetFpcrFlag(Fpcr.Dn))
+ {
+ resultBits = FPDefaultNaN();
+ }
+ else
+ {
+ resultBits = FPConvertNaN(valueBits);
+ }
+
+ if (type == FpType.SNaN || altHp)
+ {
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ }
+ else if (type == FpType.Infinity)
+ {
+ if (altHp)
+ {
+ resultBits = (ushort)((sign ? 1u : 0u) << 15 | 0x7FFFu);
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else
+ {
+ resultBits = FPInfinity(sign);
+ }
+ }
+ else if (type == FpType.Zero)
+ {
+ resultBits = FPZero(sign);
+ }
+ else
+ {
+ resultBits = FPRoundCv(real, state);
+ }
+
+ return resultBits;
+ }
+
+ private static ushort FPDefaultNaN()
+ {
+ return (ushort)0x7E00u;
+ }
+
+ private static ushort FPInfinity(bool sign)
+ {
+ return sign ? (ushort)0xFC00u : (ushort)0x7C00u;
+ }
+
+ private static ushort FPZero(bool sign)
+ {
+ return sign ? (ushort)0x8000u : (ushort)0x0000u;
+ }
+
+ private static ushort FPMaxNormal(bool sign)
+ {
+ return sign ? (ushort)0xFBFFu : (ushort)0x7BFFu;
+ }
+
+ private static double FPUnpackCv(this float value, out FpType type, out bool sign, CpuThreadState state, out uint valueBits)
+ {
+ valueBits = (uint)BitConverter.SingleToInt32Bits(value);
+
+ sign = (~valueBits & 0x80000000u) == 0u;
+
+ uint exp32 = (valueBits & 0x7F800000u) >> 23;
+ uint frac32 = valueBits & 0x007FFFFFu;
+
+ double real;
+
+ if (exp32 == 0u)
+ {
+ if (frac32 == 0u || state.GetFpcrFlag(Fpcr.Fz))
+ {
+ type = FpType.Zero;
+ real = 0d;
+
+ if (frac32 != 0u) FPProcessException(FpExc.InputDenorm, state);
+ }
+ else
+ {
+ type = FpType.Nonzero; // Subnormal.
+ real = Math.Pow(2d, -126) * ((double)frac32 * Math.Pow(2d, -23));
+ }
+ }
+ else if (exp32 == 0xFFu)
+ {
+ if (frac32 == 0u)
+ {
+ type = FpType.Infinity;
+ real = Math.Pow(2d, 1000);
+ }
+ else
+ {
+ type = (~frac32 & 0x00400000u) == 0u ? FpType.QNaN : FpType.SNaN;
+ real = 0d;
+ }
+ }
+ else
+ {
+ type = FpType.Nonzero; // Normal.
+ real = Math.Pow(2d, (int)exp32 - 127) * (1d + (double)frac32 * Math.Pow(2d, -23));
+ }
+
+ return sign ? -real : real;
+ }
+
+ private static ushort FPRoundCv(double real, CpuThreadState state)
+ {
+ const int minimumExp = -14;
+
+ const int e = 5;
+ const int f = 10;
+
+ bool sign;
+ double mantissa;
+
+ if (real < 0d)
+ {
+ sign = true;
+ mantissa = -real;
+ }
+ else
+ {
+ sign = false;
+ mantissa = real;
+ }
+
+ int exponent = 0;
+
+ while (mantissa < 1d)
+ {
+ mantissa *= 2d;
+ exponent--;
+ }
+
+ while (mantissa >= 2d)
+ {
+ mantissa /= 2d;
+ exponent++;
+ }
+
+ uint biasedExp = (uint)Math.Max(exponent - minimumExp + 1, 0);
+
+ if (biasedExp == 0u)
+ {
+ mantissa /= Math.Pow(2d, minimumExp - exponent);
+ }
+
+ uint intMant = (uint)Math.Floor(mantissa * Math.Pow(2d, f));
+ double error = mantissa * Math.Pow(2d, f) - (double)intMant;
+
+ if (biasedExp == 0u && (error != 0d || state.GetFpcrFlag(Fpcr.Ufe)))
+ {
+ FPProcessException(FpExc.Underflow, state);
+ }
+
+ bool overflowToInf;
+ bool roundUp;
+
+ switch (state.FPRoundingMode())
+ {
+ default:
+ case RoundMode.ToNearest:
+ roundUp = (error > 0.5d || (error == 0.5d && (intMant & 1u) == 1u));
+ overflowToInf = true;
+ break;
+
+ case RoundMode.TowardsPlusInfinity:
+ roundUp = (error != 0d && !sign);
+ overflowToInf = !sign;
+ break;
+
+ case RoundMode.TowardsMinusInfinity:
+ roundUp = (error != 0d && sign);
+ overflowToInf = sign;
+ break;
+
+ case RoundMode.TowardsZero:
+ roundUp = false;
+ overflowToInf = false;
+ break;
+ }
+
+ if (roundUp)
+ {
+ intMant++;
+
+ if (intMant == (uint)Math.Pow(2d, f))
+ {
+ biasedExp = 1u;
+ }
+
+ if (intMant == (uint)Math.Pow(2d, f + 1))
+ {
+ biasedExp++;
+ intMant >>= 1;
+ }
+ }
+
+ ushort resultBits;
+
+ if (!state.GetFpcrFlag(Fpcr.Ahp))
+ {
+ if (biasedExp >= (uint)Math.Pow(2d, e) - 1u)
+ {
+ resultBits = overflowToInf ? FPInfinity(sign) : FPMaxNormal(sign);
+
+ FPProcessException(FpExc.Overflow, state);
+
+ error = 1d;
+ }
+ else
+ {
+ resultBits = (ushort)((sign ? 1u : 0u) << 15 | (biasedExp & 0x1Fu) << 10 | (intMant & 0x03FFu));
+ }
+ }
+ else
+ {
+ if (biasedExp >= (uint)Math.Pow(2d, e))
+ {
+ resultBits = (ushort)((sign ? 1u : 0u) << 15 | 0x7FFFu);
+
+ FPProcessException(FpExc.InvalidOp, state);
+
+ error = 0d;
+ }
+ else
+ {
+ resultBits = (ushort)((sign ? 1u : 0u) << 15 | (biasedExp & 0x1Fu) << 10 | (intMant & 0x03FFu));
+ }
+ }
+
+ if (error != 0d)
+ {
+ FPProcessException(FpExc.Inexact, state);
+ }
+
+ return resultBits;
+ }
+
+ private static ushort FPConvertNaN(uint valueBits)
+ {
+ return (ushort)((valueBits & 0x80000000u) >> 16 | 0x7E00u | (valueBits & 0x003FE000u) >> 13);
+ }
+
+ private static void FPProcessException(FpExc exc, CpuThreadState state)
+ {
+ int enable = (int)exc + 8;
+
+ if ((state.Fpcr & (1 << enable)) != 0)
+ {
+ throw new NotImplementedException("floating-point trap handling");
+ }
+ else
+ {
+ state.Fpsr |= 1 << (int)exc;
+ }
+ }
+ }
+
+ static class SoftFloat32
+ {
+ public static float FPAdd(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPAdd: state.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if (inf1 && inf2 && sign1 == !sign2)
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if ((inf1 && !sign1) || (inf2 && !sign2))
+ {
+ result = FPInfinity(false);
+ }
+ else if ((inf1 && sign1) || (inf2 && sign2))
+ {
+ result = FPInfinity(true);
+ }
+ else if (zero1 && zero2 && sign1 == sign2)
+ {
+ result = FPZero(sign1);
+ }
+ else
+ {
+ result = value1 + value2;
+ }
+ }
+
+ return result;
+ }
+
+ public static float FPDiv(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPDiv: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && inf2) || (zero1 && zero2))
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if (inf1 || zero2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+
+ if (!inf1) FPProcessException(FpExc.DivideByZero, state);
+ }
+ else if (zero1 || inf2)
+ {
+ result = FPZero(sign1 ^ sign2);
+ }
+ else
+ {
+ result = value1 / value2;
+ }
+ }
+
+ return result;
+ }
+
+ public static float FPMax(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPMax: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ if (value1 > value2)
+ {
+ if (type1 == FpType.Infinity)
+ {
+ result = FPInfinity(sign1);
+ }
+ else if (type1 == FpType.Zero)
+ {
+ result = FPZero(sign1 && sign2);
+ }
+ else
+ {
+ result = value1;
+ }
+ }
+ else
+ {
+ if (type2 == FpType.Infinity)
+ {
+ result = FPInfinity(sign2);
+ }
+ else if (type2 == FpType.Zero)
+ {
+ result = FPZero(sign1 && sign2);
+ }
+ else
+ {
+ result = value2;
+ }
+ }
+ }
+
+ return result;
+ }
+
+ public static float FPMaxNum(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteIf(state.Fpcr != 0, "ASoftFloat_32.FPMaxNum: ");
+
+ value1.FPUnpack(out FpType type1, out _, out _);
+ value2.FPUnpack(out FpType type2, out _, out _);
+
+ if (type1 == FpType.QNaN && type2 != FpType.QNaN)
+ {
+ value1 = FPInfinity(true);
+ }
+ else if (type1 != FpType.QNaN && type2 == FpType.QNaN)
+ {
+ value2 = FPInfinity(true);
+ }
+
+ return FPMax(value1, value2, state);
+ }
+
+ public static float FPMin(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPMin: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ if (value1 < value2)
+ {
+ if (type1 == FpType.Infinity)
+ {
+ result = FPInfinity(sign1);
+ }
+ else if (type1 == FpType.Zero)
+ {
+ result = FPZero(sign1 || sign2);
+ }
+ else
+ {
+ result = value1;
+ }
+ }
+ else
+ {
+ if (type2 == FpType.Infinity)
+ {
+ result = FPInfinity(sign2);
+ }
+ else if (type2 == FpType.Zero)
+ {
+ result = FPZero(sign1 || sign2);
+ }
+ else
+ {
+ result = value2;
+ }
+ }
+ }
+
+ return result;
+ }
+
+ public static float FPMinNum(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteIf(state.Fpcr != 0, "ASoftFloat_32.FPMinNum: ");
+
+ value1.FPUnpack(out FpType type1, out _, out _);
+ value2.FPUnpack(out FpType type2, out _, out _);
+
+ if (type1 == FpType.QNaN && type2 != FpType.QNaN)
+ {
+ value1 = FPInfinity(false);
+ }
+ else if (type1 != FpType.QNaN && type2 == FpType.QNaN)
+ {
+ value2 = FPInfinity(false);
+ }
+
+ return FPMin(value1, value2, state);
+ }
+
+ public static float FPMul(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPMul: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2))
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if (inf1 || inf2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+ }
+ else if (zero1 || zero2)
+ {
+ result = FPZero(sign1 ^ sign2);
+ }
+ else
+ {
+ result = value1 * value2;
+ }
+ }
+
+ return result;
+ }
+
+ public static float FPMulAdd(float valueA, float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPMulAdd: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ valueA = valueA.FPUnpack(out FpType typeA, out bool signA, out uint addend);
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ float result = FPProcessNaNs3(typeA, type1, type2, addend, op1, op2, state, out bool done);
+
+ if (typeA == FpType.QNaN && ((inf1 && zero2) || (zero1 && inf2)))
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+
+ if (!done)
+ {
+ bool infA = typeA == FpType.Infinity; bool zeroA = typeA == FpType.Zero;
+
+ bool signP = sign1 ^ sign2;
+ bool infP = inf1 || inf2;
+ bool zeroP = zero1 || zero2;
+
+ if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP))
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if ((infA && !signA) || (infP && !signP))
+ {
+ result = FPInfinity(false);
+ }
+ else if ((infA && signA) || (infP && signP))
+ {
+ result = FPInfinity(true);
+ }
+ else if (zeroA && zeroP && signA == signP)
+ {
+ result = FPZero(signA);
+ }
+ else
+ {
+ // TODO: When available, use: T MathF.FusedMultiplyAdd(T, T, T);
+ // https://github.com/dotnet/corefx/issues/31903
+
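+ // Until then, multiplying and adding separately rounds the intermediate product,
+ // unlike a true fused multiply-add, which rounds only once.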
+ result = valueA + (value1 * value2);
+ }
+ }
+
+ return result;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static float FPMulSub(float valueA, float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteIf(state.Fpcr != 0, "ASoftFloat_32.FPMulSub: ");
+
+ value1 = value1.FPNeg();
+
+ return FPMulAdd(valueA, value1, value2, state);
+ }
+
+ public static float FPMulX(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPMulX: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2))
+ {
+ result = FPTwo(sign1 ^ sign2);
+ }
+ else if (inf1 || inf2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+ }
+ else if (zero1 || zero2)
+ {
+ result = FPZero(sign1 ^ sign2);
+ }
+ else
+ {
+ result = value1 * value2;
+ }
+ }
+
+ return result;
+ }
+
+ public static float FPRecipStepFused(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPRecipStepFused: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPNeg();
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2))
+ {
+ result = FPTwo(false);
+ }
+ else if (inf1 || inf2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+ }
+ else
+ {
+ // TODO: When available, use: T MathF.FusedMultiplyAdd(T, T, T);
+ // https://github.com/dotnet/corefx/issues/31903
+
+ result = 2f + (value1 * value2);
+ }
+ }
+
+ return result;
+ }
+
+ public static float FPRecpX(float value, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPRecpX: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value.FPUnpack(out FpType type, out bool sign, out uint op);
+
+ float result;
+
+ if (type == FpType.SNaN || type == FpType.QNaN)
+ {
+ result = FPProcessNaN(type, op, state);
+ }
+ else
+ {
+ uint notExp = (~op >> 23) & 0xFFu;
+ uint maxExp = 0xFEu;
+
+ result = BitConverter.Int32BitsToSingle(
+ (int)((sign ? 1u : 0u) << 31 | (notExp == 0xFFu ? maxExp : notExp) << 23));
+ }
+
+ return result;
+ }
+
+ public static float FprSqrtStepFused(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPRSqrtStepFused: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPNeg();
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2))
+ {
+ result = FPOnePointFive(false);
+ }
+ else if (inf1 || inf2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+ }
+ else
+ {
+ // TODO: When available, use: T MathF.FusedMultiplyAdd(T, T, T);
+ // https://github.com/dotnet/corefx/issues/31903
+
+ result = (3f + (value1 * value2)) / 2f;
+ }
+ }
+
+ return result;
+ }
+
+ public static float FPSqrt(float value, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPSqrt: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value = value.FPUnpack(out FpType type, out bool sign, out uint op);
+
+ float result;
+
+ if (type == FpType.SNaN || type == FpType.QNaN)
+ {
+ result = FPProcessNaN(type, op, state);
+ }
+ else if (type == FpType.Zero)
+ {
+ result = FPZero(sign);
+ }
+ else if (type == FpType.Infinity && !sign)
+ {
+ result = FPInfinity(sign);
+ }
+ else if (sign)
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else
+ {
+ result = MathF.Sqrt(value);
+ }
+
+ return result;
+ }
+
+ public static float FPSub(float value1, float value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_32.FPSub: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out uint op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out uint op2);
+
+ float result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if (inf1 && inf2 && sign1 == sign2)
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if ((inf1 && !sign1) || (inf2 && sign2))
+ {
+ result = FPInfinity(false);
+ }
+ else if ((inf1 && sign1) || (inf2 && !sign2))
+ {
+ result = FPInfinity(true);
+ }
+ else if (zero1 && zero2 && sign1 == !sign2)
+ {
+ result = FPZero(sign1);
+ }
+ else
+ {
+ result = value1 - value2;
+ }
+ }
+
+ return result;
+ }
+
+ private static float FPDefaultNaN()
+ {
+ return -float.NaN;
+ }
+
+ private static float FPInfinity(bool sign)
+ {
+ return sign ? float.NegativeInfinity : float.PositiveInfinity;
+ }
+
+ private static float FPZero(bool sign)
+ {
+ return sign ? -0f : +0f;
+ }
+
+ private static float FPTwo(bool sign)
+ {
+ return sign ? -2f : +2f;
+ }
+
+ private static float FPOnePointFive(bool sign)
+ {
+ return sign ? -1.5f : +1.5f;
+ }
+
+ private static float FPNeg(this float value)
+ {
+ return -value;
+ }
+
+ private static float FPUnpack(this float value, out FpType type, out bool sign, out uint valueBits)
+ {
+ valueBits = (uint)BitConverter.SingleToInt32Bits(value);
+
+ sign = (~valueBits & 0x80000000u) == 0u;
+
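+ // Classify the value from its raw IEEE-754 fields: a zero exponent with a zero mantissa
+ // is a signed zero, an all-ones exponent is an infinity or NaN, and everything else is
+ // treated as an ordinary non-zero number. For NaNs the numeric return value is not used
+ // (+/-0 is returned); callers rely on the reported type and raw bits instead.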
+ if ((valueBits & 0x7F800000u) == 0u)
+ {
+ if ((valueBits & 0x007FFFFFu) == 0u)
+ {
+ type = FpType.Zero;
+ }
+ else
+ {
+ type = FpType.Nonzero;
+ }
+ }
+ else if ((~valueBits & 0x7F800000u) == 0u)
+ {
+ if ((valueBits & 0x007FFFFFu) == 0u)
+ {
+ type = FpType.Infinity;
+ }
+ else
+ {
+ type = (~valueBits & 0x00400000u) == 0u
+ ? FpType.QNaN
+ : FpType.SNaN;
+
+ return FPZero(sign);
+ }
+ }
+ else
+ {
+ type = FpType.Nonzero;
+ }
+
+ return value;
+ }
+
+ private static float FPProcessNaNs(
+ FpType type1,
+ FpType type2,
+ uint op1,
+ uint op2,
+ CpuThreadState state,
+ out bool done)
+ {
+ done = true;
+
+ if (type1 == FpType.SNaN)
+ {
+ return FPProcessNaN(type1, op1, state);
+ }
+ else if (type2 == FpType.SNaN)
+ {
+ return FPProcessNaN(type2, op2, state);
+ }
+ else if (type1 == FpType.QNaN)
+ {
+ return FPProcessNaN(type1, op1, state);
+ }
+ else if (type2 == FpType.QNaN)
+ {
+ return FPProcessNaN(type2, op2, state);
+ }
+
+ done = false;
+
+ return FPZero(false);
+ }
+
+ private static float FPProcessNaNs3(
+ FpType type1,
+ FpType type2,
+ FpType type3,
+ uint op1,
+ uint op2,
+ uint op3,
+ CpuThreadState state,
+ out bool done)
+ {
+ done = true;
+
+ if (type1 == FpType.SNaN)
+ {
+ return FPProcessNaN(type1, op1, state);
+ }
+ else if (type2 == FpType.SNaN)
+ {
+ return FPProcessNaN(type2, op2, state);
+ }
+ else if (type3 == FpType.SNaN)
+ {
+ return FPProcessNaN(type3, op3, state);
+ }
+ else if (type1 == FpType.QNaN)
+ {
+ return FPProcessNaN(type1, op1, state);
+ }
+ else if (type2 == FpType.QNaN)
+ {
+ return FPProcessNaN(type2, op2, state);
+ }
+ else if (type3 == FpType.QNaN)
+ {
+ return FPProcessNaN(type3, op3, state);
+ }
+
+ done = false;
+
+ return FPZero(false);
+ }
+
+ private static float FPProcessNaN(FpType type, uint op, CpuThreadState state)
+ {
+ if (type == FpType.SNaN)
+ {
+ op |= 1u << 22;
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+
+ if (state.GetFpcrFlag(Fpcr.Dn))
+ {
+ return FPDefaultNaN();
+ }
+
+ return BitConverter.Int32BitsToSingle((int)op);
+ }
+
+ private static void FPProcessException(FpExc exc, CpuThreadState state)
+ {
+ int enable = (int)exc + 8;
+
+ if ((state.Fpcr & (1 << enable)) != 0)
+ {
+ throw new NotImplementedException("floating-point trap handling");
+ }
+ else
+ {
+ state.Fpsr |= 1 << (int)exc;
+ }
+ }
+ }
+
+ static class SoftFloat64
+ {
+ public static double FPAdd(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPAdd: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if (inf1 && inf2 && sign1 == !sign2)
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if ((inf1 && !sign1) || (inf2 && !sign2))
+ {
+ result = FPInfinity(false);
+ }
+ else if ((inf1 && sign1) || (inf2 && sign2))
+ {
+ result = FPInfinity(true);
+ }
+ else if (zero1 && zero2 && sign1 == sign2)
+ {
+ result = FPZero(sign1);
+ }
+ else
+ {
+ result = value1 + value2;
+ }
+ }
+
+ return result;
+ }
+
+ public static double FPDiv(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPDiv: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && inf2) || (zero1 && zero2))
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if (inf1 || zero2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+
+ if (!inf1) FPProcessException(FpExc.DivideByZero, state);
+ }
+ else if (zero1 || inf2)
+ {
+ result = FPZero(sign1 ^ sign2);
+ }
+ else
+ {
+ result = value1 / value2;
+ }
+ }
+
+ return result;
+ }
+
+ public static double FPMax(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPMax: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ if (value1 > value2)
+ {
+ if (type1 == FpType.Infinity)
+ {
+ result = FPInfinity(sign1);
+ }
+ else if (type1 == FpType.Zero)
+ {
+ result = FPZero(sign1 && sign2);
+ }
+ else
+ {
+ result = value1;
+ }
+ }
+ else
+ {
+ if (type2 == FpType.Infinity)
+ {
+ result = FPInfinity(sign2);
+ }
+ else if (type2 == FpType.Zero)
+ {
+ result = FPZero(sign1 && sign2);
+ }
+ else
+ {
+ result = value2;
+ }
+ }
+ }
+
+ return result;
+ }
+
+ public static double FPMaxNum(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteIf(state.Fpcr != 0, "ASoftFloat_64.FPMaxNum: ");
+
+ value1.FPUnpack(out FpType type1, out _, out _);
+ value2.FPUnpack(out FpType type2, out _, out _);
+
+ if (type1 == FpType.QNaN && type2 != FpType.QNaN)
+ {
+ value1 = FPInfinity(true);
+ }
+ else if (type1 != FpType.QNaN && type2 == FpType.QNaN)
+ {
+ value2 = FPInfinity(true);
+ }
+
+ return FPMax(value1, value2, state);
+ }
+
+ public static double FPMin(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPMin: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ if (value1 < value2)
+ {
+ if (type1 == FpType.Infinity)
+ {
+ result = FPInfinity(sign1);
+ }
+ else if (type1 == FpType.Zero)
+ {
+ result = FPZero(sign1 || sign2);
+ }
+ else
+ {
+ result = value1;
+ }
+ }
+ else
+ {
+ if (type2 == FpType.Infinity)
+ {
+ result = FPInfinity(sign2);
+ }
+ else if (type2 == FpType.Zero)
+ {
+ result = FPZero(sign1 || sign2);
+ }
+ else
+ {
+ result = value2;
+ }
+ }
+ }
+
+ return result;
+ }
+
+ public static double FPMinNum(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteIf(state.Fpcr != 0, "ASoftFloat_64.FPMinNum: ");
+
+ value1.FPUnpack(out FpType type1, out _, out _);
+ value2.FPUnpack(out FpType type2, out _, out _);
+
+ if (type1 == FpType.QNaN && type2 != FpType.QNaN)
+ {
+ value1 = FPInfinity(false);
+ }
+ else if (type1 != FpType.QNaN && type2 == FpType.QNaN)
+ {
+ value2 = FPInfinity(false);
+ }
+
+ return FPMin(value1, value2, state);
+ }
+
+ public static double FPMul(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPMul: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2))
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if (inf1 || inf2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+ }
+ else if (zero1 || zero2)
+ {
+ result = FPZero(sign1 ^ sign2);
+ }
+ else
+ {
+ result = value1 * value2;
+ }
+ }
+
+ return result;
+ }
+
+ public static double FPMulAdd(double valueA, double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPMulAdd: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ valueA = valueA.FPUnpack(out FpType typeA, out bool signA, out ulong addend);
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ double result = FPProcessNaNs3(typeA, type1, type2, addend, op1, op2, state, out bool done);
+
+ if (typeA == FpType.QNaN && ((inf1 && zero2) || (zero1 && inf2)))
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+
+ if (!done)
+ {
+ bool infA = typeA == FpType.Infinity; bool zeroA = typeA == FpType.Zero;
+
+ bool signP = sign1 ^ sign2;
+ bool infP = inf1 || inf2;
+ bool zeroP = zero1 || zero2;
+
+ if ((inf1 && zero2) || (zero1 && inf2) || (infA && infP && signA != signP))
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if ((infA && !signA) || (infP && !signP))
+ {
+ result = FPInfinity(false);
+ }
+ else if ((infA && signA) || (infP && signP))
+ {
+ result = FPInfinity(true);
+ }
+ else if (zeroA && zeroP && signA == signP)
+ {
+ result = FPZero(signA);
+ }
+ else
+ {
+ // TODO: When available, use: T Math.FusedMultiplyAdd(T, T, T);
+ // https://github.com/dotnet/corefx/issues/31903
+
+ result = valueA + (value1 * value2);
+ }
+ }
+
+ return result;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static double FPMulSub(double valueA, double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteIf(state.Fpcr != 0, "ASoftFloat_64.FPMulSub: ");
+
+ value1 = value1.FPNeg();
+
+ return FPMulAdd(valueA, value1, value2, state);
+ }
+
+ public static double FPMulX(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPMulX: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2))
+ {
+ result = FPTwo(sign1 ^ sign2);
+ }
+ else if (inf1 || inf2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+ }
+ else if (zero1 || zero2)
+ {
+ result = FPZero(sign1 ^ sign2);
+ }
+ else
+ {
+ result = value1 * value2;
+ }
+ }
+
+ return result;
+ }
+
+ public static double FPRecipStepFused(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPRecipStepFused: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPNeg();
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2))
+ {
+ result = FPTwo(false);
+ }
+ else if (inf1 || inf2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+ }
+ else
+ {
+ // TODO: When available, use: T Math.FusedMultiplyAdd(T, T, T);
+ // https://github.com/dotnet/corefx/issues/31903
+
+ result = 2d + (value1 * value2);
+ }
+ }
+
+ return result;
+ }
+
+ public static double FPRecpX(double value, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPRecpX: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value.FPUnpack(out FpType type, out bool sign, out ulong op);
+
+ double result;
+
+ if (type == FpType.SNaN || type == FpType.QNaN)
+ {
+ result = FPProcessNaN(type, op, state);
+ }
+ else
+ {
+ ulong notExp = (~op >> 52) & 0x7FFul;
+ ulong maxExp = 0x7FEul;
+
+ result = BitConverter.Int64BitsToDouble(
+ (long)((sign ? 1ul : 0ul) << 63 | (notExp == 0x7FFul ? maxExp : notExp) << 52));
+ }
+
+ return result;
+ }
+
+ public static double FprSqrtStepFused(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPRSqrtStepFused: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPNeg();
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if ((inf1 && zero2) || (zero1 && inf2))
+ {
+ result = FPOnePointFive(false);
+ }
+ else if (inf1 || inf2)
+ {
+ result = FPInfinity(sign1 ^ sign2);
+ }
+ else
+ {
+ // TODO: When available, use: T Math.FusedMultiplyAdd(T, T, T);
+ // https://github.com/dotnet/corefx/issues/31903
+
+ result = (3d + (value1 * value2)) / 2d;
+ }
+ }
+
+ return result;
+ }
+
+ public static double FPSqrt(double value, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPSqrt: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value = value.FPUnpack(out FpType type, out bool sign, out ulong op);
+
+ double result;
+
+ if (type == FpType.SNaN || type == FpType.QNaN)
+ {
+ result = FPProcessNaN(type, op, state);
+ }
+ else if (type == FpType.Zero)
+ {
+ result = FPZero(sign);
+ }
+ else if (type == FpType.Infinity && !sign)
+ {
+ result = FPInfinity(sign);
+ }
+ else if (sign)
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else
+ {
+ result = Math.Sqrt(value);
+ }
+
+ return result;
+ }
+
+ public static double FPSub(double value1, double value2, CpuThreadState state)
+ {
+ Debug.WriteLineIf(state.Fpcr != 0, $"ASoftFloat_64.FPSub: State.Fpcr = 0x{state.Fpcr:X8}");
+
+ value1 = value1.FPUnpack(out FpType type1, out bool sign1, out ulong op1);
+ value2 = value2.FPUnpack(out FpType type2, out bool sign2, out ulong op2);
+
+ double result = FPProcessNaNs(type1, type2, op1, op2, state, out bool done);
+
+ if (!done)
+ {
+ bool inf1 = type1 == FpType.Infinity; bool zero1 = type1 == FpType.Zero;
+ bool inf2 = type2 == FpType.Infinity; bool zero2 = type2 == FpType.Zero;
+
+ if (inf1 && inf2 && sign1 == sign2)
+ {
+ result = FPDefaultNaN();
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+ else if ((inf1 && !sign1) || (inf2 && sign2))
+ {
+ result = FPInfinity(false);
+ }
+ else if ((inf1 && sign1) || (inf2 && !sign2))
+ {
+ result = FPInfinity(true);
+ }
+ else if (zero1 && zero2 && sign1 == !sign2)
+ {
+ result = FPZero(sign1);
+ }
+ else
+ {
+ result = value1 - value2;
+ }
+ }
+
+ return result;
+ }
+
+ private static double FPDefaultNaN()
+ {
+ return -double.NaN;
+ }
+
+ private static double FPInfinity(bool sign)
+ {
+ return sign ? double.NegativeInfinity : double.PositiveInfinity;
+ }
+
+ private static double FPZero(bool sign)
+ {
+ return sign ? -0d : +0d;
+ }
+
+ private static double FPTwo(bool sign)
+ {
+ return sign ? -2d : +2d;
+ }
+
+ private static double FPOnePointFive(bool sign)
+ {
+ return sign ? -1.5d : +1.5d;
+ }
+
+ private static double FPNeg(this double value)
+ {
+ return -value;
+ }
+
+ private static double FPUnpack(this double value, out FpType type, out bool sign, out ulong valueBits)
+ {
+ valueBits = (ulong)BitConverter.DoubleToInt64Bits(value);
+
+ sign = (~valueBits & 0x8000000000000000ul) == 0ul;
+
+ if ((valueBits & 0x7FF0000000000000ul) == 0ul)
+ {
+ if ((valueBits & 0x000FFFFFFFFFFFFFul) == 0ul)
+ {
+ type = FpType.Zero;
+ }
+ else
+ {
+ type = FpType.Nonzero;
+ }
+ }
+ else if ((~valueBits & 0x7FF0000000000000ul) == 0ul)
+ {
+ if ((valueBits & 0x000FFFFFFFFFFFFFul) == 0ul)
+ {
+ type = FpType.Infinity;
+ }
+ else
+ {
+ type = (~valueBits & 0x0008000000000000ul) == 0ul
+ ? FpType.QNaN
+ : FpType.SNaN;
+
+ return FPZero(sign);
+ }
+ }
+ else
+ {
+ type = FpType.Nonzero;
+ }
+
+ return value;
+ }
+
+ private static double FPProcessNaNs(
+ FpType type1,
+ FpType type2,
+ ulong op1,
+ ulong op2,
+ CpuThreadState state,
+ out bool done)
+ {
+ done = true;
+
+ if (type1 == FpType.SNaN)
+ {
+ return FPProcessNaN(type1, op1, state);
+ }
+ else if (type2 == FpType.SNaN)
+ {
+ return FPProcessNaN(type2, op2, state);
+ }
+ else if (type1 == FpType.QNaN)
+ {
+ return FPProcessNaN(type1, op1, state);
+ }
+ else if (type2 == FpType.QNaN)
+ {
+ return FPProcessNaN(type2, op2, state);
+ }
+
+ done = false;
+
+ return FPZero(false);
+ }
+
+ private static double FPProcessNaNs3(
+ FpType type1,
+ FpType type2,
+ FpType type3,
+ ulong op1,
+ ulong op2,
+ ulong op3,
+ CpuThreadState state,
+ out bool done)
+ {
+ done = true;
+
+ if (type1 == FpType.SNaN)
+ {
+ return FPProcessNaN(type1, op1, state);
+ }
+ else if (type2 == FpType.SNaN)
+ {
+ return FPProcessNaN(type2, op2, state);
+ }
+ else if (type3 == FpType.SNaN)
+ {
+ return FPProcessNaN(type3, op3, state);
+ }
+ else if (type1 == FpType.QNaN)
+ {
+ return FPProcessNaN(type1, op1, state);
+ }
+ else if (type2 == FpType.QNaN)
+ {
+ return FPProcessNaN(type2, op2, state);
+ }
+ else if (type3 == FpType.QNaN)
+ {
+ return FPProcessNaN(type3, op3, state);
+ }
+
+ done = false;
+
+ return FPZero(false);
+ }
+
+ private static double FPProcessNaN(FpType type, ulong op, CpuThreadState state)
+ {
+ if (type == FpType.SNaN)
+ {
+ op |= 1ul << 51;
+
+ FPProcessException(FpExc.InvalidOp, state);
+ }
+
+ if (state.GetFpcrFlag(Fpcr.Dn))
+ {
+ return FPDefaultNaN();
+ }
+
+ return BitConverter.Int64BitsToDouble((long)op);
+ }
+
+ private static void FPProcessException(FpExc exc, CpuThreadState state)
+ {
+ int enable = (int)exc + 8;
+
+ if ((state.Fpcr & (1 << enable)) != 0)
+ {
+ throw new NotImplementedException("floating-point trap handling");
+ }
+ else
+ {
+ state.Fpsr |= 1 << (int)exc;
+ }
+ }
+ }
+}
diff --git a/ChocolArm64/Instructions/VectorHelper.cs b/ChocolArm64/Instructions/VectorHelper.cs
new file mode 100644
index 00000000..8ef15818
--- /dev/null
+++ b/ChocolArm64/Instructions/VectorHelper.cs
@@ -0,0 +1,790 @@
+using ChocolArm64.State;
+using ChocolArm64.Translation;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace ChocolArm64.Instructions
+{
+ static class VectorHelper
+ {
+ private static readonly Vector128<float> Zero32128Mask;
+
+ static VectorHelper()
+ {
+ if (!Sse2.IsSupported)
+ {
+ throw new PlatformNotSupportedException();
+ }
+
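+ // Only the lowest 32-bit element of the mask is set, so ANDing with it
+ // (see VectorZero32_128) clears bits 32..127 of a vector.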
+ Zero32128Mask = Sse.StaticCast<uint, float>(Sse2.SetVector128(0, 0, 0, 0xffffffff));
+ }
+
+ public static void EmitCall(ILEmitterCtx context, string name64, string name128)
+ {
+ bool isSimd64 = context.CurrOp.RegisterSize == RegisterSize.Simd64;
+
+ context.EmitCall(typeof(VectorHelper), isSimd64 ? name64 : name128);
+ }
+
+ public static void EmitCall(ILEmitterCtx context, string mthdName)
+ {
+ context.EmitCall(typeof(VectorHelper), mthdName);
+ }
+
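+ // Saturating float/double-to-integer conversions: NaN inputs convert to zero and
+ // out-of-range values clamp to the bounds of the destination type, as the AArch64
+ // saturating conversions require.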
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static int SatF32ToS32(float value)
+ {
+ if (float.IsNaN(value)) return 0;
+
+ return value > int.MaxValue ? int.MaxValue :
+ value < int.MinValue ? int.MinValue : (int)value;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static long SatF32ToS64(float value)
+ {
+ if (float.IsNaN(value)) return 0;
+
+ return value > long.MaxValue ? long.MaxValue :
+ value < long.MinValue ? long.MinValue : (long)value;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static uint SatF32ToU32(float value)
+ {
+ if (float.IsNaN(value)) return 0;
+
+ return value > uint.MaxValue ? uint.MaxValue :
+ value < uint.MinValue ? uint.MinValue : (uint)value;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static ulong SatF32ToU64(float value)
+ {
+ if (float.IsNaN(value)) return 0;
+
+ return value > ulong.MaxValue ? ulong.MaxValue :
+ value < ulong.MinValue ? ulong.MinValue : (ulong)value;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static int SatF64ToS32(double value)
+ {
+ if (double.IsNaN(value)) return 0;
+
+ return value > int.MaxValue ? int.MaxValue :
+ value < int.MinValue ? int.MinValue : (int)value;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static long SatF64ToS64(double value)
+ {
+ if (double.IsNaN(value)) return 0;
+
+ return value > long.MaxValue ? long.MaxValue :
+ value < long.MinValue ? long.MinValue : (long)value;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static uint SatF64ToU32(double value)
+ {
+ if (double.IsNaN(value)) return 0;
+
+ return value > uint.MaxValue ? uint.MaxValue :
+ value < uint.MinValue ? uint.MinValue : (uint)value;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static ulong SatF64ToU64(double value)
+ {
+ if (double.IsNaN(value)) return 0;
+
+ return value > ulong.MaxValue ? ulong.MaxValue :
+ value < ulong.MinValue ? ulong.MinValue : (ulong)value;
+ }
+
+ public static double Round(double value, CpuThreadState state)
+ {
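+ // Maps the current FPCR rounding mode onto the .NET rounding primitives; note that
+ // Math.Round defaults to ties-to-even, matching the ToNearest mode.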
+ switch (state.FPRoundingMode())
+ {
+ case RoundMode.ToNearest: return Math.Round (value);
+ case RoundMode.TowardsPlusInfinity: return Math.Ceiling (value);
+ case RoundMode.TowardsMinusInfinity: return Math.Floor (value);
+ case RoundMode.TowardsZero: return Math.Truncate(value);
+ }
+
+ throw new InvalidOperationException();
+ }
+
+ public static float RoundF(float value, CpuThreadState state)
+ {
+ switch (state.FPRoundingMode())
+ {
+ case RoundMode.ToNearest: return MathF.Round (value);
+ case RoundMode.TowardsPlusInfinity: return MathF.Ceiling (value);
+ case RoundMode.TowardsMinusInfinity: return MathF.Floor (value);
+ case RoundMode.TowardsZero: return MathF.Truncate(value);
+ }
+
+ throw new InvalidOperationException();
+ }
+
+ public static Vector128<float> Tbl1_V64(
+ Vector128<float> vector,
+ Vector128<float> tb0)
+ {
+ return Tbl(vector, 8, tb0);
+ }
+
+ public static Vector128<float> Tbl1_V128(
+ Vector128<float> vector,
+ Vector128<float> tb0)
+ {
+ return Tbl(vector, 16, tb0);
+ }
+
+ public static Vector128<float> Tbl2_V64(
+ Vector128<float> vector,
+ Vector128<float> tb0,
+ Vector128<float> tb1)
+ {
+ return Tbl(vector, 8, tb0, tb1);
+ }
+
+ public static Vector128<float> Tbl2_V128(
+ Vector128<float> vector,
+ Vector128<float> tb0,
+ Vector128<float> tb1)
+ {
+ return Tbl(vector, 16, tb0, tb1);
+ }
+
+ public static Vector128<float> Tbl3_V64(
+ Vector128<float> vector,
+ Vector128<float> tb0,
+ Vector128<float> tb1,
+ Vector128<float> tb2)
+ {
+ return Tbl(vector, 8, tb0, tb1, tb2);
+ }
+
+ public static Vector128<float> Tbl3_V128(
+ Vector128<float> vector,
+ Vector128<float> tb0,
+ Vector128<float> tb1,
+ Vector128<float> tb2)
+ {
+ return Tbl(vector, 16, tb0, tb1, tb2);
+ }
+
+ public static Vector128<float> Tbl4_V64(
+ Vector128<float> vector,
+ Vector128<float> tb0,
+ Vector128<float> tb1,
+ Vector128<float> tb2,
+ Vector128<float> tb3)
+ {
+ return Tbl(vector, 8, tb0, tb1, tb2, tb3);
+ }
+
+ public static Vector128<float> Tbl4_V128(
+ Vector128<float> vector,
+ Vector128<float> tb0,
+ Vector128<float> tb1,
+ Vector128<float> tb2,
+ Vector128<float> tb3)
+ {
+ return Tbl(vector, 16, tb0, tb1, tb2, tb3);
+ }
+
+ private static Vector128<float> Tbl(Vector128<float> vector, int bytes, params Vector128<float>[] tb)
+ {
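+ // Software emulation of the AArch64 TBL lookup: flatten the table registers into a byte
+ // array, then replace each of the first "bytes" lanes of "vector" with the table entry it
+ // indexes; out-of-range indices leave the destination lane at zero.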
+ Vector128<float> res = new Vector128<float>();
+
+ byte[] table = new byte[tb.Length * 16];
+
+ for (byte index = 0; index < tb.Length; index++)
+ {
+ for (byte index2 = 0; index2 < 16; index2++)
+ {
+ table[index * 16 + index2] = (byte)VectorExtractIntZx(tb[index], index2, 0);
+ }
+ }
+
+ for (byte index = 0; index < bytes; index++)
+ {
+ byte tblIdx = (byte)VectorExtractIntZx(vector, index, 0);
+
+ if (tblIdx < table.Length)
+ {
+ res = VectorInsertInt(table[tblIdx], res, index, 0);
+ }
+ }
+
+ return res;
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static double VectorExtractDouble(Vector128<float> vector, byte index)
+ {
+ if (Sse41.IsSupported)
+ {
+ return BitConverter.Int64BitsToDouble(Sse41.Extract(Sse.StaticCast<float, long>(vector), index));
+ }
+ else if (Sse2.IsSupported)
+ {
+ return BitConverter.Int64BitsToDouble((long)VectorExtractIntZx(vector, index, 3));
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static long VectorExtractIntSx(Vector128<float> vector, byte index, int size)
+ {
+ if (Sse41.IsSupported)
+ {
+ if (size == 0)
+ {
+ return (sbyte)Sse41.Extract(Sse.StaticCast<float, byte>(vector), index);
+ }
+ else if (size == 1)
+ {
+ return (short)Sse2.Extract(Sse.StaticCast<float, ushort>(vector), index);
+ }
+ else if (size == 2)
+ {
+ return Sse41.Extract(Sse.StaticCast<float, int>(vector), index);
+ }
+ else if (size == 3)
+ {
+ return Sse41.Extract(Sse.StaticCast<float, long>(vector), index);
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+ else if (Sse2.IsSupported)
+ {
+ if (size == 0)
+ {
+ return (sbyte)VectorExtractIntZx(vector, index, size);
+ }
+ else if (size == 1)
+ {
+ return (short)VectorExtractIntZx(vector, index, size);
+ }
+ else if (size == 2)
+ {
+ return (int)VectorExtractIntZx(vector, index, size);
+ }
+ else if (size == 3)
+ {
+ return (long)VectorExtractIntZx(vector, index, size);
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static ulong VectorExtractIntZx(Vector128<float> vector, byte index, int size)
+ {
+ if (Sse41.IsSupported)
+ {
+ if (size == 0)
+ {
+ return Sse41.Extract(Sse.StaticCast<float, byte>(vector), index);
+ }
+ else if (size == 1)
+ {
+ return Sse2.Extract(Sse.StaticCast<float, ushort>(vector), index);
+ }
+ else if (size == 2)
+ {
+ return Sse41.Extract(Sse.StaticCast<float, uint>(vector), index);
+ }
+ else if (size == 3)
+ {
+ return Sse41.Extract(Sse.StaticCast<float, ulong>(vector), index);
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+ else if (Sse2.IsSupported)
+ {
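+ // SSE2 fallback: Sse2.Extract only reads 16-bit lanes, so wider elements are rebuilt
+ // from consecutive ushort extractions (and single bytes are picked out of their word).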
+ int shortIdx = size == 0
+ ? index >> 1
+ : index << (size - 1);
+
+ ushort value = Sse2.Extract(Sse.StaticCast<float, ushort>(vector), (byte)shortIdx);
+
+ if (size == 0)
+ {
+ return (byte)(value >> (index & 1) * 8);
+ }
+ else if (size == 1)
+ {
+ return value;
+ }
+ else if (size == 2 || size == 3)
+ {
+ ushort value1 = Sse2.Extract(Sse.StaticCast<float, ushort>(vector), (byte)(shortIdx + 1));
+
+ if (size == 2)
+ {
+ return (uint)(value | (value1 << 16));
+ }
+
+ ushort value2 = Sse2.Extract(Sse.StaticCast<float, ushort>(vector), (byte)(shortIdx + 2));
+ ushort value3 = Sse2.Extract(Sse.StaticCast<float, ushort>(vector), (byte)(shortIdx + 3));
+
+ return ((ulong)value << 0) |
+ ((ulong)value1 << 16) |
+ ((ulong)value2 << 32) |
+ ((ulong)value3 << 48);
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static float VectorExtractSingle(Vector128<float> vector, byte index)
+ {
+ if (Sse41.IsSupported)
+ {
+ return Sse41.Extract(vector, index);
+ }
+ else if (Sse2.IsSupported)
+ {
+ Vector128<ushort> shortVector = Sse.StaticCast<float, ushort>(vector);
+
+ int low = Sse2.Extract(shortVector, (byte)(index * 2 + 0));
+ int high = Sse2.Extract(shortVector, (byte)(index * 2 + 1));
+
+ return BitConverter.Int32BitsToSingle(low | (high << 16));
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorInsertDouble(double value, Vector128<float> vector, byte index)
+ {
+ return VectorInsertInt((ulong)BitConverter.DoubleToInt64Bits(value), vector, index, 3);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorInsertInt(ulong value, Vector128<float> vector, byte index, int size)
+ {
+ if (Sse41.IsSupported)
+ {
+ if (size == 0)
+ {
+ return Sse.StaticCast<byte, float>(Sse41.Insert(Sse.StaticCast<float, byte>(vector), (byte)value, index));
+ }
+ else if (size == 1)
+ {
+ return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(vector), (ushort)value, index));
+ }
+ else if (size == 2)
+ {
+ return Sse.StaticCast<uint, float>(Sse41.Insert(Sse.StaticCast<float, uint>(vector), (uint)value, index));
+ }
+ else if (size == 3)
+ {
+ return Sse.StaticCast<ulong, float>(Sse41.Insert(Sse.StaticCast<float, ulong>(vector), value, index));
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+ else if (Sse2.IsSupported)
+ {
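+ // SSE2 fallback: Sse2.Insert only writes 16-bit lanes, so a byte is merged into its
+ // containing word and wider elements are written one ushort at a time.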
+ Vector128<ushort> shortVector = Sse.StaticCast<float, ushort>(vector);
+
+ int shortIdx = size == 0
+ ? index >> 1
+ : index << (size - 1);
+
+ if (size == 0)
+ {
+ ushort shortVal = Sse2.Extract(Sse.StaticCast<float, ushort>(vector), (byte)shortIdx);
+
+ int shift = (index & 1) * 8;
+
+ shortVal &= (ushort)(0xff00 >> shift);
+
+ shortVal |= (ushort)((byte)value << shift);
+
+ return Sse.StaticCast<ushort, float>(Sse2.Insert(shortVector, shortVal, (byte)shortIdx));
+ }
+ else if (size == 1)
+ {
+ return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(vector), (ushort)value, index));
+ }
+ else if (size == 2 || size == 3)
+ {
+ shortVector = Sse2.Insert(shortVector, (ushort)(value >> 0), (byte)(shortIdx + 0));
+ shortVector = Sse2.Insert(shortVector, (ushort)(value >> 16), (byte)(shortIdx + 1));
+
+ if (size == 3)
+ {
+ shortVector = Sse2.Insert(shortVector, (ushort)(value >> 32), (byte)(shortIdx + 2));
+ shortVector = Sse2.Insert(shortVector, (ushort)(value >> 48), (byte)(shortIdx + 3));
+ }
+
+ return Sse.StaticCast<ushort, float>(shortVector);
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(size));
+ }
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorInsertSingle(float value, Vector128<float> vector, byte index)
+ {
+ if (Sse41.IsSupported)
+ {
+ //Note: The if/else if is necessary to enable the JIT to
+ //produce a single INSERTPS instruction instead of the
+ //jump table fallback.
+ if (index == 0)
+ {
+ return Sse41.Insert(vector, value, 0x00);
+ }
+ else if (index == 1)
+ {
+ return Sse41.Insert(vector, value, 0x10);
+ }
+ else if (index == 2)
+ {
+ return Sse41.Insert(vector, value, 0x20);
+ }
+ else if (index == 3)
+ {
+ return Sse41.Insert(vector, value, 0x30);
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(nameof(index));
+ }
+ }
+ else if (Sse2.IsSupported)
+ {
+ int intValue = BitConverter.SingleToInt32Bits(value);
+
+ ushort low = (ushort)(intValue >> 0);
+ ushort high = (ushort)(intValue >> 16);
+
+ Vector128<ushort> shortVector = Sse.StaticCast<float, ushort>(vector);
+
+ shortVector = Sse2.Insert(shortVector, low, (byte)(index * 2 + 0));
+ shortVector = Sse2.Insert(shortVector, high, (byte)(index * 2 + 1));
+
+ return Sse.StaticCast<ushort, float>(shortVector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> Sse41VectorInsertScalarSingle(float value, Vector128<float> vector)
+ {
+ //Note: 0b1110 is the mask to zero the upper bits.
+ return Sse41.Insert(vector, value, 0b1110);
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<sbyte> VectorSByteZero()
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SetZeroVector128<sbyte>();
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<short> VectorInt16Zero()
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SetZeroVector128<short>();
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<int> VectorInt32Zero()
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SetZeroVector128<int>();
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<long> VectorInt64Zero()
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SetZeroVector128<long>();
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorSingleZero()
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.SetZeroVector128();
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<double> VectorDoubleZero()
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SetZeroVector128<double>();
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorZero32_128(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.And(vector, Zero32128Mask);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<sbyte> VectorSingleToSByte(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, sbyte>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<short> VectorSingleToInt16(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, short>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<int> VectorSingleToInt32(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, int>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<long> VectorSingleToInt64(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, long>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<byte> VectorSingleToByte(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, byte>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<ushort> VectorSingleToUInt16(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, ushort>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<uint> VectorSingleToUInt32(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, uint>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<ulong> VectorSingleToUInt64(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, ulong>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<double> VectorSingleToDouble(Vector128<float> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<float, double>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorSByteToSingle(Vector128<sbyte> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<sbyte, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorInt16ToSingle(Vector128<short> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<short, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorInt32ToSingle(Vector128<int> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<int, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorInt64ToSingle(Vector128<long> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<long, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorByteToSingle(Vector128<byte> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<byte, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorUInt16ToSingle(Vector128<ushort> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<ushort, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorUInt32ToSingle(Vector128<uint> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<uint, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorUInt64ToSingle(Vector128<ulong> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<ulong, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<float> VectorDoubleToSingle(Vector128<double> vector)
+ {
+ if (Sse.IsSupported)
+ {
+ return Sse.StaticCast<double, float>(vector);
+ }
+
+ throw new PlatformNotSupportedException();
+ }
+ }
+}