Update arm vcvt intrinsics to use llvm.fpto(su)i.sat

JamieCunliffe · JamieCunliffe · commit d688b9bbe38f · 2021-08-05T14:29:10.000+01:00
These intrinsics have the correct semantics for the desired fcvtz instruction,
without any undefined behaviour. The previous simd_cast was undefined for
infinite and NaN inputs, which could cause issues.
diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -2427,31 +2427,51 @@ pub unsafe fn vcvtd_u64_f64(a: f64) -> u64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fcvtzs))]
 pub unsafe fn vcvt_s64_f64(a: float64x1_t) -> int64x1_t {
-    simd_cast(a)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptosi.sat.v1i64.v1f64")]
+        fn vcvt_s64_f64_(a: float64x1_t) -> int64x1_t;
+    }
+    vcvt_s64_f64_(a)
 }
 
 /// Floating-point convert to signed fixed-point, rounding toward zero
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fcvtzs))]
 pub unsafe fn vcvtq_s64_f64(a: float64x2_t) -> int64x2_t {
-    simd_cast(a)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptosi.sat.v2i64.v2f64")]
+        fn vcvtq_s64_f64_(a: float64x2_t) -> int64x2_t;
+    }
+    vcvtq_s64_f64_(a)
 }
 
 /// Floating-point convert to unsigned fixed-point, rounding toward zero
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fcvtzu))]
 pub unsafe fn vcvt_u64_f64(a: float64x1_t) -> uint64x1_t {
-    simd_cast(a)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptoui.sat.v1i64.v1f64")]
+        fn vcvt_u64_f64_(a: float64x1_t) -> uint64x1_t;
+    }
+    vcvt_u64_f64_(a)
 }
 
 /// Floating-point convert to unsigned fixed-point, rounding toward zero
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(fcvtzu))]
 pub unsafe fn vcvtq_u64_f64(a: float64x2_t) -> uint64x2_t {
-    simd_cast(a)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptoui.sat.v2i64.v2f64")]
+        fn vcvtq_u64_f64_(a: float64x2_t) -> uint64x2_t;
+    }
+    vcvtq_u64_f64_(a)
 }
 
 /// Floating-point convert to signed integer, rounding to nearest with ties to away
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -2407,7 +2407,13 @@ vcvtq_n_u32_f32_(a, N)
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcvtzs))]
 pub unsafe fn vcvt_s32_f32(a: float32x2_t) -> int32x2_t {
-    simd_cast(a)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.fptosi.sat.v2i32.v2f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptosi.sat.v2i32.v2f32")]
+        fn vcvt_s32_f32_(a: float32x2_t) -> int32x2_t;
+    }
+vcvt_s32_f32_(a)
 }
 
 /// Floating-point convert to signed fixed-point, rounding toward zero
@@ -2417,7 +2423,13 @@ pub unsafe fn vcvt_s32_f32(a: float32x2_t) -> int32x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcvtzs))]
 pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t {
-    simd_cast(a)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.fptosi.sat.v4i32.v4f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptosi.sat.v4i32.v4f32")]
+        fn vcvtq_s32_f32_(a: float32x4_t) -> int32x4_t;
+    }
+vcvtq_s32_f32_(a)
 }
 
 /// Floating-point convert to unsigned fixed-point, rounding toward zero
@@ -2427,7 +2439,13 @@ pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcvtzu))]
 pub unsafe fn vcvt_u32_f32(a: float32x2_t) -> uint32x2_t {
-    simd_cast(a)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.fptoui.sat.v2i32.v2f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptoui.sat.v2i32.v2f32")]
+        fn vcvt_u32_f32_(a: float32x2_t) -> uint32x2_t;
+    }
+vcvt_u32_f32_(a)
 }
 
 /// Floating-point convert to unsigned fixed-point, rounding toward zero
@@ -2437,7 +2455,13 @@ pub unsafe fn vcvt_u32_f32(a: float32x2_t) -> uint32x2_t {
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vcvt))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fcvtzu))]
 pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t {
-    simd_cast(a)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.fptoui.sat.v4i32.v4f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fptoui.sat.v4i32.v4f32")]
+        fn vcvtq_u32_f32_(a: float32x4_t) -> uint32x4_t;
+    }
+vcvtq_u32_f32_(a)
 }
 
 /// Set all vector lanes to the same value
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
@@ -1040,26 +1040,28 @@ generate f32:u32, f64:u64
 /// Floating-point convert to signed fixed-point, rounding toward zero
 name = vcvt
 double-suffixes
-fn = simd_cast
+link-aarch64 = llvm.fptosi.sat._EXT2_._EXT_
 a = -1.1, 2.1, -2.9, 3.9
 validate -1, 2, -2, 3
 
 aarch64 = fcvtzs
 generate float64x1_t:int64x1_t, float64x2_t:int64x2_t
 
+link-arm = llvm.fptosi.sat._EXT2_._EXT_
 arm = vcvt
 generate float32x2_t:int32x2_t, float32x4_t:int32x4_t
 
 /// Floating-point convert to unsigned fixed-point, rounding toward zero
 name = vcvt
 double-suffixes
-fn = simd_cast
+link-aarch64 = llvm.fptoui.sat._EXT2_._EXT_
 a = 1.1, 2.1, 2.9, 3.9
 validate 1, 2, 2, 3
 
 aarch64 = fcvtzu
 generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t
 
+link-arm = llvm.fptoui.sat._EXT2_._EXT_
 arm = vcvt
 generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t
 
diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
@@ -125,6 +125,11 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
                 "usad8" | "vfma" | "vfms" => 27,
                 "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
 
+                // Temporary, currently the fptosi.sat and fptoui.sat LLVM
+                // intrinsics emit unnecessary code on arm. This can be
+                // removed once it has been addressed in LLVM.
+                "fcvtzu" | "fcvtzs" | "vcvt" => 64,
+
                 // Original limit was 20 instructions, but ARM DSP Intrinsics
                 // are exactly 20 instructions long. So, bump the limit to 22
                 // instead of adding here a long list of exceptions.