From f4633aa844942afbb735294e391a9be7bd80e89a Mon Sep 17 00:00:00 2001
From: Thomas Schilling <nominolo@googlemail.com>
Date: Sun, 22 Oct 2017 17:51:32 +0200
Subject: [PATCH 1/3] Add single output _mm_cvt[t]ss_* variants

The *_pi variants are currently blocked by
https://github.com/rust-lang-nursery/stdsimd/issues/74
---
 src/x86/sse.rs | 205 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)

diff --git a/src/x86/sse.rs b/src/x86/sse.rs
index c402780bfc..dfdeed43b7 100644
--- a/src/x86/sse.rs
+++ b/src/x86/sse.rs
@@ -598,6 +598,102 @@ pub unsafe fn _mm_ucomineq_ss(a: f32x4, b: f32x4) -> i32 {
     ucomineq_ss(a, b)
 }
 
+/// Convert the lowest 32 bit float in the input vector to a 32 bit integer.
+///
+/// The result is rounded according to the current rounding mode. If the result
+/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
+/// (`std::i32::MIN`) or trigger an invalid operation floating point exception
+/// if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+pub unsafe fn _mm_cvtss_si32(a: f32x4) -> i32 {
+    cvtss2si(a)
+}
+
+/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+pub unsafe fn _mm_cvt_ss2si(a: f32x4) -> i32 {
+    _mm_cvtss_si32(a)
+}
+
+/// Convert the lowest 32 bit float in the input vector to a 64 bit integer.
+///
+/// The result is rounded according to the current rounding mode. If the result
+/// cannot be represented as a 64 bit integer the result will be
+/// `0x8000_0000_0000_0000` (`std::i64::MIN`) or trigger an invalid operation
+/// floating point exception if unmasked (see
+/// [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTSS2SI` instruction (with 64 bit output).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtss2si))]
+pub unsafe fn _mm_cvtss_si64(a: f32x4) -> i64 {
+    cvtss2si64(a)
+}
+
+// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
+// pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2
+// pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 { _mm_cvtps_pi32(a) }
+
+/// Convert the lowest 32 bit float in the input vector to a 32 bit integer with
+/// truncation.
+///
+/// The result is always rounded using truncation (round towards zero). If the
+/// result cannot be represented as a 32 bit integer the result will be
+/// `0x8000_0000` (`std::i32::MIN`) or trigger an invalid operation floating
+/// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+pub unsafe fn _mm_cvttss_si32(a: f32x4) -> i32 {
+    cvttss2si(a)
+}
+
+/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+pub unsafe fn _mm_cvtt_ss2si(a: f32x4) -> i32 {
+    _mm_cvttss_si32(a)
+}
+
+/// Convert the lowest 32 bit float in the input vector to a 64 bit integer with
+/// truncation.
+///
+/// The result is always rounded using truncation (round towards zero). If the
+/// result cannot be represented as a 64 bit integer, `0x8000_0000_0000_0000`
+/// (`std::i64::MIN`) is returned, or an invalid operation floating point
+/// exception is raised if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+///
+/// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvttss2si))]
+pub unsafe fn _mm_cvttss_si64(a: f32x4) -> i64 {
+    cvttss2si64(a)
+}
+
+// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
+// pub unsafe fn _mm_cvttps_pi32(a: f32x4) -> i32x2;
+// pub unsafe fn _mm_cvtt_ps2pi(a: f32x4) -> i32x2 { _mm_cvttps_pi32(a) }
+
+/// Extract the lowest 32 bit float from the input vector.
+#[inline(always)]
+#[target_feature = "+sse"]
+// No point in using `assert_instr`. In the Unix x86_64 calling convention
+// this is a no-op, and on Windows it's just a `mov`.
+pub unsafe fn _mm_cvtss_f32(a: f32x4) -> f32 {
+    a.extract(0)
+}
+
 /// Construct a `f32x4` with the lowest element set to `a` and the rest set to
 /// zero.
 #[inline(always)]
@@ -1542,6 +1638,14 @@ extern {
     fn ucomige_ss(a: f32x4, b: f32x4) -> i32;
     #[link_name = "llvm.x86.sse.ucomineq.ss"]
     fn ucomineq_ss(a: f32x4, b: f32x4) -> i32;
+    #[link_name = "llvm.x86.sse.cvtss2si"]
+    fn cvtss2si(a: f32x4) -> i32;
+    #[link_name = "llvm.x86.sse.cvtss2si64"]
+    fn cvtss2si64(a: f32x4) -> i64;
+    #[link_name = "llvm.x86.sse.cvttss2si"]
+    fn cvttss2si(a: f32x4) -> i32;
+    #[link_name = "llvm.x86.sse.cvttss2si64"]
+    fn cvttss2si64(a: f32x4) -> i64;
     #[link_name = "llvm.x86.sse.sfence"]
     fn sfence();
     #[link_name = "llvm.x86.sse.stmxcsr"]
@@ -2532,6 +2636,107 @@ mod tests {
         }
     }
 
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtss_si32() {
+        use std::f32::NAN;
+        use std::i32::MIN;
+        let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
+        let result = &[42i32,   -3,   MIN,    0,       MIN, 2147483520];
+        for i in 0..inputs.len() {
+            let x = f32x4::new(inputs[i], 1.0, 3.0, 4.0);
+            let e = result[i];
+            let r = sse::_mm_cvtss_si32(x);
+            assert_eq!(e, r,
+                "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
+                i, x, r, e);
+        }
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtss_si64() {
+        use std::f32::NAN;
+        use std::i64::MIN;
+        let inputs = &[
+            (42.0f32,  42i64),
+            (-31.4,   -31),
+            (-33.5,   -34),
+            (-34.5,   -34),
+            (4.0e10,  40_000_000_000),
+            (4.0e-10, 0),
+            (NAN, MIN),
+            (2147483500.1, 2147483520),
+            (9.223371e18, 9223370937343148032)
+        ];
+        for i in 0..inputs.len() {
+            let (xi, e) = inputs[i];
+            let x = f32x4::new(xi, 1.0, 3.0, 4.0);
+            let r = sse::_mm_cvtss_si64(x);
+            assert_eq!(e, r,
+                "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}",
+                i, x, r, e);
+        }
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvttss_si32() {
+        use std::f32::NAN;
+        use std::i32::MIN;
+        let inputs = &[
+            (42.0f32,  42i32),
+            (-31.4,   -31),
+            (-33.5,   -33),
+            (-34.5,   -34),
+            (10.999,   10),
+            (-5.99,    -5),
+            (4.0e10,  MIN),
+            (4.0e-10, 0),
+            (NAN, MIN),
+            (2147483500.1, 2147483520),
+        ];
+        for i in 0..inputs.len() {
+            let (xi, e) = inputs[i];
+            let x = f32x4::new(xi, 1.0, 3.0, 4.0);
+            let r = sse::_mm_cvttss_si32(x);
+            assert_eq!(e, r,
+                "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
+                i, x, r, e);
+        }
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvttss_si64() {
+        use std::f32::NAN;
+        use std::i64::MIN;
+        let inputs = &[
+            (42.0f32,  42i64),
+            (-31.4,   -31),
+            (-33.5,   -33),
+            (-34.5,   -34),
+            (10.999,   10),
+            (-5.99,    -5),
+            (4.0e10,  40_000_000_000),
+            (4.0e-10, 0),
+            (NAN, MIN),
+            (2147483500.1, 2147483520),
+            (9.223371e18, 9223370937343148032),
+            (9.223372e18, MIN),
+        ];
+        for i in 0..inputs.len() {
+            let (xi, e) = inputs[i];
+            let x = f32x4::new(xi, 1.0, 3.0, 4.0);
+            let r = sse::_mm_cvttss_si64(x);
+            assert_eq!(e, r,
+                "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}",
+                i, x, r, e);
+        }
+    }
+
+    #[simd_test = "sse"]
+    pub unsafe fn _mm_cvtss_f32() {
+        let a = f32x4::new(312.0134, 5.0, 6.0, 7.0);
+        assert_eq!(sse::_mm_cvtss_f32(a), 312.0134);
+    }
+
     #[simd_test = "sse"]
     unsafe fn _mm_set_ss() {
         let r = sse::_mm_set_ss(black_box(4.25));

From cd253475acc7ce049ee2bdcbdf815ccbb5683935 Mon Sep 17 00:00:00 2001
From: Thomas Schilling <nominolo@googlemail.com>
Date: Sun, 22 Oct 2017 18:27:02 +0200
Subject: [PATCH 2/3] Add _mm_cvtsi*_ss

The _mm_cvtpi*_ps intrinsics are blocked by
https://github.com/rust-lang-nursery/stdsimd/issues/74
---
 src/x86/sse.rs | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/src/x86/sse.rs b/src/x86/sse.rs
index dfdeed43b7..717246d04f 100644
--- a/src/x86/sse.rs
+++ b/src/x86/sse.rs
@@ -694,6 +694,44 @@ pub unsafe fn _mm_cvtss_f32(a: f32x4) -> f32 {
     a.extract(0)
 }
 
+/// Convert a 32 bit integer to a 32 bit float. The result vector is the input
+/// vector `a` with the lowest 32 bit float replaced by the converted integer.
+///
+/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
+/// input).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtsi2ssl))]
+pub unsafe fn _mm_cvtsi32_ss(a: f32x4, b: i32) -> f32x4 {
+    a.replace(0, b as f32)
+}
+
+/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtsi2ssl))]
+pub unsafe fn _mm_cvt_si2ss(a: f32x4, b: i32) -> f32x4 {
+    _mm_cvtsi32_ss(a, b)
+}
+
+/// Convert a 64 bit integer to a 32 bit float. The result vector is the input
+/// vector `a` with the lowest 32 bit float replaced by the converted integer.
+///
+/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit
+/// input).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtsi2ssq))]
+pub unsafe fn _mm_cvtsi64_ss(a: f32x4, b: i64) -> f32x4 {
+    a.replace(0, b as f32)
+}
+
+// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
+// pub unsafe fn _mm_cvtpi32_ps(a: f32x4, b: i32x2) -> f32x4
+// pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
+//     _mm_cvtpi32_ps(a, b)
+// }
+
 /// Construct a `f32x4` with the lowest element set to `a` and the rest set to
 /// zero.
 #[inline(always)]
@@ -2731,6 +2769,48 @@ mod tests {
         }
     }
 
+    #[simd_test = "sse"]
+    pub unsafe fn _mm_cvtsi32_ss() {
+        let inputs = &[
+            (4555i32,   4555.0f32),
+            (322223333, 322223330.0),
+            (-432,      -432.0),
+            (-322223333, -322223330.0)
+        ];
+
+        for i in 0..inputs.len() {
+            let (x, f) = inputs[i];
+            let a = f32x4::new(5.0, 6.0, 7.0, 8.0);
+            let r = sse::_mm_cvtsi32_ss(a, x);
+            let e = a.replace(0, f);
+            assert_eq!(e, r,
+                "TestCase #{} _mm_cvtsi32_ss({:?}, {}) = {:?}, expected: {:?}",
+                i, a, x, r, e);
+        }
+    }
+
+    #[simd_test = "sse"]
+    pub unsafe fn _mm_cvtsi64_ss() {
+        let inputs = &[
+            (4555i64,   4555.0f32),
+            (322223333, 322223330.0),
+            (-432,      -432.0),
+            (-322223333, -322223330.0),
+            (9223372036854775807, 9.223372e18),
+            (-9223372036854775808, -9.223372e18)
+        ];
+
+        for i in 0..inputs.len() {
+            let (x, f) = inputs[i];
+            let a = f32x4::new(5.0, 6.0, 7.0, 8.0);
+            let r = sse::_mm_cvtsi64_ss(a, x);
+            let e = a.replace(0, f);
+            assert_eq!(e, r,
+                "TestCase #{} _mm_cvtsi64_ss({:?}, {}) = {:?}, expected: {:?}",
+                i, a, x, r, e);
+        }
+    }
+
     #[simd_test = "sse"]
     pub unsafe fn _mm_cvtss_f32() {
         let a = f32x4::new(312.0134, 5.0, 6.0, 7.0);

From b37e1e2e6c6ebe1f0a588852dbf22fbde2220986 Mon Sep 17 00:00:00 2001
From: Thomas Schilling <nominolo@googlemail.com>
Date: Sun, 22 Oct 2017 21:15:20 +0200
Subject: [PATCH 3/3] Fix Linux builds

Also the si64 variants are only available on x86_64
---
 src/x86/sse.rs | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/x86/sse.rs b/src/x86/sse.rs
index 717246d04f..70471134db 100644
--- a/src/x86/sse.rs
+++ b/src/x86/sse.rs
@@ -633,6 +633,7 @@ pub unsafe fn _mm_cvt_ss2si(a: f32x4) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(cvtss2si))]
+#[cfg(target_arch = "x86_64")]
 pub unsafe fn _mm_cvtss_si64(a: f32x4) -> i64 {
     cvtss2si64(a)
 }
@@ -677,6 +678,7 @@ pub unsafe fn _mm_cvtt_ss2si(a: f32x4) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(cvttss2si))]
+#[cfg(target_arch = "x86_64")]
 pub unsafe fn _mm_cvttss_si64(a: f32x4) -> i64 {
     cvttss2si64(a)
 }
@@ -701,7 +703,8 @@ pub unsafe fn _mm_cvtss_f32(a: f32x4) -> f32 {
 /// input).
 #[inline(always)]
 #[target_feature = "+sse"]
-#[cfg_attr(test, assert_instr(cvtsi2ssl))]
+#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssl))]
+#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))]
 pub unsafe fn _mm_cvtsi32_ss(a: f32x4, b: i32) -> f32x4 {
     a.replace(0, b as f32)
 }
@@ -709,7 +712,8 @@ pub unsafe fn _mm_cvtsi32_ss(a: f32x4, b: i32) -> f32x4 {
 /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
 #[inline(always)]
 #[target_feature = "+sse"]
-#[cfg_attr(test, assert_instr(cvtsi2ssl))]
+#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssl))]
+#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))]
 pub unsafe fn _mm_cvt_si2ss(a: f32x4, b: i32) -> f32x4 {
     _mm_cvtsi32_ss(a, b)
 }
@@ -721,7 +725,9 @@ pub unsafe fn _mm_cvt_si2ss(a: f32x4, b: i32) -> f32x4 {
 /// input).
 #[inline(always)]
 #[target_feature = "+sse"]
-#[cfg_attr(test, assert_instr(cvtsi2ssq))]
+#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssq))]
+#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))]
+#[cfg(target_arch = "x86_64")]
 pub unsafe fn _mm_cvtsi64_ss(a: f32x4, b: i64) -> f32x4 {
     a.replace(0, b as f32)
 }
@@ -1679,10 +1685,12 @@ extern {
     #[link_name = "llvm.x86.sse.cvtss2si"]
     fn cvtss2si(a: f32x4) -> i32;
     #[link_name = "llvm.x86.sse.cvtss2si64"]
+    #[cfg(target_arch = "x86_64")]
     fn cvtss2si64(a: f32x4) -> i64;
     #[link_name = "llvm.x86.sse.cvttss2si"]
     fn cvttss2si(a: f32x4) -> i32;
     #[link_name = "llvm.x86.sse.cvttss2si64"]
+    #[cfg(target_arch = "x86_64")]
     fn cvttss2si64(a: f32x4) -> i64;
     #[link_name = "llvm.x86.sse.sfence"]
     fn sfence();
@@ -2691,6 +2699,7 @@ mod tests {
     }
 
     #[simd_test = "sse"]
+    #[cfg(target_arch = "x86_64")]
     unsafe fn _mm_cvtss_si64() {
         use std::f32::NAN;
         use std::i64::MIN;
@@ -2742,6 +2751,7 @@ mod tests {
     }
 
     #[simd_test = "sse"]
+    #[cfg(target_arch = "x86_64")]
     unsafe fn _mm_cvttss_si64() {
         use std::f32::NAN;
         use std::i64::MIN;
@@ -2790,6 +2800,7 @@ mod tests {
     }
 
     #[simd_test = "sse"]
+    #[cfg(target_arch = "x86_64")]
     pub unsafe fn _mm_cvtsi64_ss() {
         let inputs = &[
             (4555i64,   4555.0f32),