From f4633aa844942afbb735294e391a9be7bd80e89a Mon Sep 17 00:00:00 2001 From: Thomas Schilling Date: Sun, 22 Oct 2017 17:51:32 +0200 Subject: [PATCH 1/3] Add single output _mm_cvt[t]ss_* variants The *_pi variants are currently blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74 --- src/x86/sse.rs | 205 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) diff --git a/src/x86/sse.rs b/src/x86/sse.rs index c402780bfc..dfdeed43b7 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -598,6 +598,102 @@ pub unsafe fn _mm_ucomineq_ss(a: f32x4, b: f32x4) -> i32 { ucomineq_ss(a, b) } +/// Convert the lowest 32 bit float in the input vector to a 32 bit integer. +/// +/// The result is rounded according to the current rounding mode. If the result +/// cannot be represented as a 32 bit integer the result will be `0x8000_0000` +/// (`std::i32::MIN`) or an invalid operation floating point exception if +/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). +/// +/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvtss2si))] +pub unsafe fn _mm_cvtss_si32(a: f32x4) -> i32 { + cvtss2si(a) +} + +/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvtss2si))] +pub unsafe fn _mm_cvt_ss2si(a: f32x4) -> i32 { + _mm_cvtss_si32(a) +} + +/// Convert the lowest 32 bit float in the input vector to a 64 bit integer. +/// +/// The result is rounded according to the current rounding mode. If the result +/// cannot be represented as a 64 bit integer the result will be +/// `0x8000_0000_0000_0000` (`std::i64::MIN`) or trigger an invalid operation +/// floating point exception if unmasked (see +/// [`_mm_setcsr`](fn._mm_setcsr.html)). +/// +/// This corresponds to the `CVTSS2SI` instruction (with 64 bit output). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvtss2si))] +pub unsafe fn _mm_cvtss_si64(a: f32x4) -> i64 { + cvtss2si64(a) +} + +// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74 +// pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2 +// pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 { _mm_cvtps_pi32(a) } + +/// Convert the lowest 32 bit float in the input vector to a 32 bit integer with +/// truncation. +/// +/// The result is rounded always using truncation (round towards zero). If the +/// result cannot be represented as a 32 bit integer the result will be +/// `0x8000_0000` (`std::i32::MIN`) or an invalid operation floating point +/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). +/// +/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvttss2si))] +pub unsafe fn _mm_cvttss_si32(a: f32x4) -> i32 { + cvttss2si(a) +} + +/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvttss2si))] +pub unsafe fn _mm_cvtt_ss2si(a: f32x4) -> i32 { + _mm_cvttss_si32(a) +} + +/// Convert the lowest 32 bit float in the input vector to a 64 bit integer with +/// truncation. +/// +/// The result is rounded always using truncation (round towards zero). If the +/// result cannot be represented as a 64 bit integer the result will be +/// `0x8000_0000_0000_0000` (`std::i64::MIN`) or an invalid operation floating +/// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). +/// +/// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvttss2si))] +pub unsafe fn _mm_cvttss_si64(a: f32x4) -> i64 { + cvttss2si64(a) +} + +// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74 +// pub unsafe fn _mm_cvttps_pi32(a: f32x4) -> i32x2; +// pub unsafe fn _mm_cvtt_ps2pi(a: f32x4) -> i32x2 { _mm_cvttps_pi32(a) } + +/// Extract the lowest 32 bit float from the input vector. +#[inline(always)] +#[target_feature = "+sse"] +// No point in using assert_instrs. In Unix x86_64 calling convention this is a +// no-op, and on Windows it's just a `mov`. +pub unsafe fn _mm_cvtss_f32(a: f32x4) -> f32 { + a.extract(0) +} + /// Construct a `f32x4` with the lowest element set to `a` and the rest set to /// zero. #[inline(always)] @@ -1542,6 +1638,14 @@ extern { fn ucomige_ss(a: f32x4, b: f32x4) -> i32; #[link_name = "llvm.x86.sse.ucomineq.ss"] fn ucomineq_ss(a: f32x4, b: f32x4) -> i32; + #[link_name = "llvm.x86.sse.cvtss2si"] + fn cvtss2si(a: f32x4) -> i32; + #[link_name = "llvm.x86.sse.cvtss2si64"] + fn cvtss2si64(a: f32x4) -> i64; + #[link_name = "llvm.x86.sse.cvttss2si"] + fn cvttss2si(a: f32x4) -> i32; + #[link_name = "llvm.x86.sse.cvttss2si64"] + fn cvttss2si64(a: f32x4) -> i64; #[link_name = "llvm.x86.sse.sfence"] fn sfence(); #[link_name = "llvm.x86.sse.stmxcsr"] @@ -2532,6 +2636,107 @@ mod tests { } } + #[simd_test = "sse"] + unsafe fn _mm_cvtss_si32() { + use std::f32::NAN; + use std::i32::MIN; + let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; + let result = &[42i32, -3, MIN, 0, MIN, 2147483520]; + for i in 0..inputs.len() { + let x = f32x4::new(inputs[i], 1.0, 3.0, 4.0); + let e = result[i]; + let r = sse::_mm_cvtss_si32(x); + assert_eq!(e, r, + "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", + i, x, r, e); + } + } + + #[simd_test = "sse"] + unsafe fn _mm_cvtss_si64() { + use std::f32::NAN; + use std::i64::MIN; + let inputs = &[ + (42.0f32, 42i64), + (-31.4, -31), + (-33.5, -34), + (-34.5, -34), + (4.0e10, 40_000_000_000), + (4.0e-10, 0), + (NAN, MIN), + (2147483500.1, 2147483520), + (9.223371e18, 9223370937343148032) + ]; + for i in 0..inputs.len() { + let (xi, e) = inputs[i]; + let x = f32x4::new(xi, 1.0, 3.0, 4.0); + let r = sse::_mm_cvtss_si64(x); + assert_eq!(e, r, + "TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}", + i, x, r, e); + } + } + + #[simd_test = "sse"] + unsafe fn _mm_cvttss_si32() { + use std::f32::NAN; + use std::i32::MIN; + let inputs = &[ + (42.0f32, 42i32), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, MIN), + (4.0e-10, 0), + (NAN, MIN), + (2147483500.1, 2147483520), + ]; + for i in 0..inputs.len() { + let (xi, e) = inputs[i]; + let x = f32x4::new(xi, 1.0, 3.0, 4.0); + let r = sse::_mm_cvttss_si32(x); + assert_eq!(e, r, + "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", + i, x, r, e); + } + } + + #[simd_test = "sse"] + unsafe fn _mm_cvttss_si64() { + use std::f32::NAN; + use std::i64::MIN; + let inputs = &[ + (42.0f32, 42i64), + (-31.4, -31), + (-33.5, -33), + (-34.5, -34), + (10.999, 10), + (-5.99, -5), + (4.0e10, 40_000_000_000), + (4.0e-10, 0), + (NAN, MIN), + (2147483500.1, 2147483520), + (9.223371e18, 9223370937343148032), + (9.223372e18, MIN), + ]; + for i in 0..inputs.len() { + let (xi, e) = inputs[i]; + let x = f32x4::new(xi, 1.0, 3.0, 4.0); + let r = sse::_mm_cvttss_si64(x); + assert_eq!(e, r, + "TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}", + i, x, r, e); + } + } + + #[simd_test = "sse"] + pub unsafe fn _mm_cvtss_f32() { + let a = f32x4::new(312.0134, 5.0, 6.0, 7.0); + assert_eq!(sse::_mm_cvtss_f32(a), 312.0134); + } + #[simd_test = "sse"] unsafe fn _mm_set_ss() { let r = sse::_mm_set_ss(black_box(4.25)); From cd253475acc7ce049ee2bdcbdf815ccbb5683935 Mon Sep 17 00:00:00 2001 From: Thomas Schilling Date: Sun, 22 Oct 2017 18:27:02 +0200 Subject: [PATCH 2/3] Add _mm_cvtsi*_ss The _mm_cvtpi*_ps intrinsics are blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74 --- src/x86/sse.rs | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/x86/sse.rs b/src/x86/sse.rs index dfdeed43b7..717246d04f 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -694,6 +694,44 @@ pub unsafe fn _mm_cvtss_f32(a: f32x4) -> f32 { a.extract(0) } +/// Convert a 32 bit integer to a 32 bit float. The result vector is the input +/// vector `a` with the lowest 32 bit float replaced by the converted integer. +/// +/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit +/// input). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvtsi2ssl))] +pub unsafe fn _mm_cvtsi32_ss(a: f32x4, b: i32) -> f32x4 { + a.replace(0, b as f32) +} + +/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvtsi2ssl))] +pub unsafe fn _mm_cvt_si2ss(a: f32x4, b: i32) -> f32x4 { + _mm_cvtsi32_ss(a, b) +} + +/// Convert a 64 bit integer to a 32 bit float. The result vector is the input +/// vector `a` with the lowest 32 bit float replaced by the converted integer. +/// +/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit +/// input). +#[inline(always)] +#[target_feature = "+sse"] +#[cfg_attr(test, assert_instr(cvtsi2ssq))] +pub unsafe fn _mm_cvtsi64_ss(a: f32x4, b: i64) -> f32x4 { + a.replace(0, b as f32) +} + +// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74 +// pub unsafe fn _mm_cvtpi32_ps(a: f32x4, b: i32x2) -> f32x4 +// pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 { +// _mm_cvtpi32_ps(a, b) +// } + /// Construct a `f32x4` with the lowest element set to `a` and the rest set to /// zero. #[inline(always)] @@ -2731,6 +2769,48 @@ mod tests { } } + #[simd_test = "sse"] + pub unsafe fn _mm_cvtsi32_ss() { + let inputs = &[ + (4555i32, 4555.0f32), + (322223333, 322223330.0), + (-432, -432.0), + (-322223333, -322223330.0) + ]; + + for i in 0..inputs.len() { + let (x, f) = inputs[i]; + let a = f32x4::new(5.0, 6.0, 7.0, 8.0); + let r = sse::_mm_cvtsi32_ss(a, x); + let e = a.replace(0, f); + assert_eq!(e, r, + "TestCase #{} _mm_cvtsi32_ss({:?}, {}) = {:?}, expected: {:?}", + i, a, x, r, e); + } + } + + #[simd_test = "sse"] + pub unsafe fn _mm_cvtsi64_ss() { + let inputs = &[ + (4555i64, 4555.0f32), + (322223333, 322223330.0), + (-432, -432.0), + (-322223333, -322223330.0), + (9223372036854775807, 9.223372e18), + (-9223372036854775808, -9.223372e18) + ]; + + for i in 0..inputs.len() { + let (x, f) = inputs[i]; + let a = f32x4::new(5.0, 6.0, 7.0, 8.0); + let r = sse::_mm_cvtsi64_ss(a, x); + let e = a.replace(0, f); + assert_eq!(e, r, + "TestCase #{} _mm_cvtsi64_ss({:?}, {}) = {:?}, expected: {:?}", + i, a, x, r, e); + } + } + #[simd_test = "sse"] pub unsafe fn _mm_cvtss_f32() { let a = f32x4::new(312.0134, 5.0, 6.0, 7.0); From b37e1e2e6c6ebe1f0a588852dbf22fbde2220986 Mon Sep 17 00:00:00 2001 From: Thomas Schilling Date: Sun, 22 Oct 2017 21:15:20 +0200 Subject: [PATCH 3/3] Fix Linux builds Also the si64 variants are only available on x86_64 --- src/x86/sse.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/x86/sse.rs b/src/x86/sse.rs index 717246d04f..70471134db 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -633,6 +633,7 @@ pub unsafe fn _mm_cvt_ss2si(a: f32x4) -> i32 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cvtss2si))] +#[cfg(target_arch = "x86_64")] pub unsafe fn _mm_cvtss_si64(a: f32x4) -> i64 { cvtss2si64(a) } @@ -677,6 +678,7 @@ pub unsafe fn _mm_cvtt_ss2si(a: f32x4) -> i32 { #[inline(always)] #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(cvttss2si))] +#[cfg(target_arch = "x86_64")] pub unsafe fn _mm_cvttss_si64(a: f32x4) -> i64 { cvttss2si64(a) } @@ -701,7 +703,8 @@ pub unsafe fn _mm_cvtss_f32(a: f32x4) -> f32 { /// input). #[inline(always)] #[target_feature = "+sse"] -#[cfg_attr(test, assert_instr(cvtsi2ssl))] +#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssl))] +#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvtsi32_ss(a: f32x4, b: i32) -> f32x4 { a.replace(0, b as f32) } @@ -709,7 +712,8 @@ pub unsafe fn _mm_cvtsi32_ss(a: f32x4, b: i32) -> f32x4 { /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). #[inline(always)] #[target_feature = "+sse"] -#[cfg_attr(test, assert_instr(cvtsi2ssl))] +#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssl))] +#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvt_si2ss(a: f32x4, b: i32) -> f32x4 { _mm_cvtsi32_ss(a, b) } @@ -721,7 +725,9 @@ pub unsafe fn _mm_cvt_si2ss(a: f32x4, b: i32) -> f32x4 { /// input). #[inline(always)] #[target_feature = "+sse"] -#[cfg_attr(test, assert_instr(cvtsi2ssq))] +#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssq))] +#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))] +#[cfg(target_arch = "x86_64")] pub unsafe fn _mm_cvtsi64_ss(a: f32x4, b: i64) -> f32x4 { a.replace(0, b as f32) } @@ -1679,10 +1685,12 @@ extern { #[link_name = "llvm.x86.sse.cvtss2si"] fn cvtss2si(a: f32x4) -> i32; #[link_name = "llvm.x86.sse.cvtss2si64"] + #[cfg(target_arch = "x86_64")] fn cvtss2si64(a: f32x4) -> i64; #[link_name = "llvm.x86.sse.cvttss2si"] fn cvttss2si(a: f32x4) -> i32; #[link_name = "llvm.x86.sse.cvttss2si64"] + #[cfg(target_arch = "x86_64")] fn cvttss2si64(a: f32x4) -> i64; #[link_name = "llvm.x86.sse.sfence"] fn sfence(); @@ -2691,6 +2699,7 @@ mod tests { } #[simd_test = "sse"] + #[cfg(target_arch = "x86_64")] unsafe fn _mm_cvtss_si64() { use std::f32::NAN; use std::i64::MIN; @@ -2742,6 +2751,7 @@ mod tests { } #[simd_test = "sse"] + #[cfg(target_arch = "x86_64")] unsafe fn _mm_cvttss_si64() { use std::f32::NAN; use std::i64::MIN; @@ -2790,6 +2800,7 @@ mod tests { } #[simd_test = "sse"] + #[cfg(target_arch = "x86_64")] pub unsafe fn _mm_cvtsi64_ss() { let inputs = &[ (4555i64, 4555.0f32),