Skip to content

Commit 2afb984

Browse files
nominolo authored and alexcrichton committed
Add some SSE _mm_cvt* instructions (#136)
* Add single output _mm_cvt[t]ss_* variants
  The *_pi variants are currently blocked by #74
* Add _mm_cvtsi*_ss
  The _mm_cvtpi*_ps intrinsics are blocked by #74
* Fix Linux builds
  Also the si64 variants are only available on x86_64
1 parent 8d87901 commit 2afb984

File tree

1 file changed

+296
-0
lines changed

1 file changed

+296
-0
lines changed

src/x86/sse.rs

Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,146 @@ pub unsafe fn _mm_ucomineq_ss(a: f32x4, b: f32x4) -> i32 {
598598
ucomineq_ss(a, b)
599599
}
600600

601+
/// Convert the lowest 32 bit float in the input vector to a 32 bit integer.
602+
///
603+
/// The result is rounded according to the current rounding mode. If the result
604+
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
605+
/// (`std::i32::MIN`) or an invalid operation floating point exception if
606+
/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
607+
///
608+
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
609+
#[inline(always)]
610+
#[target_feature = "+sse"]
611+
#[cfg_attr(test, assert_instr(cvtss2si))]
612+
pub unsafe fn _mm_cvtss_si32(a: f32x4) -> i32 {
613+
cvtss2si(a)
614+
}
615+
616+
/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
617+
#[inline(always)]
618+
#[target_feature = "+sse"]
619+
#[cfg_attr(test, assert_instr(cvtss2si))]
620+
pub unsafe fn _mm_cvt_ss2si(a: f32x4) -> i32 {
621+
_mm_cvtss_si32(a)
622+
}
623+
624+
/// Convert the lowest 32 bit float in the input vector to a 64 bit integer.
625+
///
626+
/// The result is rounded according to the current rounding mode. If the result
627+
/// cannot be represented as a 64 bit integer the result will be
628+
/// `0x8000_0000_0000_0000` (`std::i64::MIN`) or trigger an invalid operation
629+
/// floating point exception if unmasked (see
630+
/// [`_mm_setcsr`](fn._mm_setcsr.html)).
631+
///
632+
/// This corresponds to the `CVTSS2SI` instruction (with 64 bit output).
633+
#[inline(always)]
634+
#[target_feature = "+sse"]
635+
#[cfg_attr(test, assert_instr(cvtss2si))]
636+
#[cfg(target_arch = "x86_64")]
637+
pub unsafe fn _mm_cvtss_si64(a: f32x4) -> i64 {
638+
cvtss2si64(a)
639+
}
640+
641+
// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
642+
// pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2
643+
// pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 { _mm_cvtps_pi32(a) }
644+
645+
/// Convert the lowest 32 bit float in the input vector to a 32 bit integer with
646+
/// truncation.
647+
///
648+
/// The result is rounded always using truncation (round towards zero). If the
649+
/// result cannot be represented as a 32 bit integer the result will be
650+
/// `0x8000_0000` (`std::i32::MIN`) or an invalid operation floating point
651+
/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
652+
///
653+
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
654+
#[inline(always)]
655+
#[target_feature = "+sse"]
656+
#[cfg_attr(test, assert_instr(cvttss2si))]
657+
pub unsafe fn _mm_cvttss_si32(a: f32x4) -> i32 {
658+
cvttss2si(a)
659+
}
660+
661+
/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
662+
#[inline(always)]
663+
#[target_feature = "+sse"]
664+
#[cfg_attr(test, assert_instr(cvttss2si))]
665+
pub unsafe fn _mm_cvtt_ss2si(a: f32x4) -> i32 {
666+
_mm_cvttss_si32(a)
667+
}
668+
669+
/// Convert the lowest 32 bit float in the input vector to a 64 bit integer with
670+
/// truncation.
671+
///
672+
/// The result is rounded always using truncation (round towards zero). If the
673+
/// result cannot be represented as a 64 bit integer the result will be
674+
/// `0x8000_0000_0000_0000` (`std::i64::MIN`) or an invalid operation floating
675+
/// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
676+
///
677+
/// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output).
678+
#[inline(always)]
679+
#[target_feature = "+sse"]
680+
#[cfg_attr(test, assert_instr(cvttss2si))]
681+
#[cfg(target_arch = "x86_64")]
682+
pub unsafe fn _mm_cvttss_si64(a: f32x4) -> i64 {
683+
cvttss2si64(a)
684+
}
685+
686+
// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
687+
// pub unsafe fn _mm_cvttps_pi32(a: f32x4) -> i32x2;
688+
// pub unsafe fn _mm_cvtt_ps2pi(a: f32x4) -> i32x2 { _mm_cvttps_pi32(a) }
689+
690+
/// Extract the lowest 32 bit float from the input vector.
691+
#[inline(always)]
692+
#[target_feature = "+sse"]
693+
// No point in using assert_instrs. In Unix x86_64 calling convention this is a
694+
// no-op, and on Windows it's just a `mov`.
695+
pub unsafe fn _mm_cvtss_f32(a: f32x4) -> f32 {
696+
a.extract(0)
697+
}
698+
699+
/// Convert a 32 bit integer to a 32 bit float. The result vector is the input
700+
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
701+
///
702+
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
703+
/// input).
704+
#[inline(always)]
705+
#[target_feature = "+sse"]
706+
#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssl))]
707+
#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))]
708+
pub unsafe fn _mm_cvtsi32_ss(a: f32x4, b: i32) -> f32x4 {
709+
a.replace(0, b as f32)
710+
}
711+
712+
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
713+
#[inline(always)]
714+
#[target_feature = "+sse"]
715+
#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssl))]
716+
#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))]
717+
pub unsafe fn _mm_cvt_si2ss(a: f32x4, b: i32) -> f32x4 {
718+
_mm_cvtsi32_ss(a, b)
719+
}
720+
721+
/// Convert a 64 bit integer to a 32 bit float. The result vector is the input
722+
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
723+
///
724+
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit
725+
/// input).
726+
#[inline(always)]
727+
#[target_feature = "+sse"]
728+
#[cfg_attr(all(test, target_os = "macos"), assert_instr(cvtsi2ssq))]
729+
#[cfg_attr(all(test, not(target_os = "macos")), assert_instr(cvtsi2ss))]
730+
#[cfg(target_arch = "x86_64")]
731+
pub unsafe fn _mm_cvtsi64_ss(a: f32x4, b: i64) -> f32x4 {
732+
a.replace(0, b as f32)
733+
}
734+
735+
// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
736+
// pub unsafe fn _mm_cvtpi32_ps(a: f32x4, b: i32x2) -> f32x4
737+
// pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
738+
// _mm_cvtpi32_ps(a, b)
739+
// }
740+
601741
/// Construct a `f32x4` with the lowest element set to `a` and the rest set to
602742
/// zero.
603743
#[inline(always)]
@@ -1542,6 +1682,16 @@ extern {
15421682
fn ucomige_ss(a: f32x4, b: f32x4) -> i32;
15431683
#[link_name = "llvm.x86.sse.ucomineq.ss"]
15441684
fn ucomineq_ss(a: f32x4, b: f32x4) -> i32;
1685+
#[link_name = "llvm.x86.sse.cvtss2si"]
1686+
fn cvtss2si(a: f32x4) -> i32;
1687+
#[link_name = "llvm.x86.sse.cvtss2si64"]
1688+
#[cfg(target_arch = "x86_64")]
1689+
fn cvtss2si64(a: f32x4) -> i64;
1690+
#[link_name = "llvm.x86.sse.cvttss2si"]
1691+
fn cvttss2si(a: f32x4) -> i32;
1692+
#[link_name = "llvm.x86.sse.cvttss2si64"]
1693+
#[cfg(target_arch = "x86_64")]
1694+
fn cvttss2si64(a: f32x4) -> i64;
15451695
#[link_name = "llvm.x86.sse.sfence"]
15461696
fn sfence();
15471697
#[link_name = "llvm.x86.sse.stmxcsr"]
@@ -2532,6 +2682,152 @@ mod tests {
25322682
}
25332683
}
25342684

2685+
#[simd_test = "sse"]
2686+
unsafe fn _mm_cvtss_si32() {
2687+
use std::f32::NAN;
2688+
use std::i32::MIN;
2689+
let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2690+
let result = &[42i32, -3, MIN, 0, MIN, 2147483520];
2691+
for i in 0..inputs.len() {
2692+
let x = f32x4::new(inputs[i], 1.0, 3.0, 4.0);
2693+
let e = result[i];
2694+
let r = sse::_mm_cvtss_si32(x);
2695+
assert_eq!(e, r,
2696+
"TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2697+
i, x, r, e);
2698+
}
2699+
}
2700+
2701+
#[simd_test = "sse"]
2702+
#[cfg(target_arch = "x86_64")]
2703+
unsafe fn _mm_cvtss_si64() {
2704+
use std::f32::NAN;
2705+
use std::i64::MIN;
2706+
let inputs = &[
2707+
(42.0f32, 42i64),
2708+
(-31.4, -31),
2709+
(-33.5, -34),
2710+
(-34.5, -34),
2711+
(4.0e10, 40_000_000_000),
2712+
(4.0e-10, 0),
2713+
(NAN, MIN),
2714+
(2147483500.1, 2147483520),
2715+
(9.223371e18, 9223370937343148032)
2716+
];
2717+
for i in 0..inputs.len() {
2718+
let (xi, e) = inputs[i];
2719+
let x = f32x4::new(xi, 1.0, 3.0, 4.0);
2720+
let r = sse::_mm_cvtss_si64(x);
2721+
assert_eq!(e, r,
2722+
"TestCase #{} _mm_cvtss_si64({:?}) = {}, expected: {}",
2723+
i, x, r, e);
2724+
}
2725+
}
2726+
2727+
#[simd_test = "sse"]
2728+
unsafe fn _mm_cvttss_si32() {
2729+
use std::f32::NAN;
2730+
use std::i32::MIN;
2731+
let inputs = &[
2732+
(42.0f32, 42i32),
2733+
(-31.4, -31),
2734+
(-33.5, -33),
2735+
(-34.5, -34),
2736+
(10.999, 10),
2737+
(-5.99, -5),
2738+
(4.0e10, MIN),
2739+
(4.0e-10, 0),
2740+
(NAN, MIN),
2741+
(2147483500.1, 2147483520),
2742+
];
2743+
for i in 0..inputs.len() {
2744+
let (xi, e) = inputs[i];
2745+
let x = f32x4::new(xi, 1.0, 3.0, 4.0);
2746+
let r = sse::_mm_cvttss_si32(x);
2747+
assert_eq!(e, r,
2748+
"TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2749+
i, x, r, e);
2750+
}
2751+
}
2752+
2753+
#[simd_test = "sse"]
2754+
#[cfg(target_arch = "x86_64")]
2755+
unsafe fn _mm_cvttss_si64() {
2756+
use std::f32::NAN;
2757+
use std::i64::MIN;
2758+
let inputs = &[
2759+
(42.0f32, 42i64),
2760+
(-31.4, -31),
2761+
(-33.5, -33),
2762+
(-34.5, -34),
2763+
(10.999, 10),
2764+
(-5.99, -5),
2765+
(4.0e10, 40_000_000_000),
2766+
(4.0e-10, 0),
2767+
(NAN, MIN),
2768+
(2147483500.1, 2147483520),
2769+
(9.223371e18, 9223370937343148032),
2770+
(9.223372e18, MIN),
2771+
];
2772+
for i in 0..inputs.len() {
2773+
let (xi, e) = inputs[i];
2774+
let x = f32x4::new(xi, 1.0, 3.0, 4.0);
2775+
let r = sse::_mm_cvttss_si64(x);
2776+
assert_eq!(e, r,
2777+
"TestCase #{} _mm_cvttss_si64({:?}) = {}, expected: {}",
2778+
i, x, r, e);
2779+
}
2780+
}
2781+
2782+
#[simd_test = "sse"]
2783+
pub unsafe fn _mm_cvtsi32_ss() {
2784+
let inputs = &[
2785+
(4555i32, 4555.0f32),
2786+
(322223333, 322223330.0),
2787+
(-432, -432.0),
2788+
(-322223333, -322223330.0)
2789+
];
2790+
2791+
for i in 0..inputs.len() {
2792+
let (x, f) = inputs[i];
2793+
let a = f32x4::new(5.0, 6.0, 7.0, 8.0);
2794+
let r = sse::_mm_cvtsi32_ss(a, x);
2795+
let e = a.replace(0, f);
2796+
assert_eq!(e, r,
2797+
"TestCase #{} _mm_cvtsi32_ss({:?}, {}) = {:?}, expected: {:?}",
2798+
i, a, x, r, e);
2799+
}
2800+
}
2801+
2802+
#[simd_test = "sse"]
2803+
#[cfg(target_arch = "x86_64")]
2804+
pub unsafe fn _mm_cvtsi64_ss() {
2805+
let inputs = &[
2806+
(4555i64, 4555.0f32),
2807+
(322223333, 322223330.0),
2808+
(-432, -432.0),
2809+
(-322223333, -322223330.0),
2810+
(9223372036854775807, 9.223372e18),
2811+
(-9223372036854775808, -9.223372e18)
2812+
];
2813+
2814+
for i in 0..inputs.len() {
2815+
let (x, f) = inputs[i];
2816+
let a = f32x4::new(5.0, 6.0, 7.0, 8.0);
2817+
let r = sse::_mm_cvtsi64_ss(a, x);
2818+
let e = a.replace(0, f);
2819+
assert_eq!(e, r,
2820+
"TestCase #{} _mm_cvtsi64_ss({:?}, {}) = {:?}, expected: {:?}",
2821+
i, a, x, r, e);
2822+
}
2823+
}
2824+
2825+
#[simd_test = "sse"]
2826+
pub unsafe fn _mm_cvtss_f32() {
2827+
let a = f32x4::new(312.0134, 5.0, 6.0, 7.0);
2828+
assert_eq!(sse::_mm_cvtss_f32(a), 312.0134);
2829+
}
2830+
25352831
#[simd_test = "sse"]
25362832
unsafe fn _mm_set_ss() {
25372833
let r = sse::_mm_set_ss(black_box(4.25));

0 commit comments

Comments
 (0)