From f3c6108709e1df1937f9b0c5727edf937df2190b Mon Sep 17 00:00:00 2001 From: gwenn Date: Thu, 5 Oct 2017 21:43:51 +0200 Subject: [PATCH 01/29] avx: _mm_permute_ps and sse: _mm_undefined_ps --- src/x86/avx.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/x86/sse.rs | 9 ++++++++ 2 files changed, 71 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 60a4aeea2e..40268891a7 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -7,6 +7,8 @@ use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8}; use v128::{f32x4, f64x2, i32x4, i64x2}; use v256::*; +use x86::sse::_mm_undefined_ps; + /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. #[inline(always)] @@ -707,6 +709,58 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { } } +/// Shuffle single-precision (32-bit) floating-point elements in `a` +/// using the control in `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] +pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, _mm_undefined_ps(), [ + $a, $b, $c, $d + ]) + } + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr) => { + match (imm8 >> 6) & 0b11 { + 0b00 => shuffle4!($a, $b, $c, 0), + 0b01 => shuffle4!($a, $b, $c, 1), + 0b10 => shuffle4!($a, $b, $c, 2), + _ => shuffle4!($a, $b, $c, 3), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match (imm8 >> 4) & 0b11 { + 0b00 => shuffle3!($a, $b, 0), + 0b01 => shuffle3!($a, $b, 1), + 0b10 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 3), + } + } + } + macro_rules! 
shuffle1 { + ($a:expr) => { + match (imm8 >> 2) & 0b11 { + 0b00 => shuffle2!($a, 0), + 0b01 => shuffle2!($a, 1), + 0b10 => shuffle2!($a, 2), + _ => shuffle2!($a, 3), + } + } + } + match (imm8 >> 0) & 0b11 { + 0b00 => shuffle1!(0), + 0b01 => shuffle1!(1), + 0b10 => shuffle1!(2), + _ => shuffle1!(3), + } +} + /// Return vector of type `f32x8` with undefined elements. #[inline(always)] #[target_feature = "+avx"] @@ -1333,4 +1387,12 @@ mod tests { let e = f32x8::new(5.0, 2.0, 3.0, 4.0, 50.0, 64.0, 9.0, 8.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm_permute_ps() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm_permute_ps(a, 0x1b); + let e = f32x4::new(5.0, 2.0, 3.0, 4.0); + assert_eq!(r, e); + } } diff --git a/src/x86/sse.rs b/src/x86/sse.rs index b56e2f756c..9beb7293a0 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -1,3 +1,5 @@ +use std::mem; + use simd_llvm::simd_shuffle4; use v128::*; use std::os::raw::c_void; @@ -595,6 +597,13 @@ pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) { pref!(strategy) } +/// Return vector of type __m128 with undefined elements. 
+#[inline(always)] +#[target_feature = "+sse"] +pub unsafe fn _mm_undefined_ps() -> f32x4 { + mem::uninitialized() +} + #[allow(improper_ctypes)] extern { #[link_name = "llvm.x86.sse.add.ss"] From da9e9a33d29f1a6a009323754f5a23d8655fcb87 Mon Sep 17 00:00:00 2001 From: gwenn Date: Thu, 5 Oct 2017 22:38:12 +0200 Subject: [PATCH 02/29] avx: _mm256_permutevar_pd, _mm_permutevar_pd --- src/x86/avx.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 40268891a7..1dc9405e26 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -761,6 +761,22 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 { } } +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub unsafe fn _mm256_permutevar_pd(a: f64x4, b: i64x4) -> f64x4 { + vpermilpd256(a, b) +} + +/// Shuffle double-precision (64-bit) floating-point elements in `a` +/// using the control in `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub unsafe fn _mm_permutevar_pd(a: f64x2, b: i64x2) -> f64x2 { + vpermilpd(a, b) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -839,6 +855,10 @@ extern "C" { fn vpermilps256(a: f32x8, b: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.vpermilvar.ps"] fn vpermilps(a: f32x4, b: i32x4) -> f32x4; + #[link_name = "llvm.x86.avx.vpermilvar.pd.256"] + fn vpermilpd256(a: f64x4, b: i64x4) -> f64x4; + #[link_name = "llvm.x86.avx.vpermilvar.pd"] + fn vpermilpd(a: f64x2, b: i64x2) -> f64x2; } #[cfg(test)] @@ -1395,4 +1415,22 @@ mod tests { let e = f32x4::new(5.0, 2.0, 3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permutevar_pd() { + let a = f64x4::new(4.0, 3.0, 2.0, 5.0); + let b = i64x4::new(1, 2, 3, 4); + let r = avx::_mm256_permutevar_pd(a, b); + let e = f64x4::new(4.0, 3.0, 5.0, 2.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm_permutevar_pd() { + let a = f64x2::new(4.0, 3.0); + let b = i64x2::new(3, 0); + let r = avx::_mm_permutevar_pd(a, b); + let e = f64x2::new(3.0, 4.0); + assert_eq!(r, e); + } } From 7e1703b43e1aa3a4c26e2423bbd70fd113546846 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 17:39:54 +0200 Subject: [PATCH 03/29] avx: _mm256_permute_pd --- src/x86/avx.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 84b9f51528..dd2940f619 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -777,6 +777,48 @@ pub unsafe fn _mm_permutevar_pd(a: f64x2, b: i64x2) -> f64x2 { vpermilpd(a, b) } +/// Shuffle double-precision (64-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] +pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, _mm256_undefined_pd(), [$a, $b, $c, $d]); + } + } + macro_rules! 
shuffle3 { + ($a:expr, $b: expr, $c: expr) => { + match (imm8 >> 3) & 0x1 { + 0 => shuffle4!($a, $b, $c, 2), + _ => shuffle4!($a, $b, $c, 3), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match (imm8 >> 2) & 0x1 { + 0 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 3), + } + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match (imm8 >> 1) & 0x1 { + 0 => shuffle2!($a, 0), + _ => shuffle2!($a, 1), + } + } + } + match (imm8 >> 0) & 0x1 { + 0 => shuffle1!(0), + _ => shuffle1!(1), + } +} + /// Return vector of type `f32x8` with undefined elements. #[inline(always)] #[target_feature = "+avx"] @@ -1433,4 +1475,12 @@ mod tests { let e = f64x2::new(3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permute_pd() { + let a = f64x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm256_permute_pd(a, 5); + let e = f64x4::new(3.0, 4.0, 5.0, 2.0); + assert_eq!(r, e); + } } From 36a1012547569279e37c940381348fb4f925c822 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 17:45:24 +0200 Subject: [PATCH 04/29] avx: _mm256_shuffle_pd fixed --- src/x86/avx.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index dd2940f619..2016b046aa 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -76,7 +76,7 @@ pub unsafe fn _mm256_or_ps(a: f32x8, b: f32x8) -> f32x8 { /// lanes using the control in `imm8`. #[inline(always)] #[target_feature = "+avx"] -//#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x0))] // FIXME +#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))] pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 { let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle4 { From 1f122b259600c79f0143acdecf2ff050a9b99da2 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 17:59:28 +0200 Subject: [PATCH 05/29] avx: _mm_permute_pd, sse2: _mm_undefined_pd --- src/x86/avx.rs | 35 +++++++++++++++++++++++++++++++++++ src/x86/sse2.rs | 7 +++++++ 2 files changed, 42 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 2016b046aa..f10888ad50 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -8,6 +8,7 @@ use v128::{f32x4, f64x2, i32x4, i64x2}; use v256::*; use x86::sse::_mm_undefined_ps; +use x86::sse2::_mm_undefined_pd; /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. @@ -819,6 +820,32 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 { } } +/// Shuffle double-precision (64-bit) floating-point elements in `a` +/// using the control in `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] +pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + simd_shuffle2(a, _mm_undefined_pd(), [$a, $b]); + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match (imm8 >> 1) & 0x1 { + 0 => shuffle2!($a, 0), + _ => shuffle2!($a, 1), + } + } + } + match (imm8 >> 0) & 0x1 { + 0 => shuffle1!(0), + _ => shuffle1!(1), + } +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1483,4 +1510,12 @@ mod tests { let e = f64x4::new(3.0, 4.0, 5.0, 2.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm_permute_pd() { + let a = f64x2::new(4.0, 3.0); + let r = avx::_mm_permute_pd(a, 1); + let e = f64x2::new(3.0, 4.0); + assert_eq!(r, e); + } } diff --git a/src/x86/sse2.rs b/src/x86/sse2.rs index a88d514a7f..201fed4385 100644 --- a/src/x86/sse2.rs +++ b/src/x86/sse2.rs @@ -1827,6 +1827,13 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> f64x2 { f64x2::new(d, d) } +/// Return vector of type __m128d with undefined elements. +#[inline(always)] +#[target_feature = "+sse2"] +pub unsafe fn _mm_undefined_pd() -> f64x2 { + f64x2::splat(mem::uninitialized()) +} + #[allow(improper_ctypes)] extern { #[link_name = "llvm.x86.sse2.pause"] From ce7f2603283bc5a5bd4765b05ad39384c907b297 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 18:48:06 +0200 Subject: [PATCH 06/29] avx: _mm256_permute2f128_ps --- src/x86/avx.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index f10888ad50..193ca10a5a 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -846,6 +846,18 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 { } } +/// Shuffle 256-bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))] +pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 { + macro_rules! call { + ($imm8:expr) => { vperm2f128ps256(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -928,6 +940,8 @@ extern "C" { fn vpermilpd256(a: f64x4, b: i64x4) -> f64x4; #[link_name = "llvm.x86.avx.vpermilvar.pd"] fn vpermilpd(a: f64x2, b: i64x2) -> f64x2; + #[link_name = "llvm.x86.avx.vperm2f128.ps.256"] + fn vperm2f128ps256(a: f32x8, b: f32x8, imm8: i8) -> f32x8; } #[cfg(test)] @@ -1518,4 +1532,13 @@ mod tests { let e = f64x2::new(3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permute2f128_ps() { + let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0); + let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_permute2f128_ps(a, b, 0x13); + let e = f32x8::new(5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0); + assert_eq!(r, e); + } } From 9a59843921cd8d79bfc0255b874ef6ec3602c9de Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 18:58:35 +0200 Subject: [PATCH 07/29] avx: _mm256_permute2f128_pd --- src/x86/avx.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 193ca10a5a..a201e21142 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -858,6 +858,18 @@ pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 { constify_imm8!(imm8, call) } +/// Shuffle 256-bits (composed of 4 packed double-precision (64-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] +pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { + macro_rules! call { + ($imm8:expr) => { vperm2f128pd256(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -942,6 +954,8 @@ extern "C" { fn vpermilpd(a: f64x2, b: i64x2) -> f64x2; #[link_name = "llvm.x86.avx.vperm2f128.ps.256"] fn vperm2f128ps256(a: f32x8, b: f32x8, imm8: i8) -> f32x8; + #[link_name = "llvm.x86.avx.vperm2f128.pd.256"] + fn vperm2f128pd256(a: f64x4, b: f64x4, imm8: i8) -> f64x4; } #[cfg(test)] @@ -1541,4 +1555,13 @@ mod tests { let e = f32x8::new(5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permute2f128_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_permute2f128_pd(a, b, 0x31); + let e = f64x4::new(3.0, 4.0, 7.0, 8.0); + assert_eq!(r, e); + } } From 567dd6d15b9d322af0084b5ae1e397f58a3fd628 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 19:09:02 +0200 Subject: [PATCH 08/29] avx: _mm256_permute2f128_si256 --- src/x86/avx.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index a201e21142..e8bfece220 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -870,6 +870,18 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { constify_imm8!(imm8, call) } +/// Shuffle 256-bits (composed of integer data) selected by `imm8` +/// from `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] +pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { + macro_rules! call { + ($imm8:expr) => { vperm2f128si256(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -956,6 +968,8 @@ extern "C" { fn vperm2f128ps256(a: f32x8, b: f32x8, imm8: i8) -> f32x8; #[link_name = "llvm.x86.avx.vperm2f128.pd.256"] fn vperm2f128pd256(a: f64x4, b: f64x4, imm8: i8) -> f64x4; + #[link_name = "llvm.x86.avx.vperm2f128.si.256"] + fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; } #[cfg(test)] @@ -1564,4 +1578,13 @@ mod tests { let e = f64x4::new(3.0, 4.0, 7.0, 8.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permute2f128_si256() { + let a = i32x8::new(1, 2, 3, 4, 1, 2, 3, 4); + let b = i32x8::new(5, 6, 7, 8, 5, 6, 7, 8); + let r = avx::_mm256_permute2f128_si256(a, b, 0x20); + let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq!(r, e); + } } From 7702384047469d67214b7c6d2d8ad1e8bcb7f55c Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 08:41:17 +0200 Subject: [PATCH 09/29] avx: _mm256_broadcast_ss --- src/x86/avx.rs | 16 ++++++++++++++++ src/x86/sse.rs | 2 -- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index e8bfece220..78590b3175 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -882,6 +882,15 @@ pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { constify_imm8!(imm8, call) } +/// Broadcast a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub unsafe fn _mm256_broadcast_ss(f: &f32) -> f32x8 { + f32x8::splat(*f) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1587,4 +1596,11 @@ mod tests { let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_broadcast_ss() { + let r = avx::_mm256_broadcast_ss(&3.0); + let e = f32x8::splat(3.0); + assert_eq!(r, e); + } } diff --git a/src/x86/sse.rs b/src/x86/sse.rs index b19b2150b2..22a6dd75bb 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -1,5 +1,3 @@ -use std::mem; - use simd_llvm::simd_shuffle4; use v128::*; use v64::f32x2; From 8f04804fea3fc70f0c3fb4fcba41d41fc0ad1dbd Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 08:46:02 +0200 Subject: [PATCH 10/29] avx: _mm_broadcast_ss --- src/x86/avx.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 78590b3175..c331c32f8b 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -891,6 +891,15 @@ pub unsafe fn _mm256_broadcast_ss(f: &f32) -> f32x8 { f32x8::splat(*f) } +/// Broadcast a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub unsafe fn _mm_broadcast_ss(f: &f32) -> f32x4 { + f32x4::splat(*f) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1603,4 +1612,11 @@ mod tests { let e = f32x8::splat(3.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm_broadcast_ss() { + let r = avx::_mm_broadcast_ss(&3.0); + let e = f32x4::splat(3.0); + assert_eq!(r, e); + } } From 6634846716c25335ac289365462d7b9195aefed7 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 08:50:05 +0200 Subject: [PATCH 11/29] avx: _mm256_broadcast_sd --- src/x86/avx.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index c331c32f8b..abf9b4a328 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -900,6 +900,15 @@ pub unsafe fn _mm_broadcast_ss(f: &f32) -> f32x4 { f32x4::splat(*f) } +/// Broadcast a double-precision (64-bit) floating-point element from memory +/// to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub unsafe fn _mm256_broadcast_sd(f: &f64) -> f64x4 { + f64x4::splat(*f) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1618,5 +1627,12 @@ mod tests { let r = avx::_mm_broadcast_ss(&3.0); let e = f32x4::splat(3.0); assert_eq!(r, e); + + #[simd_test = "avx"] + unsafe fn _mm256_broadcast_sd() { + let r = avx::_mm256_broadcast_sd(&3.0); + let e = f64x4::splat(3.0); + assert_eq!(r, e); + } } } From 0220b7e70819ab5f1d4338b6a10aa65c2ede09f0 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 09:17:55 +0200 Subject: [PATCH 12/29] avx: _mm256_broadcast_ps --- src/x86/avx.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index abf9b4a328..92544d18f9 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -909,6 +909,15 @@ pub unsafe fn _mm256_broadcast_sd(f: &f64) -> f64x4 { f64x4::splat(*f) } +/// Broadcast 128 bits from memory (composed of 4 packed single-precision +/// (32-bit) floating-point elements) to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastf128))] +pub unsafe fn _mm256_broadcast_ps(a: &f32x4) -> f32x8 { + vbroadcastf128ps256(a) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -997,6 +1006,8 @@ extern "C" { fn vperm2f128pd256(a: f64x4, b: f64x4, imm8: i8) -> f64x4; #[link_name = "llvm.x86.avx.vperm2f128.si.256"] fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; + #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"] + fn vbroadcastf128ps256(a: &f32x4) -> f32x8; } #[cfg(test)] @@ -1627,6 +1638,7 @@ mod tests { let r = avx::_mm_broadcast_ss(&3.0); let e = f32x4::splat(3.0); assert_eq!(r, e); + } #[simd_test = "avx"] unsafe fn _mm256_broadcast_sd() { @@ -1634,5 +1646,12 @@ mod tests { let e = f64x4::splat(3.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_broadcast_ps() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm256_broadcast_ps(&a); + let e = f32x8::new(4.0, 3.0, 2.0, 5.0, 4.0, 3.0, 2.0, 5.0); + assert_eq!(r, e); } } From ea4dcc9f115bb285f8b21b1ed944d31c515e8ac4 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 09:24:32 +0200 Subject: [PATCH 13/29] avx: _mm256_broadcast_pd --- src/x86/avx.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 92544d18f9..b4d8ba3f82 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -918,6 +918,15 @@ pub unsafe fn _mm256_broadcast_ps(a: &f32x4) -> f32x8 { vbroadcastf128ps256(a) } +/// Broadcast 128 bits from memory (composed of 2 packed double-precision +/// (64-bit) floating-point elements) to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastf128))] +pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 { + vbroadcastf128pd256(a) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1008,6 +1017,8 @@ extern "C" { fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"] fn vbroadcastf128ps256(a: &f32x4) -> f32x8; + #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"] + fn vbroadcastf128pd256(a: &f64x2) -> f64x4; } #[cfg(test)] @@ -1654,4 +1665,12 @@ mod tests { let e = f32x8::new(4.0, 3.0, 2.0, 5.0, 4.0, 3.0, 2.0, 5.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_broadcast_pd() { + let a = f64x2::new(4.0, 3.0); + let r = avx::_mm256_broadcast_pd(&a); + let e = f64x4::new(4.0, 3.0, 4.0, 3.0); + assert_eq!(r, e); + } } From ab44d454e941f82b37c309e413cf9ddf549d3b40 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 10:53:33 +0200 Subject: [PATCH 14/29] avx: _mm_cmp_pd --- src/x86/avx.rs | 89 +++++++++++++++++++++++++++++++++++++++++++++++ src/x86/macros.rs | 40 +++++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index b4d8ba3f82..e067ce5fda 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -487,6 +487,84 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 { mem::transmute(a ^ b) } +// Equal (ordered, non-signaling) +pub const _CMP_EQ_OQ: i8 = 0x00; +// Less-than (ordered, signaling) +pub const _CMP_LT_OS: i8 = 0x01; +// Less-than-or-equal (ordered, signaling) +pub const _CMP_LE_OS: i8 = 0x02; +// Unordered (non-signaling) +pub const _CMP_UNORD_Q: i8 = 0x03; +// Not-equal (unordered, non-signaling) +pub const _CMP_NEQ_UQ: i8 = 0x04; +// Not-less-than (unordered, signaling) +pub const _CMP_NLT_US: i8 = 0x05; +// Not-less-than-or-equal (unordered, signaling) +pub const _CMP_NLE_US: i8 = 0x06; +// Ordered (non-signaling) +pub const _CMP_ORD_Q: i8 = 0x07; +// Equal (unordered, non-signaling) +pub const _CMP_EQ_UQ: i8 = 0x08; +// Not-greater-than-or-equal (unordered, signaling) +pub const _CMP_NGE_US: i8 = 0x09; +// Not-greater-than (unordered, signaling) +pub const 
_CMP_NGT_US: i8 = 0x0a; +// False (ordered, non-signaling) +pub const _CMP_FALSE_OQ: i8 = 0x0b; +// Not-equal (ordered, non-signaling) +pub const _CMP_NEQ_OQ: i8 = 0x0c; +// Greater-than-or-equal (ordered, signaling) +pub const _CMP_GE_OS: i8 = 0x0d; +// Greater-than (ordered, signaling) +pub const _CMP_GT_OS: i8 = 0x0e; +// True (unordered, non-signaling) +pub const _CMP_TRUE_UQ: i8 = 0x0f; +// Equal (ordered, signaling) +pub const _CMP_EQ_OS: i8 = 0x10; +// Less-than (ordered, non-signaling) +pub const _CMP_LT_OQ: i8 = 0x11; +// Less-than-or-equal (ordered, non-signaling) +pub const _CMP_LE_OQ: i8 = 0x12; +// Unordered (signaling) +pub const _CMP_UNORD_S: i8 = 0x13; +// Not-equal (unordered, signaling) +pub const _CMP_NEQ_US: i8 = 0x14; +// Not-less-than (unordered, non-signaling) +pub const _CMP_NLT_UQ: i8 = 0x15; +// Not-less-than-or-equal (unordered, non-signaling) +pub const _CMP_NLE_UQ: i8 = 0x16; +// Ordered (signaling) +pub const _CMP_ORD_S: i8 = 0x17; +// Equal (unordered, signaling) +pub const _CMP_EQ_US: i8 = 0x18; +// Not-greater-than-or-equal (unordered, non-signaling) +pub const _CMP_NGE_UQ: i8 = 0x19; +// Not-greater-than (unordered, non-signaling) +pub const _CMP_NGT_UQ: i8 = 0x1a; +// False (ordered, signaling) +pub const _CMP_FALSE_OS: i8 = 0x1b; +// Not-equal (ordered, signaling) +pub const _CMP_NEQ_OS: i8 = 0x1c; +// Greater-than-or-equal (ordered, non-signaling) +pub const _CMP_GE_OQ: i8 = 0x1d; +// Greater-than (ordered, non-signaling) +pub const _CMP_GT_OQ: i8 = 0x1e; +// True (unordered, signaling) +pub const _CMP_TRUE_US: i8 = 0x1f; + +/// Compare packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd +pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i8) -> f64x2 { + macro_rules! 
call { + ($imm8:expr) => { vcmppd(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. #[inline(always)] @@ -985,6 +1063,8 @@ extern "C" { fn vhsubpd(a: f64x4, b: f64x4) -> f64x4; #[link_name = "llvm.x86.avx.hsub.ps.256"] fn vhsubps(a: f32x8, b: f32x8) -> f32x8; + #[link_name = "llvm.x86.sse2.cmp.pd"] + fn vcmppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1410,6 +1490,15 @@ mod tests { assert_eq!(r, a); } + #[simd_test = "avx"] + unsafe fn _mm_cmp_pd() { + let a = f64x2::new(4.0, 9.0); + let b = f64x2::new(4.0, 3.0); + let r = avx::_mm_cmp_pd(a, b, avx::_CMP_GE_OS); + assert!(r.extract(0).is_nan()); + assert!(r.extract(1).is_nan()); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); diff --git a/src/x86/macros.rs b/src/x86/macros.rs index ebe1015181..8d65435db0 100644 --- a/src/x86/macros.rs +++ b/src/x86/macros.rs @@ -261,3 +261,43 @@ macro_rules! constify_imm8 { } } } + +macro_rules! 
constify_imm6 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match $imm8 & 0b1_1111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + 7 => $expand!(7), + 8 => $expand!(8), + 9 => $expand!(9), + 10 => $expand!(10), + 11 => $expand!(11), + 12 => $expand!(12), + 13 => $expand!(13), + 14 => $expand!(14), + 15 => $expand!(15), + 16 => $expand!(16), + 17 => $expand!(17), + 18 => $expand!(18), + 19 => $expand!(19), + 20 => $expand!(20), + 21 => $expand!(21), + 22 => $expand!(22), + 23 => $expand!(23), + 24 => $expand!(24), + 25 => $expand!(25), + 26 => $expand!(26), + 27 => $expand!(27), + 28 => $expand!(28), + 29 => $expand!(29), + 30 => $expand!(30), + _ => $expand!(31), + } + } +} From 13cb6db2412238bf6db1a82d6c262f80c85d2f61 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:03:27 +0200 Subject: [PATCH 15/29] avx: _mm256_cmp_pd --- src/x86/avx.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index e067ce5fda..ef5e4eb863 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -565,6 +565,19 @@ pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i8) -> f64x2 { constify_imm6!(imm8, call) } +/// Compare packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd +pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { + macro_rules! call { + ($imm8:expr) => { vcmppd256(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. 
#[inline(always)] @@ -1065,6 +1078,8 @@ extern "C" { fn vhsubps(a: f32x8, b: f32x8) -> f32x8; #[link_name = "llvm.x86.sse2.cmp.pd"] fn vcmppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; + #[link_name = "llvm.x86.avx.cmp.pd.256"] + fn vcmppd256(a: f64x4, b: f64x4, imm8: u8) -> f64x4; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1499,6 +1514,15 @@ mod tests { assert!(r.extract(1).is_nan()); } + #[simd_test = "avx"] + unsafe fn _mm256_cmp_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_cmp_pd(a, b, avx::_CMP_GE_OS); + let e = f64x4::splat(0.0); + assert_eq!(r, e); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From e130cca45fb6f9eb3647e73aad8b08df3fadbe54 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:14:39 +0200 Subject: [PATCH 16/29] avx: _mm_cmp_ps --- src/x86/avx.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index ef5e4eb863..3251cede69 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -578,6 +578,19 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { constify_imm6!(imm8, call) } +/// Compare packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps +pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: i8) -> f32x4 { + macro_rules! call { + ($imm8:expr) => { vcmpps(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. 
#[inline(always)] @@ -1080,6 +1093,8 @@ extern "C" { fn vcmppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.avx.cmp.pd.256"] fn vcmppd256(a: f64x4, b: f64x4, imm8: u8) -> f64x4; + #[link_name = "llvm.x86.sse.cmp.ps"] + fn vcmpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1523,6 +1538,17 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "avx"] + unsafe fn _mm_cmp_ps() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let b = f32x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm_cmp_ps(a, b, avx::_CMP_GE_OS); + assert!(r.extract(0).is_nan()); + assert_eq!(r.extract(1), 0.0); + assert_eq!(r.extract(2), 0.0); + assert_eq!(r.extract(3), 0.0); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From 01a69d29beb3b1f7595875e7bb04785c60538cfe Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:22:03 +0200 Subject: [PATCH 17/29] avx: _mm256_cmp_ps --- src/x86/avx.rs | 94 +++++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 35 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 3251cede69..f1ee2ec44d 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -488,69 +488,69 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 { } // Equal (ordered, non-signaling) -pub const _CMP_EQ_OQ: i8 = 0x00; +pub const _CMP_EQ_OQ: u8 = 0x00; // Less-than (ordered, signaling) -pub const _CMP_LT_OS: i8 = 0x01; +pub const _CMP_LT_OS: u8 = 0x01; // Less-than-or-equal (ordered, signaling) -pub const _CMP_LE_OS: i8 = 0x02; +pub const _CMP_LE_OS: u8 = 0x02; // Unordered (non-signaling) -pub const _CMP_UNORD_Q: i8 = 0x03; +pub const _CMP_UNORD_Q: u8 = 0x03; // Not-equal (unordered, non-signaling) -pub const _CMP_NEQ_UQ: i8 = 0x04; +pub const _CMP_NEQ_UQ: u8 = 0x04; // Not-less-than (unordered, signaling) -pub const _CMP_NLT_US: i8 = 0x05; +pub const _CMP_NLT_US: u8 
= 0x05; // Not-less-than-or-equal (unordered, signaling) -pub const _CMP_NLE_US: i8 = 0x06; +pub const _CMP_NLE_US: u8 = 0x06; // Ordered (non-signaling) -pub const _CMP_ORD_Q: i8 = 0x07; +pub const _CMP_ORD_Q: u8 = 0x07; // Equal (unordered, non-signaling) -pub const _CMP_EQ_UQ: i8 = 0x08; +pub const _CMP_EQ_UQ: u8 = 0x08; // Not-greater-than-or-equal (unordered, signaling) -pub const _CMP_NGE_US: i8 = 0x09; +pub const _CMP_NGE_US: u8 = 0x09; // Not-greater-than (unordered, signaling) -pub const _CMP_NGT_US: i8 = 0x0a; +pub const _CMP_NGT_US: u8 = 0x0a; // False (ordered, non-signaling) -pub const _CMP_FALSE_OQ: i8 = 0x0b; +pub const _CMP_FALSE_OQ: u8 = 0x0b; // Not-equal (ordered, non-signaling) -pub const _CMP_NEQ_OQ: i8 = 0x0c; +pub const _CMP_NEQ_OQ: u8 = 0x0c; // Greater-than-or-equal (ordered, signaling) -pub const _CMP_GE_OS: i8 = 0x0d; +pub const _CMP_GE_OS: u8 = 0x0d; // Greater-than (ordered, signaling) -pub const _CMP_GT_OS: i8 = 0x0e; +pub const _CMP_GT_OS: u8 = 0x0e; // True (unordered, non-signaling) -pub const _CMP_TRUE_UQ: i8 = 0x0f; +pub const _CMP_TRUE_UQ: u8 = 0x0f; // Equal (ordered, signaling) -pub const _CMP_EQ_OS: i8 = 0x10; +pub const _CMP_EQ_OS: u8 = 0x10; // Less-than (ordered, non-signaling) -pub const _CMP_LT_OQ: i8 = 0x11; +pub const _CMP_LT_OQ: u8 = 0x11; // Less-than-or-equal (ordered, non-signaling) -pub const _CMP_LE_OQ: i8 = 0x12; +pub const _CMP_LE_OQ: u8 = 0x12; // Unordered (signaling) -pub const _CMP_UNORD_S: i8 = 0x13; +pub const _CMP_UNORD_S: u8 = 0x13; // Not-equal (unordered, signaling) -pub const _CMP_NEQ_US: i8 = 0x14; +pub const _CMP_NEQ_US: u8 = 0x14; // Not-less-than (unordered, non-signaling) -pub const _CMP_NLT_UQ: i8 = 0x15; +pub const _CMP_NLT_UQ: u8 = 0x15; // Not-less-than-or-equal (unordered, non-signaling) -pub const _CMP_NLE_UQ: i8 = 0x16; +pub const _CMP_NLE_UQ: u8 = 0x16; // Ordered (signaling) -pub const _CMP_ORD_S: i8 = 0x17; +pub const _CMP_ORD_S: u8 = 0x17; // Equal (unordered, signaling) -pub const 
_CMP_EQ_US: i8 = 0x18; +pub const _CMP_EQ_US: u8 = 0x18; // Not-greater-than-or-equal (unordered, non-signaling) -pub const _CMP_NGE_UQ: i8 = 0x19; +pub const _CMP_NGE_UQ: u8 = 0x19; // Not-greater-than (unordered, non-signaling) -pub const _CMP_NGT_UQ: i8 = 0x1a; +pub const _CMP_NGT_UQ: u8 = 0x1a; // False (ordered, signaling) -pub const _CMP_FALSE_OS: i8 = 0x1b; +pub const _CMP_FALSE_OS: u8 = 0x1b; // Not-equal (ordered, signaling) -pub const _CMP_NEQ_OS: i8 = 0x1c; +pub const _CMP_NEQ_OS: u8 = 0x1c; // Greater-than-or-equal (ordered, non-signaling) -pub const _CMP_GE_OQ: i8 = 0x1d; +pub const _CMP_GE_OQ: u8 = 0x1d; // Greater-than (ordered, non-signaling) -pub const _CMP_GT_OQ: i8 = 0x1e; +pub const _CMP_GT_OQ: u8 = 0x1e; // True (unordered, signaling) -pub const _CMP_TRUE_US: i8 = 0x1f; +pub const _CMP_TRUE_US: u8 = 0x1f; /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand @@ -558,7 +558,7 @@ pub const _CMP_TRUE_US: i8 = 0x1f; #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd -pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i8) -> f64x2 { +pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { macro_rules! call { ($imm8:expr) => { vcmppd(a, b, $imm8) } } @@ -571,7 +571,7 @@ pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i8) -> f64x2 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd -pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { +pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 { macro_rules! 
call { ($imm8:expr) => { vcmppd256(a, b, $imm8) } } @@ -584,13 +584,26 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps -pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: i8) -> f32x4 { +pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { macro_rules! call { ($imm8:expr) => { vcmpps(a, b, $imm8) } } constify_imm6!(imm8, call) } +/// Compare packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps +pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { + macro_rules! call { + ($imm8:expr) => { vcmpps256(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. 
#[inline(always)] @@ -1095,6 +1108,8 @@ extern "C" { fn vcmppd256(a: f64x4, b: f64x4, imm8: u8) -> f64x4; #[link_name = "llvm.x86.sse.cmp.ps"] fn vcmpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; + #[link_name = "llvm.x86.avx.cmp.ps.256"] + fn vcmpps256(a: f32x8, b: f32x8, imm8: u8) -> f32x8; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1549,6 +1564,15 @@ mod tests { assert_eq!(r.extract(3), 0.0); } + #[simd_test = "avx"] + unsafe fn _mm256_cmp_ps() { + let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0); + let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_cmp_ps(a, b, avx::_CMP_GE_OS); + let e = f32x8::splat(0.0); + assert_eq!(r, e); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From ebb2cc27af5b368bbcd8457b0e19489b2028dde8 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:33:37 +0200 Subject: [PATCH 18/29] avx: _mm_cmp_sd --- src/x86/avx.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index f1ee2ec44d..0a18bd03e9 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -604,6 +604,21 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { constify_imm6!(imm8, call) } + +/// Compare the lower double-precision (64-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `imm8`, +/// store the result in the returned vector, and copy the upper element +/// from `a` to the upper element of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd +pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { + macro_rules! 
call { + ($imm8:expr) => { vcmpsd(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. #[inline(always)] @@ -1110,6 +1125,8 @@ extern "C" { fn vcmpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; #[link_name = "llvm.x86.avx.cmp.ps.256"] fn vcmpps256(a: f32x8, b: f32x8, imm8: u8) -> f32x8; + #[link_name = "llvm.x86.sse2.cmp.sd"] + fn vcmpsd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1573,6 +1590,15 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "avx"] + unsafe fn _mm_cmp_sd() { + let a = f64x2::new(4.0, 9.0); + let b = f64x2::new(4.0, 3.0); + let r = avx::_mm_cmp_sd(a, b, avx::_CMP_GE_OS); + assert!(r.extract(0).is_nan()); + assert_eq!(r.extract(1), 9.0); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From 4ae314bb699d6bf34e9f0dc3496311134d89a8c0 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:46:58 +0200 Subject: [PATCH 19/29] avx: _mm_cmp_ss --- src/x86/avx.rs | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 0a18bd03e9..5cf99bbe2d 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -604,11 +604,10 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { constify_imm6!(imm8, call) } - /// Compare the lower double-precision (64-bit) floating-point element in /// `a` and `b` based on the comparison operand specified by `imm8`, -/// store the result in the returned vector, and copy the upper element -/// from `a` to the upper element of the returned vector. +/// store the result in the lower element of returned vector, +/// and copy the upper element from `a` to the upper element of returned vector. 
#[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd @@ -619,6 +618,21 @@ pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { constify_imm6!(imm8, call) } +/// Compare the lower single-precision (32-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `imm8`, +/// store the result in the lower element of returned vector, +/// and copy the upper 3 packed elements from `a` to the upper elements of +/// returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss +pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { + macro_rules! call { + ($imm8:expr) => { vcmpss(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. #[inline(always)] @@ -1127,6 +1141,8 @@ extern "C" { fn vcmpps256(a: f32x8, b: f32x8, imm8: u8) -> f32x8; #[link_name = "llvm.x86.sse2.cmp.sd"] fn vcmpsd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; + #[link_name = "llvm.x86.sse.cmp.ss"] + fn vcmpss(a: f32x4, b: f32x4, imm8: u8) -> f32x4; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1599,6 +1615,17 @@ mod tests { assert_eq!(r.extract(1), 9.0); } + #[simd_test = "avx"] + unsafe fn _mm_cmp_ss() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let b = f32x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm_cmp_ss(a, b, avx::_CMP_GE_OS); + assert!(r.extract(0).is_nan()); + assert_eq!(r.extract(1), 3.0); + assert_eq!(r.extract(2), 2.0); + assert_eq!(r.extract(3), 5.0); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From b1b74ab84469943a1f2fc928a0744360c11d5874 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 14:30:21 +0200 Subject: [PATCH 20/29] avx: 
_mm256_insertf128_pd, _mm256_castpd128_pd256 --- src/x86/avx.rs | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 5cf99bbe2d..28d04c7841 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1073,6 +1073,57 @@ pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 { vbroadcastf128pd256(a) } +/// Copy `a` to result, then insert 128 bits (composed of 2 packed +/// double-precision (64-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] +pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, _mm256_castpd128_pd256(b), [$a, $b, $c, $d]); + } + } + macro_rules! shuffle3 { + ($a:expr, $b: expr, $c: expr) => { + match imm8 & 0x1 { + 0 => shuffle4!($a, $b, $c, 3), + _ => shuffle4!($a, $b, $c, 5), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match imm8 & 0x1 { + 0 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 4), + } + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match imm8 & 0x1 { + 0 => shuffle2!($a, 5), + _ => shuffle2!($a, 1), + } + } + } + match imm8 & 0x1 { + 0 => shuffle1!(4), + _ => shuffle1!(0), + } +} + +/// Casts vector of type __m128d to type __m256d; +/// the upper 128 bits of the result are undefined. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 { + // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) + simd_shuffle4(a, a, [0, 1, 1, 1]) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1889,4 +1940,13 @@ mod tests { let e = f64x4::new(4.0, 3.0, 4.0, 3.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insertf128_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x2::new(5.0, 6.0); + let r = avx::_mm256_insertf128_pd(a, b, 0); + let e = f64x4::new(5.0, 6.0, 3.0, 4.0); + assert_eq!(r, e); + } } From c3d109c7e19b91f6f72f06d54ca33f1bb9db2aa1 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 15:58:09 +0200 Subject: [PATCH 21/29] avx: _mm256_insertf128_si256, _mm256_castsi128_si256 --- src/x86/avx.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 28d04c7841..db76e0f262 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1115,6 +1115,47 @@ pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { } } +/// Copy `a` to result, then insert 128 bits from `b` into result +/// at the location specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] +pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 { + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, _mm256_castsi128_si256(b), [$a, $b, $c, $d]); + } + } + macro_rules! shuffle3 { + ($a:expr, $b: expr, $c: expr) => { + match imm8 & 0x1 { + 0 => shuffle4!($a, $b, $c, 3), + _ => shuffle4!($a, $b, $c, 5), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match imm8 & 0x1 { + 0 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 4), + } + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match imm8 & 0x1 { + 0 => shuffle2!($a, 5), + _ => shuffle2!($a, 1), + } + } + } + match imm8 & 0x1 { + 0 => shuffle1!(4), + _ => shuffle1!(0), + } +} + /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -1124,6 +1165,15 @@ pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 { simd_shuffle4(a, a, [0, 1, 1, 1]) } +/// Casts vector of type __m128i to type __m256i; +/// the upper 128 bits of the result are undefined. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castsi128_si256(a: i64x2) -> i64x4 { + // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) + simd_shuffle4(a, a, [0, 1, 1, 1]) +} + /// Return vector of type `f32x8` with undefined elements. #[inline(always)] #[target_feature = "+avx"] @@ -1949,4 +1999,13 @@ mod tests { let e = f64x4::new(5.0, 6.0, 3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insertf128_si256() { + let a = i64x4::new(1, 2, 3, 4); + let b = i64x2::new(5, 6); + let r = avx::_mm256_insertf128_si256(a, b, 0); + let e = i64x4::new(5, 6, 3, 4); + assert_eq!(r, e); + } } From 728e9e7ae5b7a83187865263a3133fc379adb4c6 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:24:09 +0200 Subject: [PATCH 22/29] avx: _mm256_insertf128_ps, _mm256_castps128_ps256 --- src/x86/avx.rs | 101 +++++++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 66 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index db76e0f262..721c1cc5e3 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1073,6 +1073,19 @@ pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 { vbroadcastf128pd256(a) } +/// Copy `a` to result, then insert 128 bits (composed of 4 packed +/// single-precision (32-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] +pub unsafe fn _mm256_insertf128_ps(a: f32x8, b: f32x4, imm8: i32) -> f32x8 { + match imm8 & 1 { + 0 => simd_shuffle8(a, _mm256_castps128_ps256(b), [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, _mm256_castps128_ps256(b), [0, 1, 2, 3, 8, 9, 10, 11]), + } +} + /// Copy `a` to result, then insert 128 bits (composed of 2 packed /// double-precision (64-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. @@ -1080,38 +1093,9 @@ pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 { #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { - macro_rules! shuffle4 { - ($a:expr, $b:expr, $c:expr, $d:expr) => { - simd_shuffle4(a, _mm256_castpd128_pd256(b), [$a, $b, $c, $d]); - } - } - macro_rules! shuffle3 { - ($a:expr, $b: expr, $c: expr) => { - match imm8 & 0x1 { - 0 => shuffle4!($a, $b, $c, 3), - _ => shuffle4!($a, $b, $c, 5), - } - } - } - macro_rules! shuffle2 { - ($a:expr, $b:expr) => { - match imm8 & 0x1 { - 0 => shuffle3!($a, $b, 2), - _ => shuffle3!($a, $b, 4), - } - } - } - macro_rules! shuffle1 { - ($a:expr) => { - match imm8 & 0x1 { - 0 => shuffle2!($a, 5), - _ => shuffle2!($a, 1), - } - } - } - match imm8 & 0x1 { - 0 => shuffle1!(4), - _ => shuffle1!(0), + match imm8 & 1 { + 0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]), + _ => simd_shuffle4(a, _mm256_castpd128_pd256(b), [0, 1, 4, 5]), } } @@ -1121,48 +1105,24 @@ pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 { - macro_rules! 
shuffle4 { - ($a:expr, $b:expr, $c:expr, $d:expr) => { - simd_shuffle4(a, _mm256_castsi128_si256(b), [$a, $b, $c, $d]); - } - } - macro_rules! shuffle3 { - ($a:expr, $b: expr, $c: expr) => { - match imm8 & 0x1 { - 0 => shuffle4!($a, $b, $c, 3), - _ => shuffle4!($a, $b, $c, 5), - } - } - } - macro_rules! shuffle2 { - ($a:expr, $b:expr) => { - match imm8 & 0x1 { - 0 => shuffle3!($a, $b, 2), - _ => shuffle3!($a, $b, 4), - } - } - } - macro_rules! shuffle1 { - ($a:expr) => { - match imm8 & 0x1 { - 0 => shuffle2!($a, 5), - _ => shuffle2!($a, 1), - } - } - } - match imm8 & 0x1 { - 0 => shuffle1!(4), - _ => shuffle1!(0), + match imm8 & 1 { + 0 => simd_shuffle4(a, _mm256_castsi128_si256(b), [4, 5, 2, 3]), + _ => simd_shuffle4(a, _mm256_castsi128_si256(b), [0, 1, 4, 5]), } } +pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { + // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) + simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) +} + /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. #[inline(always)] #[target_feature = "+avx"] pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 { // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) - simd_shuffle4(a, a, [0, 1, 1, 1]) + simd_shuffle4(a, a, [0, 1, 0, 0]) } /// Casts vector of type __m128i to type __m256i; @@ -1171,7 +1131,7 @@ pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 { #[target_feature = "+avx"] pub unsafe fn _mm256_castsi128_si256(a: i64x2) -> i64x4 { // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) - simd_shuffle4(a, a, [0, 1, 1, 1]) + simd_shuffle4(a, a, [0, 1, 0, 0]) } /// Return vector of type `f32x8` with undefined elements. 
@@ -1991,6 +1951,15 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "avx"] + unsafe fn _mm256_insertf128_ps() { + let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let b = f32x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm256_insertf128_ps(a, b, 0); + let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 8.0, 9.0, 64.0, 50.0); + assert_eq!(r, e); + } + #[simd_test = "avx"] unsafe fn _mm256_insertf128_pd() { let a = f64x4::new(1.0, 2.0, 3.0, 4.0); From 21fb6e10717aa5f94a7ca0d86362ece68bc1136e Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:41:28 +0200 Subject: [PATCH 23/29] avx: _mm256_insert_epi8 --- src/x86/avx.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 721c1cc5e3..635c9ff057 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1111,6 +1111,15 @@ pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 { } } +/// Copy `a` to result, and insert the 8-bit integer `i` into result +/// at the location specified by `index`. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_insert_epi8(a: i8x32, i: i8, index: i32) -> i8x32 { + let c = a; + c.replace(index as u32, i) +} + pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) @@ -1977,4 +1986,20 @@ mod tests { let e = i64x4::new(5, 6, 3, 4); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insert_epi8() { + let a = i8x32::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32); + let r = avx::_mm256_insert_epi8(a, 0, 31); + let e = i8x32::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 0); + assert_eq!(r, e); + } } From 197a38a1be61b0271380f80dc774ec0296980691 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:48:18 +0200 Subject: [PATCH 24/29] avx: _mm256_insert_epi16 --- src/x86/avx.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 635c9ff057..6b3759bc45 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1117,7 +1117,16 @@ pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 { #[target_feature = "+avx"] pub unsafe fn _mm256_insert_epi8(a: i8x32, i: i8, index: i32) -> i8x32 { let c = a; - c.replace(index as u32, i) + c.replace(index as u32 & 31, i) +} + +/// Copy `a` to result, and insert the 16-bit integer `i` into result +/// at the location specified by `index`. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_insert_epi16(a: i16x16, i: i16, index: i32) -> i16x16 { + let c = a; + c.replace(index as u32 & 15, i) } pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { @@ -2002,4 +2011,16 @@ mod tests { 25, 26, 27, 28, 29, 30, 31, 0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insert_epi16() { + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let r = avx::_mm256_insert_epi16(a, 0, 15); + let e = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 0); + assert_eq!(r, e); + } } From d1a92688bd2051c75cf531f9a679e54e7c69c9f4 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:55:20 +0200 Subject: [PATCH 25/29] avx: _mm256_insert_epi32 --- src/x86/avx.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 6b3759bc45..eb7394161e 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1129,6 +1129,15 @@ pub unsafe fn _mm256_insert_epi16(a: i16x16, i: i16, index: i32) -> i16x16 { c.replace(index as u32 & 15, i) } +/// Copy `a` to result, and insert the 32-bit integer `i` into result +/// at the location specified by `index`. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_insert_epi32(a: i32x8, i: i32, index: i32) -> i32x8 { + let c = a; + c.replace(index as u32 & 7, i) +} + pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) @@ -2023,4 +2032,12 @@ mod tests { 8, 9, 10, 11, 12, 13, 14, 0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insert_epi32() { + let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx::_mm256_insert_epi32(a, 0, 7); + let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 0); + assert_eq!(r, e); + } } From 56b4f9f37da3aada1deefd6fca029bce7f5e0b5a Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:59:33 +0200 Subject: [PATCH 26/29] avx: _mm256_insert_epi64 --- src/x86/avx.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index eb7394161e..019656f9c1 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1138,6 +1138,15 @@ pub unsafe fn _mm256_insert_epi32(a: i32x8, i: i32, index: i32) -> i32x8 { c.replace(index as u32 & 7, i) } +/// Copy `a` to result, and insert the 64-bit integer `i` into result +/// at the location specified by `index`. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_insert_epi64(a: i64x4, i: i64, index: i32) -> i64x4 { + let c = a; + c.replace(index as u32 & 3, i) +} + pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) @@ -2040,4 +2049,12 @@ mod tests { let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insert_epi64() { + let a = i64x4::new(1, 2, 3, 4); + let r = avx::_mm256_insert_epi64(a, 0, 3); + let e = i64x4::new(1, 2, 3, 0); + assert_eq!(r, e); + } } From 4f998dcca42ac49e5089028302efb08e7637b272 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 19:42:28 +0200 Subject: [PATCH 27/29] Try to fix i586 build --- src/x86/avx.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 019656f9c1..2bec009517 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -7,9 +7,6 @@ use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8}; use v128::{f32x4, f64x2, i32x4, i64x2}; use v256::*; -use x86::sse::_mm_undefined_ps; -use x86::sse2::_mm_undefined_pd; - /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. #[inline(always)] @@ -556,7 +553,7 @@ pub const _CMP_TRUE_US: u8 = 0x1f; /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. #[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse2"] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { macro_rules! call { @@ -582,7 +579,7 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 { /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. 
#[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse"] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { macro_rules! call { @@ -609,7 +606,7 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { /// store the result in the lower element of returned vector, /// and copy the upper element from `a` to the upper element of returned vector. #[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse2"] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { macro_rules! call { @@ -624,7 +621,7 @@ pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { /// and copy the upper 3 packed elements from `a` to the upper elements of /// returned vector. #[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse"] #[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { macro_rules! call { @@ -859,9 +856,11 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `imm8`. #[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse"] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 { + use x86::sse::_mm_undefined_ps; + let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ($a:expr, $b:expr, $c:expr, $d:expr) => { @@ -969,9 +968,11 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `imm8`. 
#[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse2"] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 { + use x86::sse2::_mm_undefined_pd; + let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle2 { ($a:expr, $b:expr) => { From c18e3d340a1aebcd6d1ce457ed240b42654a53c3 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 19:56:19 +0200 Subject: [PATCH 28/29] Fix missing inline and target_feature --- src/x86/avx.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 2bec009517..163bc8a0cb 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1148,6 +1148,10 @@ pub unsafe fn _mm256_insert_epi64(a: i64x4, i: i64, index: i32) -> i64x4 { c.replace(index as u32 & 3, i) } +/// Casts vector of type __m128 to type __m256; +/// the upper 128 bits of the result are undefined. +#[inline(always)] +#[target_feature = "+avx"] pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) From 5744c9590ebd8b9741efa274ddd933eeacc6bc6d Mon Sep 17 00:00:00 2001 From: gwenn Date: Mon, 9 Oct 2017 18:28:16 +0200 Subject: [PATCH 29/29] sse: fix _mm_undefined_ps --- src/x86/sse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x86/sse.rs b/src/x86/sse.rs index 22a6dd75bb..26127fe2c3 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -872,7 +872,7 @@ pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) { #[inline(always)] #[target_feature = "+sse"] pub unsafe fn _mm_undefined_ps() -> f32x4 { - mem::uninitialized() + f32x4::splat(mem::uninitialized()) } #[allow(improper_ctypes)]