From 4350febabce15b893591d767194af624450c0962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 11:09:55 +0100 Subject: [PATCH 01/23] sse4.1: _mm_blendv_ps and _mm_blendv_pd --- src/x86/sse41.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 2cd36f74c6..473c382ff4 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -15,6 +15,22 @@ pub unsafe fn _mm_blendv_epi8( pblendvb(a, b, mask) } +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(blendvpd))] +pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 { + blendvpd(a, b, mask) +} + +/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(blendvps))] +pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 { + blendvps(a, b, mask) +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -53,6 +69,10 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { extern { #[link_name = "llvm.x86.sse41.pblendvb"] fn pblendvb(a: __m128i, b: __m128i, mask: __m128i) -> __m128i; + #[link_name = "llvm.x86.sse41.blendvpd"] + fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2; + #[link_name = "llvm.x86.sse41.blendvps"] + fn blendvps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.sse41.dpps"] @@ -79,6 +99,26 @@ mod tests { assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_blendv_pd() { + let a = f64x2::splat(0.0); + let b = f64x2::splat(1.0); + let mask = ::std::mem::transmute(i64x2::new(0, -1)); + let r = sse41::_mm_blendv_pd(a, b, mask); + let e = f64x2::new(0.0, 1.0); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_blendv_ps() { + let a = f32x4::splat(0.0); + let b = f32x4::splat(1.0); + let mask = ::std::mem::transmute(i32x4::new(0,-1, 0, -1)); + let r = sse41::_mm_blendv_ps(a, b, mask); + let e = f32x4::new(0.0, 1.0, 0.0, 1.0); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From 9e015df40c5c61432fc61476546017d8a205c818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 11:17:00 +0100 Subject: [PATCH 02/23] sse4.1: _mm_blend_ps and _mm_blend_pd - HACK warning: messing with the constify macros - Selecting only one buffer gets optimized away and tests need to take this into account --- src/x86/macros.rs | 37 +++++++++++++++++++++++++++++++++++++ src/x86/sse41.rs | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/src/x86/macros.rs b/src/x86/macros.rs index ebe1015181..9c0b9abf93 100644 --- a/src/x86/macros.rs +++ b/src/x86/macros.rs @@ -261,3 +261,40 @@ macro_rules! constify_imm8 { } } } + + +macro_rules! 
constify_imm4 { + ($imm4:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match $imm4 & 0b1111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + 7 => $expand!(7), + 8 => $expand!(8), + 9 => $expand!(9), + 10 => $expand!(10), + 11 => $expand!(11), + 12 => $expand!(12), + 13 => $expand!(13), + 14 => $expand!(14), + _ => $expand!(15), + } + } +} + +macro_rules! constify_imm2 { + ($imm2:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match $imm2 & 0b11 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + _ => $expand!(3), + } + } +} diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 473c382ff4..1cca9a7a82 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -31,6 +31,28 @@ pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 { blendvps(a, b, mask) } +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm2` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(blendpd, imm2=0b10))] +pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 { + macro_rules! call { + ($imm2:expr) => { blendpd(a, b, $imm2) } + } + constify_imm2!(imm2, call) +} + +/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using mask `imm4` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(blendps, imm4=0b0101))] +pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { + macro_rules! call { + ($imm4:expr) => { blendps(a, b, $imm4) } + } + constify_imm4!(imm4, call) +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -73,6 +95,10 @@ extern { fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2; #[link_name = "llvm.x86.sse41.blendvps"] fn blendvps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4; + #[link_name = "llvm.x86.sse41.blendpd"] + fn blendpd(a: f64x2, b: f64x2, imm2: u8) -> f64x2; + #[link_name = "llvm.x86.sse41.blendps"] + fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.sse41.dpps"] @@ -119,6 +145,24 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_blend_pd() { + let a = f64x2::splat(0.0); + let b = f64x2::splat(1.0); + let r = sse41::_mm_blend_pd(a, b, 0b10); + let e = f64x2::new(0.0, 1.0); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_blend_ps() { + let a = f32x4::splat(0.0); + let b = f32x4::splat(1.0); + let r = sse41::_mm_blend_ps(a, b, 0b1010); + let e = f32x4::new(0.0, 1.0, 0.0, 1.0); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From ec25f71b2d0d1b30a0e3d2a66da75447127cfaa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 11:49:06 +0100 Subject: [PATCH 03/23] sse4.1: _mm_blend_epi16 --- src/x86/sse41.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 1cca9a7a82..98c4f5635b 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -15,6 +15,16 @@ pub unsafe fn _mm_blendv_epi8( pblendvb(a, b, mask) } +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pblendw, imm8=0xF0))] +pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 { + macro_rules! 
call { + ($imm8:expr) => { pblendw(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask` #[inline(always)] #[target_feature = "+sse4.1"] @@ -99,6 +109,8 @@ extern { fn blendpd(a: f64x2, b: f64x2, imm2: u8) -> f64x2; #[link_name = "llvm.x86.sse41.blendps"] fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4; + #[link_name = "llvm.x86.sse41.pblendw"] + fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.sse41.dpps"] @@ -163,6 +175,15 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_blend_epi16() { + let a = i16x8::splat(0); + let b = i16x8::splat(1); + let r = sse41::_mm_blend_epi16(a, b, 0b1010_1100); + let e = i16x8::new(0, 0, 1, 1, 0, 1, 0, 1); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From 3ea44a3b733451ed3b4355aa98b76961305e4002 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 14:12:33 +0100 Subject: [PATCH 04/23] sse4.1: _mm_extract_ps --- src/x86/sse41.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 98c4f5635b..9effc28fc7 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -1,3 +1,6 @@ + +use std::mem; + #[cfg(test)] use stdsimd_test::assert_instr; @@ -63,6 +66,14 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { constify_imm4!(imm4, call) } +/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(extractps, imm8=0))] +pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { + mem::transmute(a.extract((imm8 & 0b11) as u32)) +} + /// Returns the dot product of two f64x2 vectors. 
/// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. @@ -119,6 +130,8 @@ extern { #[cfg(test)] mod tests { + use std::mem; + use stdsimd_test::simd_test; use v128::*; @@ -184,6 +197,17 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_extract_ps() { + let a = f32x4::new(0.0, 1.0, 2.0, 3.0); + + let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1)); + assert_eq!(r, 1.0); + + let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5)); + assert_eq!(r, 1.0); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From 88f3992c692c08bb3031be02fa632cfefc05aee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 15:28:03 +0100 Subject: [PATCH 05/23] sse4.1: _mm_extract_epi8 --- src/x86/sse41.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 9effc28fc7..fb8c9786ad 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -74,6 +74,14 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { mem::transmute(a.extract((imm8 & 0b11) as u32)) } +/// Extract an 8-bit integer from `a` selected with `imm8` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pextrb, imm8=0))] +pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 { + a.extract((imm8 & 0b111) as u32) as i32 +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -208,6 +216,17 @@ mod tests { assert_eq!(r, 1.0); } + #[simd_test = "sse4.1"] + unsafe fn _mm_extract_epi8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + let r = sse41::_mm_extract_epi8(a, 1); + assert_eq!(r, 1); + + let r = sse41::_mm_extract_epi8(a, 17); + assert_eq!(r, 1); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From 38981a8f8919c7aeb5fb70ec0ca1c59049b4b759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 15:46:44 +0100 Subject: [PATCH 06/23] sse4.1: _mm_extract_epi32 --- src/x86/sse41.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index fb8c9786ad..4f0b91787a 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -82,6 +82,14 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 { a.extract((imm8 & 0b111) as u32) as i32 } +/// Extract an 32-bit integer from `a` selected with `imm8` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pextrd, imm8=1))] +pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { + a.extract((imm8 & 0b11) as u32) as i32 +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -227,6 +235,17 @@ mod tests { assert_eq!(r, 1); } + #[simd_test = "sse4.1"] + unsafe fn _mm_extract_epi32() { + let a = i32x4::new(0, 1, 2, 3); + + let r = sse41::_mm_extract_epi32(a, 1); + assert_eq!(r, 1); + + let r = sse41::_mm_extract_epi32(a, 5); + assert_eq!(r, 1); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From aa1f042c109ea2963bebc45748371f6030c8a538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 15:57:14 +0100 Subject: [PATCH 07/23] sse4.1: _mm_extract_epi64 --- src/x86/sse41.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 4f0b91787a..4f5eb02e28 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -87,7 +87,15 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 { #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrd, imm8=1))] pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { - a.extract((imm8 & 0b11) as u32) as i32 + a.extract((imm8 & 0b11) as u32) +} + +/// Extract an 64-bit integer from `a` selected with `imm8` +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pextrq, imm8=1))] +pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { + a.extract((imm8 & 0b1) as u32) } /// Returns the dot product of two f64x2 vectors. 
@@ -246,6 +254,17 @@ mod tests { assert_eq!(r, 1); } + #[simd_test = "sse4.1"] + unsafe fn _mm_extract_epi64() { + let a = i64x2::new(0, 1); + + let r = sse41::_mm_extract_epi64(a, 1); + assert_eq!(r, 1); + + let r = sse41::_mm_extract_epi64(a, 3); + assert_eq!(r, 1); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From 6c4607564e033ebf72c462e5a0878bfc56fcefc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 18:54:29 +0100 Subject: [PATCH 08/23] sse4.1: _mm_insert_ps --- src/x86/sse41.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 4f5eb02e28..76e635d8ce 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -98,6 +98,38 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { a.extract((imm8 & 0b1) as u32) } +/// Select a single value in `a` to store at some position in `b`, +/// Then zero elements according to `imm8`. +/// +/// `imm8` specifies which bits from operand `a` will be copied, which bits in the +/// result they will be copied to, and which bits in the result will be +/// cleared. The following assignments are made: +/// +/// * Bits `[7:6]` specify the bits to copy from operand `a`: +/// - `00`: Selects bits `[31:0]` from operand `a`. +/// - `01`: Selects bits `[63:32]` from operand `a`. +/// - `10`: Selects bits `[95:64]` from operand `a`. +/// - `11`: Selects bits `[127:96]` from operand `a`. +/// +/// * Bits `[5:4]` specify the bits in the result to which the selected bits +/// from operand `a` are copied: +/// - `00`: Copies the selected bits from `a` to result bits `[31:0]`. +/// - `01`: Copies the selected bits from `a` to result bits `[63:32]`. +/// - `10`: Copies the selected bits from `a` to result bits `[95:64]`. +/// - `11`: Copies the selected bits from `a` to result bits `[127:96]`. 
+/// +/// * Bits `[3:0]`: If any of these bits are set, the corresponding result +/// element is cleared. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(insertps, imm8=0b1010))] +pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { + macro_rules! call { + ($imm8:expr) => { insertps(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. @@ -146,6 +178,8 @@ extern { fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4; #[link_name = "llvm.x86.sse41.pblendw"] fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8; + #[link_name = "llvm.x86.sse41.insertps"] + fn insertps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.sse41.dpps"] @@ -265,6 +299,15 @@ mod tests { assert_eq!(r, 1); } + #[simd_test = "sse4.1"] + unsafe fn _mm_insert_ps() { + let a = f32x4::splat(1.0); + let b = f32x4::new(1.0, 2.0, 3.0, 4.0); + let r = sse41::_mm_insert_ps(a, b, 0b11_00_1100); + let e = f32x4::new(4.0, 1.0, 0.0, 0.0); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From fd1506e695f6be251889c0e6338ad76bd32dc93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 19:12:39 +0100 Subject: [PATCH 09/23] sse4.1: _mm_insert_epi8 --- src/x86/sse41.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 76e635d8ce..8ad26a5467 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -124,12 +124,20 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(insertps, imm8=0b1010))] pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { - macro_rules! call { + macro_rules! 
call { ($imm8:expr) => { insertps(a, b, $imm8) } } constify_imm8!(imm8, call) } +/// Return a copy of `a` with an 8-bit integer from `i` inserted at a location specified by `imm8`. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pinsrb, imm8=0))] +pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 { + a.replace((imm8 & 0b111) as u32, i) +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. @@ -308,6 +316,19 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_insert_epi8() { + let a = i8x16::splat(0); + + let r = sse41::_mm_insert_epi8(a, 32, 1); + let e = i8x16::splat(0).replace(1, 32); + assert_eq!(r, e); + + let r = sse41::_mm_insert_epi8(a, 32, 17); + let e = i8x16::splat(0).replace(1, 32); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From 137586103f5c9d4f2bff5af6036701c9127f2b39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 21:16:43 +0100 Subject: [PATCH 10/23] sse4.1: _mm_insert_epi32 and _mm_insert_epi64 --- src/x86/sse41.rs | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 8ad26a5467..ec073d1563 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -79,7 +79,7 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrb, imm8=0))] pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 { - a.extract((imm8 & 0b111) as u32) as i32 + a.extract((imm8 & 0b1111) as u32) as i32 } /// Extract an 32-bit integer from `a` selected with `imm8` @@ -130,12 +130,28 @@ pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { constify_imm8!(imm8, call) } -/// Return a copy of `a` with an 8-bit integer from `i` inserted at a location 
specified by `imm8`. +/// Return a copy of `a` with the 8-bit integer from `i` inserted at a location specified by `imm8`. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pinsrb, imm8=0))] pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 { - a.replace((imm8 & 0b111) as u32, i) + a.replace((imm8 & 0b1111) as u32, i) +} + +/// Return a copy of `a` with the 32-bit integer from `i` inserted at a location specified by `imm8`. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pinsrd, imm8=0))] +pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 { + a.replace((imm8 & 0b11) as u32, i) +} + +/// Return a copy of `a` with the 64-bit integer from `i` inserted at a location specified by `imm8`. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pinsrq, imm8=0))] +pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 { + a.replace((imm8 & 0b1) as u32, i) } /// Returns the dot product of two f64x2 vectors. 
@@ -329,6 +345,32 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "sse4.1"] + unsafe fn _mm_insert_epi32() { + let a = i32x4::splat(0); + + let r = sse41::_mm_insert_epi32(a, 32, 1); + let e = i32x4::splat(0).replace(1, 32); + assert_eq!(r, e); + + let r = sse41::_mm_insert_epi32(a, 32, 5); + let e = i32x4::splat(0).replace(1, 32); + assert_eq!(r, e); + } + + #[simd_test = "sse4.1"] + unsafe fn _mm_insert_epi64() { + let a = i64x2::splat(0); + + let r = sse41::_mm_insert_epi64(a, 32, 1); + let e = i64x2::splat(0).replace(1, 32); + assert_eq!(r, e); + + let r = sse41::_mm_insert_epi64(a, 32, 3); + let e = i64x2::splat(0).replace(1, 32); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From e5dab3a11c0603713df744c6e10f6358904fa90c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 21:32:12 +0100 Subject: [PATCH 11/23] Formatting --- src/x86/sse41.rs | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index ec073d1563..d976739136 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -236,7 +236,7 @@ mod tests { unsafe fn _mm_blendv_pd() { let a = f64x2::splat(0.0); let b = f64x2::splat(1.0); - let mask = ::std::mem::transmute(i64x2::new(0, -1)); + let mask = mem::transmute(i64x2::new(0, -1)); let r = sse41::_mm_blendv_pd(a, b, mask); let e = f64x2::new(0.0, 1.0); assert_eq!(r, e); @@ -246,7 +246,7 @@ mod tests { unsafe fn _mm_blendv_ps() { let a = f32x4::splat(0.0); let b = f32x4::splat(1.0); - let mask = ::std::mem::transmute(i32x4::new(0,-1, 0, -1)); + let mask = mem::transmute(i32x4::new(0,-1, 0, -1)); let r = sse41::_mm_blendv_ps(a, b, mask); let e = f32x4::new(0.0, 1.0, 0.0, 1.0); assert_eq!(r, e); @@ -282,10 +282,8 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_extract_ps() { let a = f32x4::new(0.0, 1.0, 2.0, 3.0); - let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1)); assert_eq!(r, 1.0); 
- let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5)); assert_eq!(r, 1.0); } @@ -293,10 +291,8 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_extract_epi8() { let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = sse41::_mm_extract_epi8(a, 1); assert_eq!(r, 1); - let r = sse41::_mm_extract_epi8(a, 17); assert_eq!(r, 1); } @@ -304,10 +300,8 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_extract_epi32() { let a = i32x4::new(0, 1, 2, 3); - let r = sse41::_mm_extract_epi32(a, 1); assert_eq!(r, 1); - let r = sse41::_mm_extract_epi32(a, 5); assert_eq!(r, 1); } @@ -315,10 +309,8 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_extract_epi64() { let a = i64x2::new(0, 1); - let r = sse41::_mm_extract_epi64(a, 1); assert_eq!(r, 1); - let r = sse41::_mm_extract_epi64(a, 3); assert_eq!(r, 1); } @@ -335,11 +327,9 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_insert_epi8() { let a = i8x16::splat(0); - let r = sse41::_mm_insert_epi8(a, 32, 1); let e = i8x16::splat(0).replace(1, 32); assert_eq!(r, e); - let r = sse41::_mm_insert_epi8(a, 32, 17); let e = i8x16::splat(0).replace(1, 32); assert_eq!(r, e); @@ -348,11 +338,9 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_insert_epi32() { let a = i32x4::splat(0); - let r = sse41::_mm_insert_epi32(a, 32, 1); let e = i32x4::splat(0).replace(1, 32); assert_eq!(r, e); - let r = sse41::_mm_insert_epi32(a, 32, 5); let e = i32x4::splat(0).replace(1, 32); assert_eq!(r, e); @@ -361,7 +349,6 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_insert_epi64() { let a = i64x2::splat(0); - let r = sse41::_mm_insert_epi64(a, 32, 1); let e = i64x2::splat(0).replace(1, 32); assert_eq!(r, e); From 08574d8f02395f417cab2a179cf8fae326fa1126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 4 Oct 2017 22:18:35 +0100 Subject: [PATCH 12/23] sse4.1: _mm_max_epi8, _mm_max_epu16, _mm_max_epi32 and _mm_max_epu32 --- src/x86/sse41.rs | 76 
++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index d976739136..1d3eec6c2c 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -154,6 +154,38 @@ pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 { a.replace((imm8 & 0b1) as u32, i) } +/// Compare packed 8-bit integers in `a` and `b`,87 and return packed maximum values in dst. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmaxsb, imm8=0))] +pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 { + pmaxsb(a, b) +} + +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmaxuw, imm8=0))] +pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 { + pmaxuw(a, b) +} + +// Compare packed 32-bit integers in `a` and `b`, and return packed maximum values. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmaxsd, imm8=0))] +pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 { + pmaxsd(a, b) +} + +// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values. +#[inline(always)] +#[target_feature = "+sse4.1"] +#[cfg_attr(test, assert_instr(pmaxud, imm8=0))] +pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 { + pmaxud(a, b) +} + /// Returns the dot product of two f64x2 vectors. /// /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. 
@@ -204,6 +236,14 @@ extern { fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8; #[link_name = "llvm.x86.sse41.insertps"] fn insertps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; + #[link_name = "llvm.x86.sse41.pmaxsb"] + fn pmaxsb(a: i8x16, b: i8x16) -> i8x16; + #[link_name = "llvm.x86.sse41.pmaxuw"] + fn pmaxuw(a: u16x8, b: u16x8) -> u16x8; + #[link_name = "llvm.x86.sse41.pmaxsd"] + fn pmaxsd(a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.sse41.pmaxud"] + fn pmaxud(a: u32x4, b: u32x4) -> u32x4; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.sse41.dpps"] @@ -358,6 +398,42 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "avx"] + unsafe fn _mm_max_epi8() { + let a = i8x16::new(1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32); + let b = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31); + let r = sse41::_mm_max_epi8(a, b); + let e = i8x16::new(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm_max_epu16() { + let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16); + let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15); + let r = sse41::_mm_max_epu16(a, b); + let e = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm_max_epi32() { + let a = i32x4::new(1, 4, 5, 8); + let b = i32x4::new(2, 3, 6, 7); + let r = sse41::_mm_max_epi32(a, b); + let e = i32x4::new(2, 4, 6, 8); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm_max_epu32() { + let a = u32x4::new(1, 4, 5, 8); + let b = u32x4::new(2, 3, 6, 7); + let r = sse41::_mm_max_epu32(a, b); + let e = u32x4::new(2, 4, 6, 8); + assert_eq!(r, e); + } + #[simd_test = "sse4.1"] unsafe fn _mm_dp_pd() { let a = f64x2::new(2.0, 3.0); From 7fe034545386dfbdff891402afe2c9e9dc0797cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Fri, 6 Oct 2017 11:45:59 +0100 Subject: [PATCH 13/23] Fix 
wrong compiler flag - avx -> sse4.1 --- src/x86/sse41.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 1d3eec6c2c..90e96aa05e 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -367,38 +367,34 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_insert_epi8() { let a = i8x16::splat(0); - let r = sse41::_mm_insert_epi8(a, 32, 1); let e = i8x16::splat(0).replace(1, 32); + let r = sse41::_mm_insert_epi8(a, 32, 1); assert_eq!(r, e); let r = sse41::_mm_insert_epi8(a, 32, 17); - let e = i8x16::splat(0).replace(1, 32); assert_eq!(r, e); } #[simd_test = "sse4.1"] unsafe fn _mm_insert_epi32() { let a = i32x4::splat(0); - let r = sse41::_mm_insert_epi32(a, 32, 1); let e = i32x4::splat(0).replace(1, 32); + let r = sse41::_mm_insert_epi32(a, 32, 1); assert_eq!(r, e); let r = sse41::_mm_insert_epi32(a, 32, 5); - let e = i32x4::splat(0).replace(1, 32); assert_eq!(r, e); } #[simd_test = "sse4.1"] unsafe fn _mm_insert_epi64() { let a = i64x2::splat(0); - let r = sse41::_mm_insert_epi64(a, 32, 1); let e = i64x2::splat(0).replace(1, 32); + let r = sse41::_mm_insert_epi64(a, 32, 1); assert_eq!(r, e); - let r = sse41::_mm_insert_epi64(a, 32, 3); - let e = i64x2::splat(0).replace(1, 32); assert_eq!(r, e); } - #[simd_test = "avx"] + #[simd_test = "sse4.1"] unsafe fn _mm_max_epi8() { let a = i8x16::new(1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32); let b = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31); @@ -407,7 +403,7 @@ mod tests { assert_eq!(r, e); } - #[simd_test = "avx"] + #[simd_test = "sse4.1"] unsafe fn _mm_max_epu16() { let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16); let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15); @@ -416,7 +412,7 @@ mod tests { assert_eq!(r, e); } - #[simd_test = "avx"] + #[simd_test = "sse4.1"] unsafe fn _mm_max_epi32() { let a = i32x4::new(1, 4, 5, 8); let b = i32x4::new(2, 3, 6, 7); @@ -425,7 +421,7 @@ mod tests { assert_eq!(r, e); } - 
#[simd_test = "avx"] + #[simd_test = "sse4.1"] unsafe fn _mm_max_epu32() { let a = u32x4::new(1, 4, 5, 8); let b = u32x4::new(2, 3, 6, 7); From 60b115649c04d3ef5b08b57e120a667835719bff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Fri, 6 Oct 2017 14:56:34 +0100 Subject: [PATCH 14/23] Fix intrinsics that only work with x86-64 --- src/x86/sse41.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 90e96aa05e..927459819b 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -91,6 +91,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { } /// Extract an 64-bit integer from `a` selected with `imm8` +#[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrq, imm8=1))] @@ -147,6 +148,7 @@ pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 { } /// Return a copy of `a` with the 64-bit integer from `i` inserted at a location specified by `imm8`. 
+#[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pinsrq, imm8=0))] @@ -346,6 +348,7 @@ mod tests { assert_eq!(r, 1); } + #[cfg(target_arch = "x86_64")] #[simd_test = "sse4.1"] unsafe fn _mm_extract_epi64() { let a = i64x2::new(0, 1); @@ -384,6 +387,7 @@ mod tests { assert_eq!(r, e); } + #[cfg(target_arch = "x86_64")] #[simd_test = "sse4.1"] unsafe fn _mm_insert_epi64() { let a = i64x2::splat(0); From 2373618d628a793df64b95a0f21ad5cb75093223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Mon, 9 Oct 2017 13:41:43 +0100 Subject: [PATCH 15/23] sse4.1: use appropriate types --- src/x86/sse41.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 927459819b..15df849569 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -1,6 +1,3 @@ - -use std::mem; - #[cfg(test)] use stdsimd_test::assert_instr; @@ -69,17 +66,17 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(extractps, imm8=0))] -pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { - mem::transmute(a.extract((imm8 & 0b11) as u32)) +#[cfg_attr(test, assert_instr(extractps, imm8=2))] +pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> f32 { + a.extract(imm8 as u32 & 0b11) } /// Extract an 8-bit integer from `a` selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrb, imm8=0))] -pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 { - a.extract((imm8 & 0b1111) as u32) as i32 +pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 { + a.extract((imm8 & 0b1111) as u32) } /// Extract an 32-bit integer from `a` selected with `imm8` @@ -324,9 +321,9 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn 
_mm_extract_ps() { let a = f32x4::new(0.0, 1.0, 2.0, 3.0); - let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1)); + let r = sse41::_mm_extract_ps(a, 1); assert_eq!(r, 1.0); - let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5)); + let r = sse41::_mm_extract_ps(a, 5); assert_eq!(r, 1.0); } From b80b3a69acad221930f217855ed697eb9315b740 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 10 Oct 2017 09:33:18 +0100 Subject: [PATCH 16/23] Revert '_mm_extract_ps' to return i32 --- src/x86/sse41.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 15df849569..b14680e15c 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -1,3 +1,6 @@ + +use std::mem; + #[cfg(test)] use stdsimd_test::assert_instr; @@ -66,9 +69,9 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(extractps, imm8=2))] -pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> f32 { - a.extract(imm8 as u32 & 0b11) +#[cfg_attr(test, assert_instr(extractps, imm8=0))] +pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { + mem::transmute(a.extract(imm8 as u32 & 0b11)) } /// Extract an 8-bit integer from `a` selected with `imm8` @@ -321,9 +324,9 @@ mod tests { #[simd_test = "sse4.1"] unsafe fn _mm_extract_ps() { let a = f32x4::new(0.0, 1.0, 2.0, 3.0); - let r = sse41::_mm_extract_ps(a, 1); + let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1)); assert_eq!(r, 1.0); - let r = sse41::_mm_extract_ps(a, 5); + let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5)); assert_eq!(r, 1.0); } From bab5cd32c5debd56aa570e726fe9b04965a7173a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 10 Oct 2017 10:04:31 +0100 Subject: [PATCH 17/23] sse4.1: Use the v128 types for consistency --- src/x86/sse41.rs 
| 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index b14680e15c..d10ad987c6 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -5,16 +5,11 @@ use std::mem; use stdsimd_test::assert_instr; use v128::*; -use x86::__m128i; #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pblendvb))] -pub unsafe fn _mm_blendv_epi8( - a: __m128i, - b: __m128i, - mask: __m128i, -) -> __m128i { +pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 { pblendvb(a, b, mask) } @@ -225,7 +220,7 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { #[allow(improper_ctypes)] extern { #[link_name = "llvm.x86.sse41.pblendvb"] - fn pblendvb(a: __m128i, b: __m128i, mask: __m128i) -> __m128i; + fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16; #[link_name = "llvm.x86.sse41.blendvpd"] fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2; #[link_name = "llvm.x86.sse41.blendvps"] From 9c473808d334acedd46060b32ceea116662bf6a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 17 Oct 2017 09:44:21 +0100 Subject: [PATCH 18/23] Try fix for windows --- src/x86/macros.rs | 10 +++++++++- src/x86/sse41.rs | 36 ++++++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/x86/macros.rs b/src/x86/macros.rs index 538dcc0d9d..5a195feca7 100644 --- a/src/x86/macros.rs +++ b/src/x86/macros.rs @@ -338,4 +338,12 @@ macro_rules! constify_imm2 { } } - +macro_rules! 
constify_imm1 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match $imm8 & 0b1 { + 0 => $expand!(0), + _ => $expand!(1), + } + } +} diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index d10ad987c6..c69fdbd059 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -64,34 +64,46 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(extractps, imm8=0))] -pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { - mem::transmute(a.extract(imm8 as u32 & 0b11)) +#[cfg_attr(test, assert_instr(extractps, imm2=0))] +pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 { + macro_rules! call { + ($imm2:expr) => { mem::transmute(a.extract($imm2)) } + } + constify_imm2!(imm2, call) } /// Extract an 8-bit integer from `a` selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrb, imm8=0))] -pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 { - a.extract((imm8 & 0b1111) as u32) +#[cfg_attr(test, assert_instr(pextrb, imm4=0))] +pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 { + macro_rules! call { + ($imm4:expr) => { a.extract($imm4) } + } + constify_imm4!(imm4, call) } /// Extract an 32-bit integer from `a` selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrd, imm8=1))] -pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { - a.extract((imm8 & 0b11) as u32) +#[cfg_attr(test, assert_instr(pextrd, imm2=1))] +pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 { + macro_rules! 
call { + ($imm2:expr) => { a.extract($imm2) } + } + constify_imm2!(imm2, call) } /// Extract an 64-bit integer from `a` selected with `imm8` #[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrq, imm8=1))] -pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { - a.extract((imm8 & 0b1) as u32) +#[cfg_attr(test, assert_instr(pextrq, imm1=1))] +pub unsafe fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 { + macro_rules! call { + ($imm1:expr) => { a.extract($imm1) } + } + constify_imm1!(imm1, call) } /// Select a single value in `a` to store at some position in `b`, From 12936e9976bc6b0e4e538d82f55f0ee2d87a7f25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 17 Oct 2017 14:49:29 +0100 Subject: [PATCH 19/23] Try "vectorcall" calling convention --- src/lib.rs | 3 ++- src/x86/sse41.rs | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9d87c0f1f3..b1e298167c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -111,9 +111,10 @@ #![allow(dead_code)] #![allow(unused_features)] + #![feature( const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi, - target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new + target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new, abi_vectorcall )] #![cfg_attr(test, feature(proc_macro, test))] diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index c69fdbd059..89b2ac5710 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -65,7 +65,7 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(extractps, imm2=0))] -pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 { +pub unsafe extern "vectorcall" fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 { macro_rules! 
call { ($imm2:expr) => { mem::transmute(a.extract($imm2)) } } @@ -76,7 +76,7 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrb, imm4=0))] -pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 { +pub unsafe extern "vectorcall" fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 { macro_rules! call { ($imm4:expr) => { a.extract($imm4) } } @@ -87,7 +87,7 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrd, imm2=1))] -pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 { +pub unsafe extern "vectorcall" fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 { macro_rules! call { ($imm2:expr) => { a.extract($imm2) } } @@ -99,7 +99,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrq, imm1=1))] -pub unsafe fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 { +pub unsafe extern "vectorcall" fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 { macro_rules! call { ($imm1:expr) => { a.extract($imm1) } } From 11d745cec8f68f617691eadf477d41e00995d972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 17 Oct 2017 14:58:47 +0100 Subject: [PATCH 20/23] Revert "Try "vectorcall" calling convention" This reverts commit 12936e9976bc6b0e4e538d82f55f0ee2d87a7f25. 
--- src/lib.rs | 3 +-- src/x86/sse41.rs | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b1e298167c..9d87c0f1f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -111,10 +111,9 @@ #![allow(dead_code)] #![allow(unused_features)] - #![feature( const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi, - target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new, abi_vectorcall + target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new )] #![cfg_attr(test, feature(proc_macro, test))] diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 89b2ac5710..c69fdbd059 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -65,7 +65,7 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(extractps, imm2=0))] -pub unsafe extern "vectorcall" fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 { +pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 { macro_rules! call { ($imm2:expr) => { mem::transmute(a.extract($imm2)) } } @@ -76,7 +76,7 @@ pub unsafe extern "vectorcall" fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrb, imm4=0))] -pub unsafe extern "vectorcall" fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 { +pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 { macro_rules! call { ($imm4:expr) => { a.extract($imm4) } } @@ -87,7 +87,7 @@ pub unsafe extern "vectorcall" fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrd, imm2=1))] -pub unsafe extern "vectorcall" fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 { +pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 { macro_rules! 
call { ($imm2:expr) => { a.extract($imm2) } } @@ -99,7 +99,7 @@ pub unsafe extern "vectorcall" fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 { #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pextrq, imm1=1))] -pub unsafe extern "vectorcall" fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 { +pub unsafe fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 { macro_rules! call { ($imm1:expr) => { a.extract($imm1) } } From adc7abcc1e6538c11b8585bc6bf7bea4fc8746a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 17 Oct 2017 14:58:52 +0100 Subject: [PATCH 21/23] Revert "Try fix for windows" This reverts commit 9c473808d334acedd46060b32ceea116662bf6a3. --- src/x86/macros.rs | 10 +--------- src/x86/sse41.rs | 36 ++++++++++++------------------------ 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/src/x86/macros.rs b/src/x86/macros.rs index 5a195feca7..538dcc0d9d 100644 --- a/src/x86/macros.rs +++ b/src/x86/macros.rs @@ -338,12 +338,4 @@ macro_rules! constify_imm2 { } } -macro_rules! constify_imm1 { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match $imm8 & 0b1 { - 0 => $expand!(0), - _ => $expand!(1), - } - } -} + diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index c69fdbd059..d10ad987c6 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -64,46 +64,34 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(extractps, imm2=0))] -pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 { - macro_rules! 
call { - ($imm2:expr) => { mem::transmute(a.extract($imm2)) } - } - constify_imm2!(imm2, call) +#[cfg_attr(test, assert_instr(extractps, imm8=0))] +pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { + mem::transmute(a.extract(imm8 as u32 & 0b11)) } /// Extract an 8-bit integer from `a` selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrb, imm4=0))] -pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 { - macro_rules! call { - ($imm4:expr) => { a.extract($imm4) } - } - constify_imm4!(imm4, call) +#[cfg_attr(test, assert_instr(pextrb, imm8=0))] +pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 { + a.extract((imm8 & 0b1111) as u32) } /// Extract an 32-bit integer from `a` selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrd, imm2=1))] -pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 { - macro_rules! call { - ($imm2:expr) => { a.extract($imm2) } - } - constify_imm2!(imm2, call) +#[cfg_attr(test, assert_instr(pextrd, imm8=1))] +pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { + a.extract((imm8 & 0b11) as u32) } /// Extract an 64-bit integer from `a` selected with `imm8` #[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrq, imm1=1))] -pub unsafe fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 { - macro_rules! 
call { - ($imm1:expr) => { a.extract($imm1) } - } - constify_imm1!(imm1, call) +#[cfg_attr(test, assert_instr(pextrq, imm8=1))] +pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { + a.extract((imm8 & 0b1) as u32) } /// Select a single value in `a` to store at some position in `b`, From 5456fbfedb24358718fab53aad0fd0e71aaddc0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Tue, 17 Oct 2017 15:07:42 +0100 Subject: [PATCH 22/23] Change tests for windows --- src/x86/sse41.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index d10ad987c6..ab6abe34f8 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -64,7 +64,8 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(extractps, imm8=0))] +#[cfg_attr(all(test, windows), assert_instr(mov, imm8=0))] +#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8=0))] pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { mem::transmute(a.extract(imm8 as u32 & 0b11)) } @@ -80,7 +81,8 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 { /// Extract an 32-bit integer from `a` selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrd, imm8=1))] +#[cfg_attr(all(test, windows), assert_instr(mov, imm8=1))] +#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8=1))] pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { a.extract((imm8 & 0b11) as u32) } @@ -89,7 +91,8 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { #[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(test, assert_instr(pextrq, imm8=1))] +#[cfg_attr(all(test, windows), assert_instr(mov, imm8=1))] +#[cfg_attr(all(test, not(windows)), 
assert_instr(pextrq, imm8=1))] pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { a.extract((imm8 & 0b1) as u32) } From 64f614602f418794c17472cb75694a6eaa603798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= Date: Wed, 18 Oct 2017 09:30:34 +0100 Subject: [PATCH 23/23] Remove useless Windows test --- src/x86/sse41.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index ab6abe34f8..a804ed2e9e 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -64,7 +64,7 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 { /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(all(test, windows), assert_instr(mov, imm8=0))] +// TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8=0))] pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 { mem::transmute(a.extract(imm8 as u32 & 0b11)) @@ -81,7 +81,7 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 { /// Extract an 32-bit integer from `a` selected with `imm8` #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(all(test, windows), assert_instr(mov, imm8=1))] +// TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8=1))] pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { a.extract((imm8 & 0b11) as u32) @@ -91,7 +91,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 { #[cfg(target_arch = "x86_64")] #[inline(always)] #[target_feature = "+sse4.1"] -#[cfg_attr(all(test, windows), assert_instr(mov, imm8=1))] +// TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8=1))] pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 { a.extract((imm8 & 0b1) as u32)