From 4350febabce15b893591d767194af624450c0962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 11:09:55 +0100
Subject: [PATCH 01/23] sse4.1: _mm_blendv_ps and _mm_blendv_pd

---
 src/x86/sse41.rs | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 2cd36f74c6..473c382ff4 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -15,6 +15,22 @@ pub unsafe fn _mm_blendv_epi8(
     pblendvb(a, b, mask)
 }
 
+/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(blendvpd))]
+pub unsafe fn _mm_blendv_pd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2 {
+    blendvpd(a, b, mask)
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(blendvps))]
+pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 {
+    blendvps(a, b, mask)
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -53,6 +69,10 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
 extern {
     #[link_name = "llvm.x86.sse41.pblendvb"]
     fn pblendvb(a: __m128i, b: __m128i, mask: __m128i) -> __m128i;
+    #[link_name = "llvm.x86.sse41.blendvpd"]
+    fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2;
+    #[link_name = "llvm.x86.sse41.blendvps"]
+    fn blendvps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4;
     #[link_name = "llvm.x86.sse41.dppd"]
     fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
     #[link_name = "llvm.x86.sse41.dpps"]
@@ -79,6 +99,26 @@ mod tests {
         assert_eq!(sse41::_mm_blendv_epi8(a, b, mask), e);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blendv_pd() {
+        let a = f64x2::splat(0.0);
+        let b = f64x2::splat(1.0);
+        let mask = ::std::mem::transmute(i64x2::new(0, -1));
+        let r = sse41::_mm_blendv_pd(a, b, mask);
+        let e = f64x2::new(0.0, 1.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blendv_ps() {
+        let a = f32x4::splat(0.0);
+        let b = f32x4::splat(1.0);
+        let mask = ::std::mem::transmute(i32x4::new(0,-1, 0, -1));
+        let r = sse41::_mm_blendv_ps(a, b, mask);
+        let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From 9e015df40c5c61432fc61476546017d8a205c818 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 11:17:00 +0100
Subject: [PATCH 02/23] sse4.1: _mm_blend_ps and _mm_blend_pd

- HACK warning: messing with the constify macros
- Selecting only one buffer gets optimized away and tests need to take this into account
---
 src/x86/macros.rs | 37 +++++++++++++++++++++++++++++++++++++
 src/x86/sse41.rs  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)

diff --git a/src/x86/macros.rs b/src/x86/macros.rs
index ebe1015181..9c0b9abf93 100644
--- a/src/x86/macros.rs
+++ b/src/x86/macros.rs
@@ -261,3 +261,40 @@ macro_rules! constify_imm8 {
         }
     }
 }
+
+
+macro_rules! constify_imm4 {
+    ($imm4:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match $imm4 & 0b1111 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            7 => $expand!(7),
+            8 => $expand!(8),
+            9 => $expand!(9),
+            10 => $expand!(10),
+            11 => $expand!(11),
+            12 => $expand!(12),
+            13 => $expand!(13),
+            14 => $expand!(14),
+            _ => $expand!(15),
+        }
+    }
+}
+
+macro_rules! constify_imm2 {
+    ($imm2:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match $imm2 & 0b11 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            _ => $expand!(3),
+        }
+    }
+}
diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 473c382ff4..1cca9a7a82 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -31,6 +31,28 @@ pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 {
     blendvps(a, b, mask)
 }
 
+/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm2`
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(blendpd, imm2=0b10))]
+pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 {
+    macro_rules! call {
+        ($imm2:expr) => { blendpd(a, b, $imm2) }
+    }
+    constify_imm2!(imm2, call)
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using mask `imm4`
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(blendps, imm4=0b0101))]
+pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
+    macro_rules! call {
+        ($imm4:expr) => { blendps(a, b, $imm4) }
+    }
+    constify_imm4!(imm4, call)
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -73,6 +95,10 @@ extern {
     fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2;
     #[link_name = "llvm.x86.sse41.blendvps"]
     fn blendvps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4;
+    #[link_name = "llvm.x86.sse41.blendpd"]
+    fn blendpd(a: f64x2, b: f64x2, imm2: u8) -> f64x2;
+    #[link_name = "llvm.x86.sse41.blendps"]
+    fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4;
     #[link_name = "llvm.x86.sse41.dppd"]
     fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
     #[link_name = "llvm.x86.sse41.dpps"]
@@ -119,6 +145,24 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blend_pd() {
+        let a = f64x2::splat(0.0);
+        let b = f64x2::splat(1.0);
+        let r = sse41::_mm_blend_pd(a, b, 0b10);
+        let e = f64x2::new(0.0, 1.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blend_ps() {
+        let a = f32x4::splat(0.0);
+        let b = f32x4::splat(1.0);
+        let r = sse41::_mm_blend_ps(a, b, 0b1010);
+        let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From ec25f71b2d0d1b30a0e3d2a66da75447127cfaa7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 11:49:06 +0100
Subject: [PATCH 03/23] sse4.1: _mm_blend_epi16

---
 src/x86/sse41.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 1cca9a7a82..98c4f5635b 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -15,6 +15,16 @@ pub unsafe fn _mm_blendv_epi8(
     pblendvb(a, b, mask)
 }
 
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pblendw, imm8=0xF0))]
+pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 {
+    macro_rules! call {
+        ($imm8:expr) => { pblendw(a, b, $imm8) }
+    }
+    constify_imm8!(imm8, call)
+}
+
 /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
@@ -99,6 +109,8 @@ extern {
     fn blendpd(a: f64x2, b: f64x2, imm2: u8) -> f64x2;
     #[link_name = "llvm.x86.sse41.blendps"]
     fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4;
+    #[link_name = "llvm.x86.sse41.pblendw"]
+    fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
     #[link_name = "llvm.x86.sse41.dppd"]
     fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
     #[link_name = "llvm.x86.sse41.dpps"]
@@ -163,6 +175,15 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_blend_epi16() {
+        let a = i16x8::splat(0);
+        let b = i16x8::splat(1);
+        let r = sse41::_mm_blend_epi16(a, b, 0b1010_1100);
+        let e = i16x8::new(0, 0, 1, 1, 0, 1, 0, 1);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From 3ea44a3b733451ed3b4355aa98b76961305e4002 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 14:12:33 +0100
Subject: [PATCH 04/23] sse4.1: _mm_extract_ps

---
 src/x86/sse41.rs | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 98c4f5635b..9effc28fc7 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -1,3 +1,6 @@
+
+use std::mem;
+
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
@@ -63,6 +66,14 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
     constify_imm4!(imm4, call)
 }
 
+/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(extractps, imm8=0))]
+pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
+    mem::transmute(a.extract((imm8 & 0b11) as u32))
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -119,6 +130,8 @@ extern {
 
 #[cfg(test)]
 mod tests {
+    use std::mem;
+
     use stdsimd_test::simd_test;
 
     use v128::*;
@@ -184,6 +197,17 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_ps() {
+        let a = f32x4::new(0.0, 1.0, 2.0, 3.0);
+
+        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1));
+        assert_eq!(r, 1.0);
+
+        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5));
+        assert_eq!(r, 1.0);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From 88f3992c692c08bb3031be02fa632cfefc05aee5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 15:28:03 +0100
Subject: [PATCH 05/23] sse4.1: _mm_extract_epi8

---
 src/x86/sse41.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 9effc28fc7..fb8c9786ad 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -74,6 +74,14 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
     mem::transmute(a.extract((imm8 & 0b11) as u32))
 }
 
+/// Extract an 8-bit integer from `a` selected with `imm8`
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pextrb, imm8=0))]
+pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 {
+    a.extract((imm8 & 0b111) as u32) as i32
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -208,6 +216,17 @@ mod tests {
         assert_eq!(r, 1.0);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_epi8() {
+        let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+        let r = sse41::_mm_extract_epi8(a, 1);
+        assert_eq!(r, 1);
+
+        let r = sse41::_mm_extract_epi8(a, 17);
+        assert_eq!(r, 1);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From 38981a8f8919c7aeb5fb70ec0ca1c59049b4b759 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 15:46:44 +0100
Subject: [PATCH 06/23] sse4.1: _mm_extract_epi32

---
 src/x86/sse41.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index fb8c9786ad..4f0b91787a 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -82,6 +82,14 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 {
     a.extract((imm8 & 0b111) as u32) as i32
 }
 
+/// Extract an 32-bit integer from `a` selected with `imm8`
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pextrd, imm8=1))]
+pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
+    a.extract((imm8 & 0b11) as u32) as i32
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -227,6 +235,17 @@ mod tests {
         assert_eq!(r, 1);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_epi32() {
+        let a = i32x4::new(0, 1, 2, 3);
+
+        let r = sse41::_mm_extract_epi32(a, 1);
+        assert_eq!(r, 1);
+
+        let r = sse41::_mm_extract_epi32(a, 5);
+        assert_eq!(r, 1);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From aa1f042c109ea2963bebc45748371f6030c8a538 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 15:57:14 +0100
Subject: [PATCH 07/23] sse4.1: _mm_extract_epi64

---
 src/x86/sse41.rs | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 4f0b91787a..4f5eb02e28 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -87,7 +87,15 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 {
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrd, imm8=1))]
 pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
-    a.extract((imm8 & 0b11) as u32) as i32
+    a.extract((imm8 & 0b11) as u32)
+}
+
+/// Extract an 64-bit integer from `a` selected with `imm8`
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pextrq, imm8=1))]
+pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
+    a.extract((imm8 & 0b1) as u32)
 }
 
 /// Returns the dot product of two f64x2 vectors.
@@ -246,6 +254,17 @@ mod tests {
         assert_eq!(r, 1);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_epi64() {
+        let a = i64x2::new(0, 1);
+
+        let r = sse41::_mm_extract_epi64(a, 1);
+        assert_eq!(r, 1);
+
+        let r = sse41::_mm_extract_epi64(a, 3);
+        assert_eq!(r, 1);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From 6c4607564e033ebf72c462e5a0878bfc56fcefc1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 18:54:29 +0100
Subject: [PATCH 08/23] sse4.1: _mm_insert_ps

---
 src/x86/sse41.rs | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 4f5eb02e28..76e635d8ce 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -98,6 +98,38 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
     a.extract((imm8 & 0b1) as u32)
 }
 
+/// Select a single value in `a` to store at some position in `b`, 
+/// Then zero elements according to `imm8`.
+/// 
+/// `imm8` specifies which bits from operand `a` will be copied, which bits in the 
+/// result they will be copied to, and which bits in the result will be
+/// cleared. The following assignments are made:
+///
+/// * Bits `[7:6]` specify the bits to copy from operand `a`:
+///     - `00`: Selects bits `[31:0]` from operand `a`.
+///     - `01`: Selects bits `[63:32]` from operand `a`.
+///     - `10`: Selects bits `[95:64]` from operand `a`.
+///     - `11`: Selects bits `[127:96]` from operand `a`.
+///
+/// * Bits `[5:4]` specify the bits in the result to which the selected bits
+/// from operand `a` are copied:
+///     - `00`: Copies the selected bits from `a` to result bits `[31:0]`.
+///     - `01`: Copies the selected bits from `a` to result bits `[63:32]`.
+///     - `10`: Copies the selected bits from `a` to result bits `[95:64]`.
+///     - `11`: Copies the selected bits from `a` to result bits `[127:96]`.
+///
+/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
+/// element is cleared.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(insertps, imm8=0b1010))]
+pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+        macro_rules! call {
+        ($imm8:expr) => { insertps(a, b, $imm8) }
+    }
+    constify_imm8!(imm8, call)
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -146,6 +178,8 @@ extern {
     fn blendps(a: f32x4, b: f32x4, imm4: u8) -> f32x4;
     #[link_name = "llvm.x86.sse41.pblendw"]
     fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
+    #[link_name = "llvm.x86.sse41.insertps"]
+    fn insertps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
     #[link_name = "llvm.x86.sse41.dppd"]
     fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
     #[link_name = "llvm.x86.sse41.dpps"]
@@ -265,6 +299,15 @@ mod tests {
         assert_eq!(r, 1);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_ps() {
+        let a = f32x4::splat(1.0);
+        let b = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let r = sse41::_mm_insert_ps(a, b, 0b11_00_1100);
+        let e = f32x4::new(4.0, 1.0, 0.0, 0.0);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From fd1506e695f6be251889c0e6338ad76bd32dc93e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 19:12:39 +0100
Subject: [PATCH 09/23] sse4.1: _mm_insert_epi8

---
 src/x86/sse41.rs | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 76e635d8ce..8ad26a5467 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -124,12 +124,20 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(insertps, imm8=0b1010))]
 pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
-        macro_rules! call {
+    macro_rules! call {
         ($imm8:expr) => { insertps(a, b, $imm8) }
     }
     constify_imm8!(imm8, call)
 }
 
+/// Return a copy of `a` with an 8-bit integer from `i` inserted at a location specified by `imm8`. 
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pinsrb, imm8=0))]
+pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 {
+    a.replace((imm8 & 0b111) as u32, i)
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -308,6 +316,19 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_epi8() {
+        let a = i8x16::splat(0);
+
+        let r = sse41::_mm_insert_epi8(a, 32, 1);
+        let e = i8x16::splat(0).replace(1, 32);
+        assert_eq!(r, e);
+
+        let r = sse41::_mm_insert_epi8(a, 32, 17);
+        let e = i8x16::splat(0).replace(1, 32);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From 137586103f5c9d4f2bff5af6036701c9127f2b39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 21:16:43 +0100
Subject: [PATCH 10/23] sse4.1: _mm_insert_epi32 and _mm_insert_epi64

---
 src/x86/sse41.rs | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 8ad26a5467..ec073d1563 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -79,7 +79,7 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrb, imm8=0))]
 pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 {
-    a.extract((imm8 & 0b111) as u32) as i32
+    a.extract((imm8 & 0b1111) as u32) as i32
 }
 
 /// Extract an 32-bit integer from `a` selected with `imm8`
@@ -130,12 +130,28 @@ pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
     constify_imm8!(imm8, call)
 }
 
-/// Return a copy of `a` with an 8-bit integer from `i` inserted at a location specified by `imm8`. 
+/// Return a copy of `a` with the 8-bit integer from `i` inserted at a location specified by `imm8`. 
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pinsrb, imm8=0))]
 pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 {
-    a.replace((imm8 & 0b111) as u32, i)
+    a.replace((imm8 & 0b1111) as u32, i)
+}
+
+/// Return a copy of `a` with the 32-bit integer from `i` inserted at a location specified by `imm8`. 
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pinsrd, imm8=0))]
+pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 {
+    a.replace((imm8 & 0b11) as u32, i)
+}
+
+/// Return a copy of `a` with the 64-bit integer from `i` inserted at a location specified by `imm8`. 
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pinsrq, imm8=0))]
+pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 {
+    a.replace((imm8 & 0b1) as u32, i)
 }
 
 /// Returns the dot product of two f64x2 vectors.
@@ -329,6 +345,32 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_epi32() {
+        let a = i32x4::splat(0);
+
+        let r = sse41::_mm_insert_epi32(a, 32, 1);
+        let e = i32x4::splat(0).replace(1, 32);
+        assert_eq!(r, e);
+
+        let r = sse41::_mm_insert_epi32(a, 32, 5);
+        let e = i32x4::splat(0).replace(1, 32);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_epi64() {
+        let a = i64x2::splat(0);
+
+        let r = sse41::_mm_insert_epi64(a, 32, 1);
+        let e = i64x2::splat(0).replace(1, 32);
+        assert_eq!(r, e);
+
+        let r = sse41::_mm_insert_epi64(a, 32, 3);
+        let e = i64x2::splat(0).replace(1, 32);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From e5dab3a11c0603713df744c6e10f6358904fa90c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 21:32:12 +0100
Subject: [PATCH 11/23] Formatting

---
 src/x86/sse41.rs | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index ec073d1563..d976739136 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -236,7 +236,7 @@ mod tests {
     unsafe fn _mm_blendv_pd() {
         let a = f64x2::splat(0.0);
         let b = f64x2::splat(1.0);
-        let mask = ::std::mem::transmute(i64x2::new(0, -1));
+        let mask = mem::transmute(i64x2::new(0, -1));
         let r = sse41::_mm_blendv_pd(a, b, mask);
         let e = f64x2::new(0.0, 1.0);
         assert_eq!(r, e);
@@ -246,7 +246,7 @@ mod tests {
     unsafe fn _mm_blendv_ps() {
         let a = f32x4::splat(0.0);
         let b = f32x4::splat(1.0);
-        let mask = ::std::mem::transmute(i32x4::new(0,-1, 0, -1));
+        let mask = mem::transmute(i32x4::new(0,-1, 0, -1));
         let r = sse41::_mm_blendv_ps(a, b, mask);
         let e = f32x4::new(0.0, 1.0, 0.0, 1.0);
         assert_eq!(r, e);
@@ -282,10 +282,8 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_extract_ps() {
         let a = f32x4::new(0.0, 1.0, 2.0, 3.0);
-
         let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1));
         assert_eq!(r, 1.0);
-
         let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5));
         assert_eq!(r, 1.0);
     }
@@ -293,10 +291,8 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_extract_epi8() {
         let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
         let r = sse41::_mm_extract_epi8(a, 1);
         assert_eq!(r, 1);
-
         let r = sse41::_mm_extract_epi8(a, 17);
         assert_eq!(r, 1);
     }
@@ -304,10 +300,8 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_extract_epi32() {
         let a = i32x4::new(0, 1, 2, 3);
-
         let r = sse41::_mm_extract_epi32(a, 1);
         assert_eq!(r, 1);
-
         let r = sse41::_mm_extract_epi32(a, 5);
         assert_eq!(r, 1);
     }
@@ -315,10 +309,8 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_extract_epi64() {
         let a = i64x2::new(0, 1);
-
         let r = sse41::_mm_extract_epi64(a, 1);
         assert_eq!(r, 1);
-
         let r = sse41::_mm_extract_epi64(a, 3);
         assert_eq!(r, 1);
     }
@@ -335,11 +327,9 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_insert_epi8() {
         let a = i8x16::splat(0);
-
         let r = sse41::_mm_insert_epi8(a, 32, 1);
         let e = i8x16::splat(0).replace(1, 32);
         assert_eq!(r, e);
-
         let r = sse41::_mm_insert_epi8(a, 32, 17);
         let e = i8x16::splat(0).replace(1, 32);
         assert_eq!(r, e);
@@ -348,11 +338,9 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_insert_epi32() {
         let a = i32x4::splat(0);
-
         let r = sse41::_mm_insert_epi32(a, 32, 1);
         let e = i32x4::splat(0).replace(1, 32);
         assert_eq!(r, e);
-
         let r = sse41::_mm_insert_epi32(a, 32, 5);
         let e = i32x4::splat(0).replace(1, 32);
         assert_eq!(r, e);
@@ -361,7 +349,6 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_insert_epi64() {
         let a = i64x2::splat(0);
-
         let r = sse41::_mm_insert_epi64(a, 32, 1);
         let e = i64x2::splat(0).replace(1, 32);
         assert_eq!(r, e);

From 08574d8f02395f417cab2a179cf8fae326fa1126 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 4 Oct 2017 22:18:35 +0100
Subject: [PATCH 12/23] sse4.1: _mm_max_epi8, _mm_max_epu16, _mm_max_epi32 and
 _mm_max_epu32

---
 src/x86/sse41.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index d976739136..1d3eec6c2c 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -154,6 +154,38 @@ pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 {
     a.replace((imm8 & 0b1) as u32, i)
 }
 
+/// Compare packed 8-bit integers in `a` and `b`,87 and return packed maximum values in dst. 
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pmaxsb, imm8=0))]
+pub unsafe fn _mm_max_epi8(a: i8x16, b: i8x16) -> i8x16 {
+    pmaxsb(a, b)
+}
+
+/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pmaxuw, imm8=0))]
+pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 {
+    pmaxuw(a, b)
+}
+
+// Compare packed 32-bit integers in `a` and `b`, and return packed maximum values.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pmaxsd, imm8=0))]
+pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 {
+    pmaxsd(a, b)
+}
+
+// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(pmaxud, imm8=0))]
+pub unsafe fn _mm_max_epu32(a: u32x4, b: u32x4) -> u32x4 {
+    pmaxud(a, b)
+}
+
 /// Returns the dot product of two f64x2 vectors.
 ///
 /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
@@ -204,6 +236,14 @@ extern {
     fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
     #[link_name = "llvm.x86.sse41.insertps"]
     fn insertps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
+    #[link_name = "llvm.x86.sse41.pmaxsb"]
+    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
+    #[link_name = "llvm.x86.sse41.pmaxuw"]
+    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse41.pmaxsd"]
+    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.sse41.pmaxud"]
+    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
     #[link_name = "llvm.x86.sse41.dppd"]
     fn dppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
     #[link_name = "llvm.x86.sse41.dpps"]
@@ -358,6 +398,42 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "avx"]
+    unsafe fn _mm_max_epi8() {
+        let a = i8x16::new(1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32);
+        let b = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
+        let r = sse41::_mm_max_epi8(a, b);
+        let e = i8x16::new(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_max_epu16() {
+        let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16);
+        let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15);
+        let r = sse41::_mm_max_epu16(a, b);
+        let e = u16x8::new(2, 4, 6, 8, 10, 12, 14, 16);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_max_epi32() {
+        let a = i32x4::new(1, 4, 5, 8);
+        let b = i32x4::new(2, 3, 6, 7);
+        let r = sse41::_mm_max_epi32(a, b);
+        let e = i32x4::new(2, 4, 6, 8);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_max_epu32() {
+        let a = u32x4::new(1, 4, 5, 8);
+        let b = u32x4::new(2, 3, 6, 7);
+        let r = sse41::_mm_max_epu32(a, b);
+        let e = u32x4::new(2, 4, 6, 8);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_dp_pd() {
         let a = f64x2::new(2.0, 3.0);

From 7fe034545386dfbdff891402afe2c9e9dc0797cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Fri, 6 Oct 2017 11:45:59 +0100
Subject: [PATCH 13/23] Fix wrong compiler flag

- avx -> sse4.1
---
 src/x86/sse41.rs | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 1d3eec6c2c..90e96aa05e 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -367,38 +367,34 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_insert_epi8() {
         let a = i8x16::splat(0);
-        let r = sse41::_mm_insert_epi8(a, 32, 1);
         let e = i8x16::splat(0).replace(1, 32);
+        let r = sse41::_mm_insert_epi8(a, 32, 1);
         assert_eq!(r, e);
         let r = sse41::_mm_insert_epi8(a, 32, 17);
-        let e = i8x16::splat(0).replace(1, 32);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
     unsafe fn _mm_insert_epi32() {
         let a = i32x4::splat(0);
-        let r = sse41::_mm_insert_epi32(a, 32, 1);
         let e = i32x4::splat(0).replace(1, 32);
+        let r = sse41::_mm_insert_epi32(a, 32, 1);
         assert_eq!(r, e);
         let r = sse41::_mm_insert_epi32(a, 32, 5);
-        let e = i32x4::splat(0).replace(1, 32);
         assert_eq!(r, e);
     }
 
     #[simd_test = "sse4.1"]
     unsafe fn _mm_insert_epi64() {
         let a = i64x2::splat(0);
-        let r = sse41::_mm_insert_epi64(a, 32, 1);
         let e = i64x2::splat(0).replace(1, 32);
+        let r = sse41::_mm_insert_epi64(a, 32, 1);
         assert_eq!(r, e);
-
         let r = sse41::_mm_insert_epi64(a, 32, 3);
-        let e = i64x2::splat(0).replace(1, 32);
         assert_eq!(r, e);
     }
 
-    #[simd_test = "avx"]
+    #[simd_test = "sse4.1"]
     unsafe fn _mm_max_epi8() {
         let a = i8x16::new(1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32);
         let b = i8x16::new(2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
@@ -407,7 +403,7 @@ mod tests {
         assert_eq!(r, e);
     }
 
-    #[simd_test = "avx"]
+    #[simd_test = "sse4.1"]
     unsafe fn _mm_max_epu16() {
         let a = u16x8::new(1, 4, 5, 8, 9, 12, 13, 16);
         let b = u16x8::new(2, 3, 6, 7, 10, 11, 14, 15);
@@ -416,7 +412,7 @@ mod tests {
         assert_eq!(r, e);
     }
 
-    #[simd_test = "avx"]
+    #[simd_test = "sse4.1"]
     unsafe fn _mm_max_epi32() {
         let a = i32x4::new(1, 4, 5, 8);
         let b = i32x4::new(2, 3, 6, 7);
@@ -425,7 +421,7 @@ mod tests {
         assert_eq!(r, e);
     }
 
-    #[simd_test = "avx"]
+    #[simd_test = "sse4.1"]
     unsafe fn _mm_max_epu32() {
         let a = u32x4::new(1, 4, 5, 8);
         let b = u32x4::new(2, 3, 6, 7);

From 60b115649c04d3ef5b08b57e120a667835719bff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Fri, 6 Oct 2017 14:56:34 +0100
Subject: [PATCH 14/23] Fix intrinsics that only work with x86-64

---
 src/x86/sse41.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 90e96aa05e..927459819b 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -91,6 +91,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
 }
 
 /// Extract an 64-bit integer from `a` selected with `imm8`
+#[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrq, imm8=1))]
@@ -147,6 +148,7 @@ pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 {
 }
 
 /// Return a copy of `a` with the 64-bit integer from `i` inserted at a location specified by `imm8`. 
+#[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pinsrq, imm8=0))]
@@ -346,6 +348,7 @@ mod tests {
         assert_eq!(r, 1);
     }
 
+    #[cfg(target_arch = "x86_64")]
     #[simd_test = "sse4.1"]
     unsafe fn _mm_extract_epi64() {
         let a = i64x2::new(0, 1);
@@ -384,6 +387,7 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[cfg(target_arch = "x86_64")]
     #[simd_test = "sse4.1"]
     unsafe fn _mm_insert_epi64() {
         let a = i64x2::splat(0);

From 2373618d628a793df64b95a0f21ad5cb75093223 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Mon, 9 Oct 2017 13:41:43 +0100
Subject: [PATCH 15/23] sse4.1: use appropriate types

---
 src/x86/sse41.rs | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 927459819b..15df849569 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -1,6 +1,3 @@
-
-use std::mem;
-
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
@@ -69,17 +66,17 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(extractps, imm8=0))]
-pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
-    mem::transmute(a.extract((imm8 & 0b11) as u32))
+#[cfg_attr(test, assert_instr(extractps, imm8=2))]
+pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> f32 {
+    a.extract(imm8 as u32 & 0b11)
 }
 
 /// Extract an 8-bit integer from `a` selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrb, imm8=0))]
-pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i32 {
-    a.extract((imm8 & 0b1111) as u32) as i32
+pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
+    a.extract((imm8 & 0b1111) as u32)
 }
 
 /// Extract an 32-bit integer from `a` selected with `imm8`
@@ -324,9 +321,9 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_extract_ps() {
         let a = f32x4::new(0.0, 1.0, 2.0, 3.0);
-        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1));
+        let r = sse41::_mm_extract_ps(a, 1);
         assert_eq!(r, 1.0);
-        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5));
+        let r = sse41::_mm_extract_ps(a, 5);
         assert_eq!(r, 1.0);
     }
 

From b80b3a69acad221930f217855ed697eb9315b740 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Tue, 10 Oct 2017 09:33:18 +0100
Subject: [PATCH 16/23] Revert '_mm_extract_ps' to return i32

---
 src/x86/sse41.rs | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 15df849569..b14680e15c 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -1,3 +1,6 @@
+
+use std::mem;
+
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
@@ -66,9 +69,9 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(extractps, imm8=2))]
-pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> f32 {
-    a.extract(imm8 as u32 & 0b11)
+#[cfg_attr(test, assert_instr(extractps, imm8=0))]
+pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
+    mem::transmute(a.extract(imm8 as u32 & 0b11))
 }
 
 /// Extract an 8-bit integer from `a` selected with `imm8`
@@ -321,9 +324,9 @@ mod tests {
     #[simd_test = "sse4.1"]
     unsafe fn _mm_extract_ps() {
         let a = f32x4::new(0.0, 1.0, 2.0, 3.0);
-        let r = sse41::_mm_extract_ps(a, 1);
+        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 1));
         assert_eq!(r, 1.0);
-        let r = sse41::_mm_extract_ps(a, 5);
+        let r: f32 = mem::transmute(sse41::_mm_extract_ps(a, 5));
         assert_eq!(r, 1.0);
     }
 

From bab5cd32c5debd56aa570e726fe9b04965a7173a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Tue, 10 Oct 2017 10:04:31 +0100
Subject: [PATCH 17/23] sse4.1: Use the v128 types for consistency

---
 src/x86/sse41.rs | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index b14680e15c..d10ad987c6 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -5,16 +5,11 @@ use std::mem;
 use stdsimd_test::assert_instr;
 
 use v128::*;
-use x86::__m128i;
 
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendvb))]
-pub unsafe fn _mm_blendv_epi8(
-    a: __m128i,
-    b: __m128i,
-    mask: __m128i,
-) -> __m128i {
+pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 {
     pblendvb(a, b, mask)
 }
 
@@ -225,7 +220,7 @@ pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
 #[allow(improper_ctypes)]
 extern {
     #[link_name = "llvm.x86.sse41.pblendvb"]
-    fn pblendvb(a: __m128i, b: __m128i, mask: __m128i) -> __m128i;
+    fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
     #[link_name = "llvm.x86.sse41.blendvpd"]
     fn blendvpd(a: f64x2, b: f64x2, mask: f64x2) -> f64x2;
     #[link_name = "llvm.x86.sse41.blendvps"]

From 9c473808d334acedd46060b32ceea116662bf6a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Tue, 17 Oct 2017 09:44:21 +0100
Subject: [PATCH 18/23] Try fix for windows

---
 src/x86/macros.rs | 10 +++++++++-
 src/x86/sse41.rs  | 36 ++++++++++++++++++++++++------------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/src/x86/macros.rs b/src/x86/macros.rs
index 538dcc0d9d..5a195feca7 100644
--- a/src/x86/macros.rs
+++ b/src/x86/macros.rs
@@ -338,4 +338,12 @@ macro_rules! constify_imm2 {
     }
 }
 
-
+macro_rules! constify_imm1 {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match $imm8 & 0b1 {
+            0 => $expand!(0),
+            _ => $expand!(1),
+        }
+    }
+}
diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index d10ad987c6..c69fdbd059 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -64,34 +64,46 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(extractps, imm8=0))]
-pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
-    mem::transmute(a.extract(imm8 as u32 & 0b11))
+#[cfg_attr(test, assert_instr(extractps, imm2=0))]
+pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 {
+    macro_rules! call {
+        ($imm2:expr) => { mem::transmute(a.extract($imm2)) }
+    }
+    constify_imm2!(imm2, call)
 }
 
 /// Extract an 8-bit integer from `a` selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(pextrb, imm8=0))]
-pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
-    a.extract((imm8 & 0b1111) as u32)
+#[cfg_attr(test, assert_instr(pextrb, imm4=0))]
+pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 {
+    macro_rules! call {
+        ($imm4:expr) => { a.extract($imm4) }
+    }
+    constify_imm4!(imm4, call)
 }
 
 /// Extract an 32-bit integer from `a` selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(pextrd, imm8=1))]
-pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
-    a.extract((imm8 & 0b11) as u32)
+#[cfg_attr(test, assert_instr(pextrd, imm2=1))]
+pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 {
+    macro_rules! call {
+        ($imm2:expr) => { a.extract($imm2) }
+    }
+    constify_imm2!(imm2, call)
 }
 
 /// Extract an 64-bit integer from `a` selected with `imm8`
 #[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(pextrq, imm8=1))]
-pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
-    a.extract((imm8 & 0b1) as u32)
+#[cfg_attr(test, assert_instr(pextrq, imm1=1))]
+pub unsafe fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 {
+    macro_rules! call {
+        ($imm1:expr) => { a.extract($imm1) }
+    }
+    constify_imm1!(imm1, call)
 }
 
 /// Select a single value in `a` to store at some position in `b`, 

From 12936e9976bc6b0e4e538d82f55f0ee2d87a7f25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Tue, 17 Oct 2017 14:49:29 +0100
Subject: [PATCH 19/23] Try "vectorcall" calling convention

---
 src/lib.rs       | 3 ++-
 src/x86/sse41.rs | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 9d87c0f1f3..b1e298167c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -111,9 +111,10 @@
 
 #![allow(dead_code)]
 #![allow(unused_features)]
+
 #![feature(
     const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
-    target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new
+    target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new, abi_vectorcall
 )]
 #![cfg_attr(test, feature(proc_macro, test))]
 
diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index c69fdbd059..89b2ac5710 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -65,7 +65,7 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(extractps, imm2=0))]
-pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 {
+pub unsafe extern "vectorcall" fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 {
     macro_rules! call {
         ($imm2:expr) => { mem::transmute(a.extract($imm2)) }
     }
@@ -76,7 +76,7 @@ pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrb, imm4=0))]
-pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 {
+pub unsafe extern "vectorcall" fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 {
     macro_rules! call {
         ($imm4:expr) => { a.extract($imm4) }
     }
@@ -87,7 +87,7 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrd, imm2=1))]
-pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 {
+pub unsafe extern "vectorcall" fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 {
     macro_rules! call {
         ($imm2:expr) => { a.extract($imm2) }
     }
@@ -99,7 +99,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrq, imm1=1))]
-pub unsafe fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 {
+pub unsafe extern "vectorcall" fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 {
     macro_rules! call {
         ($imm1:expr) => { a.extract($imm1) }
     }

From 11d745cec8f68f617691eadf477d41e00995d972 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Tue, 17 Oct 2017 14:58:47 +0100
Subject: [PATCH 20/23] Revert "Try "vectorcall" calling convention"

This reverts commit 12936e9976bc6b0e4e538d82f55f0ee2d87a7f25.
---
 src/lib.rs       | 3 +--
 src/x86/sse41.rs | 8 ++++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index b1e298167c..9d87c0f1f3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -111,10 +111,9 @@
 
 #![allow(dead_code)]
 #![allow(unused_features)]
-
 #![feature(
     const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
-    target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new, abi_vectorcall
+    target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new
 )]
 #![cfg_attr(test, feature(proc_macro, test))]
 
diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 89b2ac5710..c69fdbd059 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -65,7 +65,7 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(extractps, imm2=0))]
-pub unsafe extern "vectorcall" fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 {
+pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 {
     macro_rules! call {
         ($imm2:expr) => { mem::transmute(a.extract($imm2)) }
     }
@@ -76,7 +76,7 @@ pub unsafe extern "vectorcall" fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrb, imm4=0))]
-pub unsafe extern "vectorcall" fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 {
+pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 {
     macro_rules! call {
         ($imm4:expr) => { a.extract($imm4) }
     }
@@ -87,7 +87,7 @@ pub unsafe extern "vectorcall" fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrd, imm2=1))]
-pub unsafe extern "vectorcall" fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 {
+pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 {
     macro_rules! call {
         ($imm2:expr) => { a.extract($imm2) }
     }
@@ -99,7 +99,7 @@ pub unsafe extern "vectorcall" fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pextrq, imm1=1))]
-pub unsafe extern "vectorcall" fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 {
+pub unsafe fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 {
     macro_rules! call {
         ($imm1:expr) => { a.extract($imm1) }
     }

From adc7abcc1e6538c11b8585bc6bf7bea4fc8746a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Tue, 17 Oct 2017 14:58:52 +0100
Subject: [PATCH 21/23] Revert "Try fix for windows"

This reverts commit 9c473808d334acedd46060b32ceea116662bf6a3.
---
 src/x86/macros.rs | 10 +---------
 src/x86/sse41.rs  | 36 ++++++++++++------------------------
 2 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/src/x86/macros.rs b/src/x86/macros.rs
index 5a195feca7..538dcc0d9d 100644
--- a/src/x86/macros.rs
+++ b/src/x86/macros.rs
@@ -338,12 +338,4 @@ macro_rules! constify_imm2 {
     }
 }
 
-macro_rules! constify_imm1 {
-    ($imm8:expr, $expand:ident) => {
-        #[allow(overflowing_literals)]
-        match $imm8 & 0b1 {
-            0 => $expand!(0),
-            _ => $expand!(1),
-        }
-    }
-}
+
diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index c69fdbd059..d10ad987c6 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -64,46 +64,34 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(extractps, imm2=0))]
-pub unsafe fn _mm_extract_ps(a: f32x4, imm2: u8) -> i32 {
-    macro_rules! call {
-        ($imm2:expr) => { mem::transmute(a.extract($imm2)) }
-    }
-    constify_imm2!(imm2, call)
+#[cfg_attr(test, assert_instr(extractps, imm8=0))]
+pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
+    mem::transmute(a.extract(imm8 as u32 & 0b11))
 }
 
 /// Extract an 8-bit integer from `a` selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(pextrb, imm4=0))]
-pub unsafe fn _mm_extract_epi8(a: i8x16, imm4: u8) -> i8 {
-    macro_rules! call {
-        ($imm4:expr) => { a.extract($imm4) }
-    }
-    constify_imm4!(imm4, call)
+#[cfg_attr(test, assert_instr(pextrb, imm8=0))]
+pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
+    a.extract((imm8 & 0b1111) as u32)
 }
 
 /// Extract an 32-bit integer from `a` selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(pextrd, imm2=1))]
-pub unsafe fn _mm_extract_epi32(a: i32x4, imm2: u8) -> i32 {
-    macro_rules! call {
-        ($imm2:expr) => { a.extract($imm2) }
-    }
-    constify_imm2!(imm2, call)
+#[cfg_attr(test, assert_instr(pextrd, imm8=1))]
+pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
+    a.extract((imm8 & 0b11) as u32)
 }
 
 /// Extract an 64-bit integer from `a` selected with `imm8`
 #[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(pextrq, imm1=1))]
-pub unsafe fn _mm_extract_epi64(a: i64x2, imm1: u8) -> i64 {
-    macro_rules! call {
-        ($imm1:expr) => { a.extract($imm1) }
-    }
-    constify_imm1!(imm1, call)
+#[cfg_attr(test, assert_instr(pextrq, imm8=1))]
+pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
+    a.extract((imm8 & 0b1) as u32)
 }
 
 /// Select a single value in `a` to store at some position in `b`, 

From 5456fbfedb24358718fab53aad0fd0e71aaddc0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Tue, 17 Oct 2017 15:07:42 +0100
Subject: [PATCH 22/23] Change tests for windows

---
 src/x86/sse41.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index d10ad987c6..ab6abe34f8 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -64,7 +64,8 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(extractps, imm8=0))]
+#[cfg_attr(all(test, windows), assert_instr(mov, imm8=0))]
+#[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8=0))]
 pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
     mem::transmute(a.extract(imm8 as u32 & 0b11))
 }
@@ -80,7 +81,8 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
 /// Extract an 32-bit integer from `a` selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(pextrd, imm8=1))]
+#[cfg_attr(all(test, windows), assert_instr(mov, imm8=1))]
+#[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8=1))]
 pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
     a.extract((imm8 & 0b11) as u32)
 }
@@ -89,7 +91,8 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
 #[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(test, assert_instr(pextrq, imm8=1))]
+#[cfg_attr(all(test, windows), assert_instr(mov, imm8=1))]
+#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8=1))]
 pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
     a.extract((imm8 & 0b1) as u32)
 }

From 64f614602f418794c17472cb75694a6eaa603798 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Oliveira?= <p32blo@gmail.com>
Date: Wed, 18 Oct 2017 09:30:34 +0100
Subject: [PATCH 23/23] Remove useless Windows test

---
 src/x86/sse41.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index ab6abe34f8..a804ed2e9e 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -64,7 +64,7 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(all(test, windows), assert_instr(mov, imm8=0))]
+// TODO: Add an `assert_instr` check for Windows
 #[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8=0))]
 pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
     mem::transmute(a.extract(imm8 as u32 & 0b11))
@@ -81,7 +81,7 @@ pub unsafe fn _mm_extract_epi8(a: i8x16, imm8: u8) -> i8 {
 /// Extract an 32-bit integer from `a` selected with `imm8`
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(all(test, windows), assert_instr(mov, imm8=1))]
+// TODO: Add an `assert_instr` check for Windows
 #[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8=1))]
 pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
     a.extract((imm8 & 0b11) as u32)
@@ -91,7 +91,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: u8) -> i32 {
 #[cfg(target_arch = "x86_64")]
 #[inline(always)]
 #[target_feature = "+sse4.1"]
-#[cfg_attr(all(test, windows), assert_instr(mov, imm8=1))]
+// TODO: Add an `assert_instr` check for Windows
 #[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8=1))]
 pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
     a.extract((imm8 & 0b1) as u32)