From f3c6108709e1df1937f9b0c5727edf937df2190b Mon Sep 17 00:00:00 2001 From: gwenn Date: Thu, 5 Oct 2017 21:43:51 +0200 Subject: [PATCH 01/29] avx: _mm_permute_ps and sse: _mm_undefined_ps --- src/x86/avx.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/x86/sse.rs | 9 ++++++++ 2 files changed, 71 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 60a4aeea2e..40268891a7 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -7,6 +7,8 @@ use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8}; use v128::{f32x4, f64x2, i32x4, i64x2}; use v256::*; +use x86::sse::_mm_undefined_ps; + /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. #[inline(always)] @@ -707,6 +709,58 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { } } +/// Shuffle single-precision (32-bit) floating-point elements in `a` +/// using the control in `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] +pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, _mm_undefined_ps(), [ + $a, $b, $c, $d + ]) + } + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr) => { + match (imm8 >> 6) & 0b11 { + 0b00 => shuffle4!($a, $b, $c, 0), + 0b01 => shuffle4!($a, $b, $c, 1), + 0b10 => shuffle4!($a, $b, $c, 2), + _ => shuffle4!($a, $b, $c, 3), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match (imm8 >> 4) & 0b11 { + 0b00 => shuffle3!($a, $b, 0), + 0b01 => shuffle3!($a, $b, 1), + 0b10 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 3), + } + } + } + macro_rules! 
shuffle1 { + ($a:expr) => { + match (imm8 >> 2) & 0b11 { + 0b00 => shuffle2!($a, 0), + 0b01 => shuffle2!($a, 1), + 0b10 => shuffle2!($a, 2), + _ => shuffle2!($a, 3), + } + } + } + match (imm8 >> 0) & 0b11 { + 0b00 => shuffle1!(0), + 0b01 => shuffle1!(1), + 0b10 => shuffle1!(2), + _ => shuffle1!(3), + } +} + /// Return vector of type `f32x8` with undefined elements. #[inline(always)] #[target_feature = "+avx"] @@ -1333,4 +1387,12 @@ mod tests { let e = f32x8::new(5.0, 2.0, 3.0, 4.0, 50.0, 64.0, 9.0, 8.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm_permute_ps() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm_permute_ps(a, 0x1b); + let e = f32x4::new(5.0, 2.0, 3.0, 4.0); + assert_eq!(r, e); + } } diff --git a/src/x86/sse.rs b/src/x86/sse.rs index b56e2f756c..9beb7293a0 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -1,3 +1,5 @@ +use std::mem; + use simd_llvm::simd_shuffle4; use v128::*; use std::os::raw::c_void; @@ -595,6 +597,13 @@ pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) { pref!(strategy) } +/// Return vector of type __m128 with undefined elements. 
+#[inline(always)] +#[target_feature = "+sse"] +pub unsafe fn _mm_undefined_ps() -> f32x4 { + mem::uninitialized() +} + #[allow(improper_ctypes)] extern { #[link_name = "llvm.x86.sse.add.ss"] From da9e9a33d29f1a6a009323754f5a23d8655fcb87 Mon Sep 17 00:00:00 2001 From: gwenn Date: Thu, 5 Oct 2017 22:38:12 +0200 Subject: [PATCH 02/29] avx: _mm256_permutevar_pd, _mm_permutevar_pd --- src/x86/avx.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 40268891a7..1dc9405e26 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -761,6 +761,22 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 { } } +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub unsafe fn _mm256_permutevar_pd(a: f64x4, b: i64x4) -> f64x4 { + vpermilpd256(a, b) +} + +/// Shuffle double-precision (64-bit) floating-point elements in `a` +/// using the control in `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilpd))] +pub unsafe fn _mm_permutevar_pd(a: f64x2, b: i64x2) -> f64x2 { + vpermilpd(a, b) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -839,6 +855,10 @@ extern "C" { fn vpermilps256(a: f32x8, b: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.vpermilvar.ps"] fn vpermilps(a: f32x4, b: i32x4) -> f32x4; + #[link_name = "llvm.x86.avx.vpermilvar.pd.256"] + fn vpermilpd256(a: f64x4, b: i64x4) -> f64x4; + #[link_name = "llvm.x86.avx.vpermilvar.pd"] + fn vpermilpd(a: f64x2, b: i64x2) -> f64x2; } #[cfg(test)] @@ -1395,4 +1415,22 @@ mod tests { let e = f32x4::new(5.0, 2.0, 3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permutevar_pd() { + let a = f64x4::new(4.0, 3.0, 2.0, 5.0); + let b = i64x4::new(1, 2, 3, 4); + let r = avx::_mm256_permutevar_pd(a, b); + let e = f64x4::new(4.0, 3.0, 5.0, 2.0); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm_permutevar_pd() { + let a = f64x2::new(4.0, 3.0); + let b = i64x2::new(3, 0); + let r = avx::_mm_permutevar_pd(a, b); + let e = f64x2::new(3.0, 4.0); + assert_eq!(r, e); + } } From 7e1703b43e1aa3a4c26e2423bbd70fd113546846 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 17:39:54 +0200 Subject: [PATCH 03/29] avx: _mm256_permute_pd --- src/x86/avx.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 84b9f51528..dd2940f619 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -777,6 +777,48 @@ pub unsafe fn _mm_permutevar_pd(a: f64x2, b: i64x2) -> f64x2 { vpermilpd(a, b) } +/// Shuffle double-precision (64-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] +pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, _mm256_undefined_pd(), [$a, $b, $c, $d]); + } + } + macro_rules! 
shuffle3 { + ($a:expr, $b: expr, $c: expr) => { + match (imm8 >> 3) & 0x1 { + 0 => shuffle4!($a, $b, $c, 2), + _ => shuffle4!($a, $b, $c, 3), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match (imm8 >> 2) & 0x1 { + 0 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 3), + } + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match (imm8 >> 1) & 0x1 { + 0 => shuffle2!($a, 0), + _ => shuffle2!($a, 1), + } + } + } + match (imm8 >> 0) & 0x1 { + 0 => shuffle1!(0), + _ => shuffle1!(1), + } +} + /// Return vector of type `f32x8` with undefined elements. #[inline(always)] #[target_feature = "+avx"] @@ -1433,4 +1475,12 @@ mod tests { let e = f64x2::new(3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permute_pd() { + let a = f64x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm256_permute_pd(a, 5); + let e = f64x4::new(3.0, 4.0, 5.0, 2.0); + assert_eq!(r, e); + } } From 36a1012547569279e37c940381348fb4f925c822 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 17:45:24 +0200 Subject: [PATCH 04/29] avx: _mm256_shuffle_pd fixed --- src/x86/avx.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index dd2940f619..2016b046aa 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -76,7 +76,7 @@ pub unsafe fn _mm256_or_ps(a: f32x8, b: f32x8) -> f32x8 { /// lanes using the control in `imm8`. #[inline(always)] #[target_feature = "+avx"] -//#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x0))] // FIXME +#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))] pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 { let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle4 { From 1f122b259600c79f0143acdecf2ff050a9b99da2 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 17:59:28 +0200 Subject: [PATCH 05/29] avx: _mm_permute_pd, sse2: _mm_undefined_pd --- src/x86/avx.rs | 35 +++++++++++++++++++++++++++++++++++ src/x86/sse2.rs | 7 +++++++ 2 files changed, 42 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 2016b046aa..f10888ad50 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -8,6 +8,7 @@ use v128::{f32x4, f64x2, i32x4, i64x2}; use v256::*; use x86::sse::_mm_undefined_ps; +use x86::sse2::_mm_undefined_pd; /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. @@ -819,6 +820,32 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 { } } +/// Shuffle double-precision (64-bit) floating-point elements in `a` +/// using the control in `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] +pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + simd_shuffle2(a, _mm_undefined_pd(), [$a, $b]); + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match (imm8 >> 1) & 0x1 { + 0 => shuffle2!($a, 0), + _ => shuffle2!($a, 1), + } + } + } + match (imm8 >> 0) & 0x1 { + 0 => shuffle1!(0), + _ => shuffle1!(1), + } +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1483,4 +1510,12 @@ mod tests { let e = f64x4::new(3.0, 4.0, 5.0, 2.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm_permute_pd() { + let a = f64x2::new(4.0, 3.0); + let r = avx::_mm_permute_pd(a, 1); + let e = f64x2::new(3.0, 4.0); + assert_eq!(r, e); + } } diff --git a/src/x86/sse2.rs b/src/x86/sse2.rs index a88d514a7f..201fed4385 100644 --- a/src/x86/sse2.rs +++ b/src/x86/sse2.rs @@ -1827,6 +1827,13 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> f64x2 { f64x2::new(d, d) } +/// Return vector of type __m128d with undefined elements. +#[inline(always)] +#[target_feature = "+sse2"] +pub unsafe fn _mm_undefined_pd() -> f64x2 { + f64x2::splat(mem::uninitialized()) +} + #[allow(improper_ctypes)] extern { #[link_name = "llvm.x86.sse2.pause"] From ce7f2603283bc5a5bd4765b05ad39384c907b297 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 18:48:06 +0200 Subject: [PATCH 06/29] avx: _mm256_permute2f128_ps --- src/x86/avx.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index f10888ad50..193ca10a5a 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -846,6 +846,18 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 { } } +/// Shuffle 256-bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))] +pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 { + macro_rules! call { + ($imm8:expr) => { vperm2f128ps256(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -928,6 +940,8 @@ extern "C" { fn vpermilpd256(a: f64x4, b: i64x4) -> f64x4; #[link_name = "llvm.x86.avx.vpermilvar.pd"] fn vpermilpd(a: f64x2, b: i64x2) -> f64x2; + #[link_name = "llvm.x86.avx.vperm2f128.ps.256"] + fn vperm2f128ps256(a: f32x8, b: f32x8, imm8: i8) -> f32x8; } #[cfg(test)] @@ -1518,4 +1532,13 @@ mod tests { let e = f64x2::new(3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permute2f128_ps() { + let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0); + let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_permute2f128_ps(a, b, 0x13); + let e = f32x8::new(5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0); + assert_eq!(r, e); + } } From 9a59843921cd8d79bfc0255b874ef6ec3602c9de Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 18:58:35 +0200 Subject: [PATCH 07/29] avx: _mm256_permute2f128_pd --- src/x86/avx.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 193ca10a5a..a201e21142 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -858,6 +858,18 @@ pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 { constify_imm8!(imm8, call) } +/// Shuffle 256-bits (composed of 4 packed double-precision (64-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] +pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { + macro_rules! call { + ($imm8:expr) => { vperm2f128pd256(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -942,6 +954,8 @@ extern "C" { fn vpermilpd(a: f64x2, b: i64x2) -> f64x2; #[link_name = "llvm.x86.avx.vperm2f128.ps.256"] fn vperm2f128ps256(a: f32x8, b: f32x8, imm8: i8) -> f32x8; + #[link_name = "llvm.x86.avx.vperm2f128.pd.256"] + fn vperm2f128pd256(a: f64x4, b: f64x4, imm8: i8) -> f64x4; } #[cfg(test)] @@ -1541,4 +1555,13 @@ mod tests { let e = f32x8::new(5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permute2f128_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_permute2f128_pd(a, b, 0x31); + let e = f64x4::new(3.0, 4.0, 7.0, 8.0); + assert_eq!(r, e); + } } From 567dd6d15b9d322af0084b5ae1e397f58a3fd628 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 7 Oct 2017 19:09:02 +0200 Subject: [PATCH 08/29] avx: _mm256_permute2f128_si256 --- src/x86/avx.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index a201e21142..e8bfece220 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -870,6 +870,18 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { constify_imm8!(imm8, call) } +/// Shuffle 256-bits (composed of integer data) selected by `imm8` +/// from `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] +pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { + macro_rules! call { + ($imm8:expr) => { vperm2f128si256(a, b, $imm8) } + } + constify_imm8!(imm8, call) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -956,6 +968,8 @@ extern "C" { fn vperm2f128ps256(a: f32x8, b: f32x8, imm8: i8) -> f32x8; #[link_name = "llvm.x86.avx.vperm2f128.pd.256"] fn vperm2f128pd256(a: f64x4, b: f64x4, imm8: i8) -> f64x4; + #[link_name = "llvm.x86.avx.vperm2f128.si.256"] + fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; } #[cfg(test)] @@ -1564,4 +1578,13 @@ mod tests { let e = f64x4::new(3.0, 4.0, 7.0, 8.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_permute2f128_si256() { + let a = i32x8::new(1, 2, 3, 4, 1, 2, 3, 4); + let b = i32x8::new(5, 6, 7, 8, 5, 6, 7, 8); + let r = avx::_mm256_permute2f128_si256(a, b, 0x20); + let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq!(r, e); + } } From 7702384047469d67214b7c6d2d8ad1e8bcb7f55c Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 08:41:17 +0200 Subject: [PATCH 09/29] avx: _mm256_broadcast_ss --- src/x86/avx.rs | 16 ++++++++++++++++ src/x86/sse.rs | 2 -- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index e8bfece220..78590b3175 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -882,6 +882,15 @@ pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { constify_imm8!(imm8, call) } +/// Broadcast a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub unsafe fn _mm256_broadcast_ss(f: &f32) -> f32x8 { + f32x8::splat(*f) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1587,4 +1596,11 @@ mod tests { let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_broadcast_ss() { + let r = avx::_mm256_broadcast_ss(&3.0); + let e = f32x8::splat(3.0); + assert_eq!(r, e); + } } diff --git a/src/x86/sse.rs b/src/x86/sse.rs index b19b2150b2..22a6dd75bb 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -1,5 +1,3 @@ -use std::mem; - use simd_llvm::simd_shuffle4; use v128::*; use v64::f32x2; From 8f04804fea3fc70f0c3fb4fcba41d41fc0ad1dbd Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 08:46:02 +0200 Subject: [PATCH 10/29] avx: _mm_broadcast_ss --- src/x86/avx.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 78590b3175..c331c32f8b 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -891,6 +891,15 @@ pub unsafe fn _mm256_broadcast_ss(f: &f32) -> f32x8 { f32x8::splat(*f) } +/// Broadcast a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub unsafe fn _mm_broadcast_ss(f: &f32) -> f32x4 { + f32x4::splat(*f) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1603,4 +1612,11 @@ mod tests { let e = f32x8::splat(3.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm_broadcast_ss() { + let r = avx::_mm_broadcast_ss(&3.0); + let e = f32x4::splat(3.0); + assert_eq!(r, e); + } } From 6634846716c25335ac289365462d7b9195aefed7 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 08:50:05 +0200 Subject: [PATCH 11/29] avx: _mm256_broadcast_sd --- src/x86/avx.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index c331c32f8b..abf9b4a328 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -900,6 +900,15 @@ pub unsafe fn _mm_broadcast_ss(f: &f32) -> f32x4 { f32x4::splat(*f) } +/// Broadcast a double-precision (64-bit) floating-point element from memory +/// to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub unsafe fn _mm256_broadcast_sd(f: &f64) -> f64x4 { + f64x4::splat(*f) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1618,5 +1627,12 @@ mod tests { let r = avx::_mm_broadcast_ss(&3.0); let e = f32x4::splat(3.0); assert_eq!(r, e); + + #[simd_test = "avx"] + unsafe fn _mm256_broadcast_sd() { + let r = avx::_mm256_broadcast_sd(&3.0); + let e = f64x4::splat(3.0); + assert_eq!(r, e); + } } } From 0220b7e70819ab5f1d4338b6a10aa65c2ede09f0 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 09:17:55 +0200 Subject: [PATCH 12/29] avx: _mm256_broadcast_ps --- src/x86/avx.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index abf9b4a328..92544d18f9 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -909,6 +909,15 @@ pub unsafe fn _mm256_broadcast_sd(f: &f64) -> f64x4 { f64x4::splat(*f) } +/// Broadcast 128 bits from memory (composed of 4 packed single-precision +/// (32-bit) floating-point elements) to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastf128))] +pub unsafe fn _mm256_broadcast_ps(a: &f32x4) -> f32x8 { + vbroadcastf128ps256(a) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -997,6 +1006,8 @@ extern "C" { fn vperm2f128pd256(a: f64x4, b: f64x4, imm8: i8) -> f64x4; #[link_name = "llvm.x86.avx.vperm2f128.si.256"] fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; + #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"] + fn vbroadcastf128ps256(a: &f32x4) -> f32x8; } #[cfg(test)] @@ -1627,6 +1638,7 @@ mod tests { let r = avx::_mm_broadcast_ss(&3.0); let e = f32x4::splat(3.0); assert_eq!(r, e); + } #[simd_test = "avx"] unsafe fn _mm256_broadcast_sd() { @@ -1634,5 +1646,12 @@ mod tests { let e = f64x4::splat(3.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_broadcast_ps() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let r = avx::_mm256_broadcast_ps(&a); + let e = f32x8::new(4.0, 3.0, 2.0, 5.0, 4.0, 3.0, 2.0, 5.0); + assert_eq!(r, e); } } From ea4dcc9f115bb285f8b21b1ed944d31c515e8ac4 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 09:24:32 +0200 Subject: [PATCH 13/29] avx: _mm256_broadcast_pd --- src/x86/avx.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 92544d18f9..b4d8ba3f82 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -918,6 +918,15 @@ pub unsafe fn _mm256_broadcast_ps(a: &f32x4) -> f32x8 { vbroadcastf128ps256(a) } +/// Broadcast 128 bits from memory (composed of 2 packed double-precision +/// (64-bit) floating-point elements) to all elements of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vbroadcastf128))] +pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 { + vbroadcastf128pd256(a) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1008,6 +1017,8 @@ extern "C" { fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"] fn vbroadcastf128ps256(a: &f32x4) -> f32x8; + #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"] + fn vbroadcastf128pd256(a: &f64x2) -> f64x4; } #[cfg(test)] @@ -1654,4 +1665,12 @@ mod tests { let e = f32x8::new(4.0, 3.0, 2.0, 5.0, 4.0, 3.0, 2.0, 5.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_broadcast_pd() { + let a = f64x2::new(4.0, 3.0); + let r = avx::_mm256_broadcast_pd(&a); + let e = f64x4::new(4.0, 3.0, 4.0, 3.0); + assert_eq!(r, e); + } } From ab44d454e941f82b37c309e413cf9ddf549d3b40 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 10:53:33 +0200 Subject: [PATCH 14/29] avx: _mm_cmp_pd --- src/x86/avx.rs | 89 +++++++++++++++++++++++++++++++++++++++++++++++ src/x86/macros.rs | 40 +++++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index b4d8ba3f82..e067ce5fda 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -487,6 +487,84 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 { mem::transmute(a ^ b) } +// Equal (ordered, non-signaling) +pub const _CMP_EQ_OQ: i8 = 0x00; +// Less-than (ordered, signaling) +pub const _CMP_LT_OS: i8 = 0x01; +// Less-than-or-equal (ordered, signaling) +pub const _CMP_LE_OS: i8 = 0x02; +// Unordered (non-signaling) +pub const _CMP_UNORD_Q: i8 = 0x03; +// Not-equal (unordered, non-signaling) +pub const _CMP_NEQ_UQ: i8 = 0x04; +// Not-less-than (unordered, signaling) +pub const _CMP_NLT_US: i8 = 0x05; +// Not-less-than-or-equal (unordered, signaling) +pub const _CMP_NLE_US: i8 = 0x06; +// Ordered (non-signaling) +pub const _CMP_ORD_Q: i8 = 0x07; +// Equal (unordered, non-signaling) +pub const _CMP_EQ_UQ: i8 = 0x08; +// Not-greater-than-or-equal (unordered, signaling) +pub const _CMP_NGE_US: i8 = 0x09; +// Not-greater-than (unordered, signaling) +pub const 
_CMP_NGT_US: i8 = 0x0a; +// False (ordered, non-signaling) +pub const _CMP_FALSE_OQ: i8 = 0x0b; +// Not-equal (ordered, non-signaling) +pub const _CMP_NEQ_OQ: i8 = 0x0c; +// Greater-than-or-equal (ordered, signaling) +pub const _CMP_GE_OS: i8 = 0x0d; +// Greater-than (ordered, signaling) +pub const _CMP_GT_OS: i8 = 0x0e; +// True (unordered, non-signaling) +pub const _CMP_TRUE_UQ: i8 = 0x0f; +// Equal (ordered, signaling) +pub const _CMP_EQ_OS: i8 = 0x10; +// Less-than (ordered, non-signaling) +pub const _CMP_LT_OQ: i8 = 0x11; +// Less-than-or-equal (ordered, non-signaling) +pub const _CMP_LE_OQ: i8 = 0x12; +// Unordered (signaling) +pub const _CMP_UNORD_S: i8 = 0x13; +// Not-equal (unordered, signaling) +pub const _CMP_NEQ_US: i8 = 0x14; +// Not-less-than (unordered, non-signaling) +pub const _CMP_NLT_UQ: i8 = 0x15; +// Not-less-than-or-equal (unordered, non-signaling) +pub const _CMP_NLE_UQ: i8 = 0x16; +// Ordered (signaling) +pub const _CMP_ORD_S: i8 = 0x17; +// Equal (unordered, signaling) +pub const _CMP_EQ_US: i8 = 0x18; +// Not-greater-than-or-equal (unordered, non-signaling) +pub const _CMP_NGE_UQ: i8 = 0x19; +// Not-greater-than (unordered, non-signaling) +pub const _CMP_NGT_UQ: i8 = 0x1a; +// False (ordered, signaling) +pub const _CMP_FALSE_OS: i8 = 0x1b; +// Not-equal (ordered, signaling) +pub const _CMP_NEQ_OS: i8 = 0x1c; +// Greater-than-or-equal (ordered, non-signaling) +pub const _CMP_GE_OQ: i8 = 0x1d; +// Greater-than (ordered, non-signaling) +pub const _CMP_GT_OQ: i8 = 0x1e; +// True (unordered, signaling) +pub const _CMP_TRUE_US: i8 = 0x1f; + +/// Compare packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd +pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i8) -> f64x2 { + macro_rules! 
call { + ($imm8:expr) => { vcmppd(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. #[inline(always)] @@ -985,6 +1063,8 @@ extern "C" { fn vhsubpd(a: f64x4, b: f64x4) -> f64x4; #[link_name = "llvm.x86.avx.hsub.ps.256"] fn vhsubps(a: f32x8, b: f32x8) -> f32x8; + #[link_name = "llvm.x86.sse2.cmp.pd"] + fn vcmppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1410,6 +1490,15 @@ mod tests { assert_eq!(r, a); } + #[simd_test = "avx"] + unsafe fn _mm_cmp_pd() { + let a = f64x2::new(4.0, 9.0); + let b = f64x2::new(4.0, 3.0); + let r = avx::_mm_cmp_pd(a, b, avx::_CMP_GE_OS); + assert!(r.extract(0).is_nan()); + assert!(r.extract(1).is_nan()); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); diff --git a/src/x86/macros.rs b/src/x86/macros.rs index ebe1015181..8d65435db0 100644 --- a/src/x86/macros.rs +++ b/src/x86/macros.rs @@ -261,3 +261,43 @@ macro_rules! constify_imm8 { } } } + +macro_rules! 
constify_imm6 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match $imm8 & 0b1_1111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + 7 => $expand!(7), + 8 => $expand!(8), + 9 => $expand!(9), + 10 => $expand!(10), + 11 => $expand!(11), + 12 => $expand!(12), + 13 => $expand!(13), + 14 => $expand!(14), + 15 => $expand!(15), + 16 => $expand!(16), + 17 => $expand!(17), + 18 => $expand!(18), + 19 => $expand!(19), + 20 => $expand!(20), + 21 => $expand!(21), + 22 => $expand!(22), + 23 => $expand!(23), + 24 => $expand!(24), + 25 => $expand!(25), + 26 => $expand!(26), + 27 => $expand!(27), + 28 => $expand!(28), + 29 => $expand!(29), + 30 => $expand!(30), + _ => $expand!(31), + } + } +} From 13cb6db2412238bf6db1a82d6c262f80c85d2f61 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:03:27 +0200 Subject: [PATCH 15/29] avx: _mm256_cmp_pd --- src/x86/avx.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index e067ce5fda..ef5e4eb863 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -565,6 +565,19 @@ pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i8) -> f64x2 { constify_imm6!(imm8, call) } +/// Compare packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd +pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { + macro_rules! call { + ($imm8:expr) => { vcmppd256(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. 
#[inline(always)] @@ -1065,6 +1078,8 @@ extern "C" { fn vhsubps(a: f32x8, b: f32x8) -> f32x8; #[link_name = "llvm.x86.sse2.cmp.pd"] fn vcmppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; + #[link_name = "llvm.x86.avx.cmp.pd.256"] + fn vcmppd256(a: f64x4, b: f64x4, imm8: u8) -> f64x4; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1499,6 +1514,15 @@ mod tests { assert!(r.extract(1).is_nan()); } + #[simd_test = "avx"] + unsafe fn _mm256_cmp_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_cmp_pd(a, b, avx::_CMP_GE_OS); + let e = f64x4::splat(0.0); + assert_eq!(r, e); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From e130cca45fb6f9eb3647e73aad8b08df3fadbe54 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:14:39 +0200 Subject: [PATCH 16/29] avx: _mm_cmp_ps --- src/x86/avx.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index ef5e4eb863..3251cede69 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -578,6 +578,19 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { constify_imm6!(imm8, call) } +/// Compare packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps +pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: i8) -> f32x4 { + macro_rules! call { + ($imm8:expr) => { vcmpps(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. 
#[inline(always)] @@ -1080,6 +1093,8 @@ extern "C" { fn vcmppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.avx.cmp.pd.256"] fn vcmppd256(a: f64x4, b: f64x4, imm8: u8) -> f64x4; + #[link_name = "llvm.x86.sse.cmp.ps"] + fn vcmpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1523,6 +1538,17 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "avx"] + unsafe fn _mm_cmp_ps() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let b = f32x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm_cmp_ps(a, b, avx::_CMP_GE_OS); + assert!(r.extract(0).is_nan()); + assert_eq!(r.extract(1), 0.0); + assert_eq!(r.extract(2), 0.0); + assert_eq!(r.extract(3), 0.0); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From 01a69d29beb3b1f7595875e7bb04785c60538cfe Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:22:03 +0200 Subject: [PATCH 17/29] avx: _mm256_cmp_ps --- src/x86/avx.rs | 94 +++++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 35 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 3251cede69..f1ee2ec44d 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -488,69 +488,69 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 { } // Equal (ordered, non-signaling) -pub const _CMP_EQ_OQ: i8 = 0x00; +pub const _CMP_EQ_OQ: u8 = 0x00; // Less-than (ordered, signaling) -pub const _CMP_LT_OS: i8 = 0x01; +pub const _CMP_LT_OS: u8 = 0x01; // Less-than-or-equal (ordered, signaling) -pub const _CMP_LE_OS: i8 = 0x02; +pub const _CMP_LE_OS: u8 = 0x02; // Unordered (non-signaling) -pub const _CMP_UNORD_Q: i8 = 0x03; +pub const _CMP_UNORD_Q: u8 = 0x03; // Not-equal (unordered, non-signaling) -pub const _CMP_NEQ_UQ: i8 = 0x04; +pub const _CMP_NEQ_UQ: u8 = 0x04; // Not-less-than (unordered, signaling) -pub const _CMP_NLT_US: i8 = 0x05; +pub const _CMP_NLT_US: u8 
= 0x05; // Not-less-than-or-equal (unordered, signaling) -pub const _CMP_NLE_US: i8 = 0x06; +pub const _CMP_NLE_US: u8 = 0x06; // Ordered (non-signaling) -pub const _CMP_ORD_Q: i8 = 0x07; +pub const _CMP_ORD_Q: u8 = 0x07; // Equal (unordered, non-signaling) -pub const _CMP_EQ_UQ: i8 = 0x08; +pub const _CMP_EQ_UQ: u8 = 0x08; // Not-greater-than-or-equal (unordered, signaling) -pub const _CMP_NGE_US: i8 = 0x09; +pub const _CMP_NGE_US: u8 = 0x09; // Not-greater-than (unordered, signaling) -pub const _CMP_NGT_US: i8 = 0x0a; +pub const _CMP_NGT_US: u8 = 0x0a; // False (ordered, non-signaling) -pub const _CMP_FALSE_OQ: i8 = 0x0b; +pub const _CMP_FALSE_OQ: u8 = 0x0b; // Not-equal (ordered, non-signaling) -pub const _CMP_NEQ_OQ: i8 = 0x0c; +pub const _CMP_NEQ_OQ: u8 = 0x0c; // Greater-than-or-equal (ordered, signaling) -pub const _CMP_GE_OS: i8 = 0x0d; +pub const _CMP_GE_OS: u8 = 0x0d; // Greater-than (ordered, signaling) -pub const _CMP_GT_OS: i8 = 0x0e; +pub const _CMP_GT_OS: u8 = 0x0e; // True (unordered, non-signaling) -pub const _CMP_TRUE_UQ: i8 = 0x0f; +pub const _CMP_TRUE_UQ: u8 = 0x0f; // Equal (ordered, signaling) -pub const _CMP_EQ_OS: i8 = 0x10; +pub const _CMP_EQ_OS: u8 = 0x10; // Less-than (ordered, non-signaling) -pub const _CMP_LT_OQ: i8 = 0x11; +pub const _CMP_LT_OQ: u8 = 0x11; // Less-than-or-equal (ordered, non-signaling) -pub const _CMP_LE_OQ: i8 = 0x12; +pub const _CMP_LE_OQ: u8 = 0x12; // Unordered (signaling) -pub const _CMP_UNORD_S: i8 = 0x13; +pub const _CMP_UNORD_S: u8 = 0x13; // Not-equal (unordered, signaling) -pub const _CMP_NEQ_US: i8 = 0x14; +pub const _CMP_NEQ_US: u8 = 0x14; // Not-less-than (unordered, non-signaling) -pub const _CMP_NLT_UQ: i8 = 0x15; +pub const _CMP_NLT_UQ: u8 = 0x15; // Not-less-than-or-equal (unordered, non-signaling) -pub const _CMP_NLE_UQ: i8 = 0x16; +pub const _CMP_NLE_UQ: u8 = 0x16; // Ordered (signaling) -pub const _CMP_ORD_S: i8 = 0x17; +pub const _CMP_ORD_S: u8 = 0x17; // Equal (unordered, signaling) -pub const 
_CMP_EQ_US: i8 = 0x18; +pub const _CMP_EQ_US: u8 = 0x18; // Not-greater-than-or-equal (unordered, non-signaling) -pub const _CMP_NGE_UQ: i8 = 0x19; +pub const _CMP_NGE_UQ: u8 = 0x19; // Not-greater-than (unordered, non-signaling) -pub const _CMP_NGT_UQ: i8 = 0x1a; +pub const _CMP_NGT_UQ: u8 = 0x1a; // False (ordered, signaling) -pub const _CMP_FALSE_OS: i8 = 0x1b; +pub const _CMP_FALSE_OS: u8 = 0x1b; // Not-equal (ordered, signaling) -pub const _CMP_NEQ_OS: i8 = 0x1c; +pub const _CMP_NEQ_OS: u8 = 0x1c; // Greater-than-or-equal (ordered, non-signaling) -pub const _CMP_GE_OQ: i8 = 0x1d; +pub const _CMP_GE_OQ: u8 = 0x1d; // Greater-than (ordered, non-signaling) -pub const _CMP_GT_OQ: i8 = 0x1e; +pub const _CMP_GT_OQ: u8 = 0x1e; // True (unordered, signaling) -pub const _CMP_TRUE_US: i8 = 0x1f; +pub const _CMP_TRUE_US: u8 = 0x1f; /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand @@ -558,7 +558,7 @@ pub const _CMP_TRUE_US: i8 = 0x1f; #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd -pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i8) -> f64x2 { +pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { macro_rules! call { ($imm8:expr) => { vcmppd(a, b, $imm8) } } @@ -571,7 +571,7 @@ pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i8) -> f64x2 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd -pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { +pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 { macro_rules! 
call { ($imm8:expr) => { vcmppd256(a, b, $imm8) } } @@ -584,13 +584,26 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps -pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: i8) -> f32x4 { +pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { macro_rules! call { ($imm8:expr) => { vcmpps(a, b, $imm8) } } constify_imm6!(imm8, call) } +/// Compare packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps +pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { + macro_rules! call { + ($imm8:expr) => { vcmpps256(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. 
#[inline(always)] @@ -1095,6 +1108,8 @@ extern "C" { fn vcmppd256(a: f64x4, b: f64x4, imm8: u8) -> f64x4; #[link_name = "llvm.x86.sse.cmp.ps"] fn vcmpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; + #[link_name = "llvm.x86.avx.cmp.ps.256"] + fn vcmpps256(a: f32x8, b: f32x8, imm8: u8) -> f32x8; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1549,6 +1564,15 @@ mod tests { assert_eq!(r.extract(3), 0.0); } + #[simd_test = "avx"] + unsafe fn _mm256_cmp_ps() { + let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0); + let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_cmp_ps(a, b, avx::_CMP_GE_OS); + let e = f32x8::splat(0.0); + assert_eq!(r, e); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From ebb2cc27af5b368bbcd8457b0e19489b2028dde8 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:33:37 +0200 Subject: [PATCH 18/29] avx: _mm_cmp_sd --- src/x86/avx.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index f1ee2ec44d..0a18bd03e9 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -604,6 +604,21 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { constify_imm6!(imm8, call) } + +/// Compare the lower double-precision (64-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `imm8`, +/// store the result in the returned vector, and copy the upper element +/// from `a` to the upper element of the returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd +pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { + macro_rules! 
call { + ($imm8:expr) => { vcmpsd(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. #[inline(always)] @@ -1110,6 +1125,8 @@ extern "C" { fn vcmpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4; #[link_name = "llvm.x86.avx.cmp.ps.256"] fn vcmpps256(a: f32x8, b: f32x8, imm8: u8) -> f32x8; + #[link_name = "llvm.x86.sse2.cmp.sd"] + fn vcmpsd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1573,6 +1590,15 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "avx"] + unsafe fn _mm_cmp_sd() { + let a = f64x2::new(4.0, 9.0); + let b = f64x2::new(4.0, 3.0); + let r = avx::_mm_cmp_sd(a, b, avx::_CMP_GE_OS); + assert!(r.extract(0).is_nan()); + assert_eq!(r.extract(1), 9.0); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From 4ae314bb699d6bf34e9f0dc3496311134d89a8c0 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 11:46:58 +0200 Subject: [PATCH 19/29] avx: _mm_cmp_ss --- src/x86/avx.rs | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 0a18bd03e9..5cf99bbe2d 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -604,11 +604,10 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { constify_imm6!(imm8, call) } - /// Compare the lower double-precision (64-bit) floating-point element in /// `a` and `b` based on the comparison operand specified by `imm8`, -/// store the result in the returned vector, and copy the upper element -/// from `a` to the upper element of the returned vector. +/// store the result in the lower element of returned vector, +/// and copy the upper element from `a` to the upper element of returned vector. 
#[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd @@ -619,6 +618,21 @@ pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { constify_imm6!(imm8, call) } +/// Compare the lower single-precision (32-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `imm8`, +/// store the result in the lower element of returned vector, +/// and copy the upper 3 packed elements from `a` to the upper elements of +/// returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss +pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { + macro_rules! call { + ($imm8:expr) => { vcmpss(a, b, $imm8) } + } + constify_imm6!(imm8, call) +} + /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. #[inline(always)] @@ -1127,6 +1141,8 @@ extern "C" { fn vcmpps256(a: f32x8, b: f32x8, imm8: u8) -> f32x8; #[link_name = "llvm.x86.sse2.cmp.sd"] fn vcmpsd(a: f64x2, b: f64x2, imm8: u8) -> f64x2; + #[link_name = "llvm.x86.sse.cmp.ss"] + fn vcmpss(a: f32x4, b: f32x4, imm8: u8) -> f32x4; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> f32x8; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] @@ -1599,6 +1615,17 @@ mod tests { assert_eq!(r.extract(1), 9.0); } + #[simd_test = "avx"] + unsafe fn _mm_cmp_ss() { + let a = f32x4::new(4.0, 3.0, 2.0, 5.0); + let b = f32x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm_cmp_ss(a, b, avx::_CMP_GE_OS); + assert!(r.extract(0).is_nan()); + assert_eq!(r.extract(1), 3.0); + assert_eq!(r.extract(2), 2.0); + assert_eq!(r.extract(3), 5.0); + } + #[simd_test = "avx"] unsafe fn _mm256_cvtepi32_pd() { let a = i32x4::new(4, 9, 16, 25); From b1b74ab84469943a1f2fc928a0744360c11d5874 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 14:30:21 +0200 Subject: [PATCH 20/29] avx: 
_mm256_insertf128_pd, _mm256_castpd128_pd256 --- src/x86/avx.rs | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 5cf99bbe2d..28d04c7841 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1073,6 +1073,57 @@ pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 { vbroadcastf128pd256(a) } +/// Copy `a` to result, then insert 128 bits (composed of 2 packed +/// double-precision (64-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] +pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, _mm256_castpd128_pd256(b), [$a, $b, $c, $d]); + } + } + macro_rules! shuffle3 { + ($a:expr, $b: expr, $c: expr) => { + match imm8 & 0x1 { + 0 => shuffle4!($a, $b, $c, 3), + _ => shuffle4!($a, $b, $c, 5), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match imm8 & 0x1 { + 0 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 4), + } + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match imm8 & 0x1 { + 0 => shuffle2!($a, 5), + _ => shuffle2!($a, 1), + } + } + } + match imm8 & 0x1 { + 0 => shuffle1!(4), + _ => shuffle1!(0), + } +} + +/// Casts vector of type __m128d to type __m256d; +/// the upper 128 bits of the result are undefined. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 { + // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) + simd_shuffle4(a, a, [0, 1, 1, 1]) +} + /// Return vector of type `f32x8` with undefined elements. 
#[inline(always)] #[target_feature = "+avx"] @@ -1889,4 +1940,13 @@ mod tests { let e = f64x4::new(4.0, 3.0, 4.0, 3.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insertf128_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x2::new(5.0, 6.0); + let r = avx::_mm256_insertf128_pd(a, b, 0); + let e = f64x4::new(5.0, 6.0, 3.0, 4.0); + assert_eq!(r, e); + } } From c3d109c7e19b91f6f72f06d54ca33f1bb9db2aa1 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 15:58:09 +0200 Subject: [PATCH 21/29] avx: _mm256_insertf128_si256, _mm256_castsi128_si256 --- src/x86/avx.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 28d04c7841..db76e0f262 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1115,6 +1115,47 @@ pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { } } +/// Copy `a` to result, then insert 128 bits from `b` into result +/// at the location specified by `imm8`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] +pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 { + macro_rules! shuffle4 { + ($a:expr, $b:expr, $c:expr, $d:expr) => { + simd_shuffle4(a, _mm256_castsi128_si256(b), [$a, $b, $c, $d]); + } + } + macro_rules! shuffle3 { + ($a:expr, $b: expr, $c: expr) => { + match imm8 & 0x1 { + 0 => shuffle4!($a, $b, $c, 3), + _ => shuffle4!($a, $b, $c, 5), + } + } + } + macro_rules! shuffle2 { + ($a:expr, $b:expr) => { + match imm8 & 0x1 { + 0 => shuffle3!($a, $b, 2), + _ => shuffle3!($a, $b, 4), + } + } + } + macro_rules! shuffle1 { + ($a:expr) => { + match imm8 & 0x1 { + 0 => shuffle2!($a, 5), + _ => shuffle2!($a, 1), + } + } + } + match imm8 & 0x1 { + 0 => shuffle1!(4), + _ => shuffle1!(0), + } +} + /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -1124,6 +1165,15 @@ pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 { simd_shuffle4(a, a, [0, 1, 1, 1]) } +/// Casts vector of type __m128i to type __m256i; +/// the upper 128 bits of the result are undefined. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castsi128_si256(a: i64x2) -> i64x4 { + // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) + simd_shuffle4(a, a, [0, 1, 1, 1]) +} + /// Return vector of type `f32x8` with undefined elements. #[inline(always)] #[target_feature = "+avx"] @@ -1949,4 +1999,13 @@ mod tests { let e = f64x4::new(5.0, 6.0, 3.0, 4.0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insertf128_si256() { + let a = i64x4::new(1, 2, 3, 4); + let b = i64x2::new(5, 6); + let r = avx::_mm256_insertf128_si256(a, b, 0); + let e = i64x4::new(5, 6, 3, 4); + assert_eq!(r, e); + } } From 728e9e7ae5b7a83187865263a3133fc379adb4c6 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:24:09 +0200 Subject: [PATCH 22/29] avx: _mm256_insertf128_ps, _mm256_castps128_ps256 --- src/x86/avx.rs | 101 +++++++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 66 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index db76e0f262..721c1cc5e3 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1073,6 +1073,19 @@ pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 { vbroadcastf128pd256(a) } +/// Copy `a` to result, then insert 128 bits (composed of 4 packed +/// single-precision (32-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] +pub unsafe fn _mm256_insertf128_ps(a: f32x8, b: f32x4, imm8: i32) -> f32x8 { + match imm8 & 1 { + 0 => simd_shuffle8(a, _mm256_castps128_ps256(b), [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, _mm256_castps128_ps256(b), [0, 1, 2, 3, 8, 9, 10, 11]), + } +} + /// Copy `a` to result, then insert 128 bits (composed of 2 packed /// double-precision (64-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. @@ -1080,38 +1093,9 @@ pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 { #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { - macro_rules! shuffle4 { - ($a:expr, $b:expr, $c:expr, $d:expr) => { - simd_shuffle4(a, _mm256_castpd128_pd256(b), [$a, $b, $c, $d]); - } - } - macro_rules! shuffle3 { - ($a:expr, $b: expr, $c: expr) => { - match imm8 & 0x1 { - 0 => shuffle4!($a, $b, $c, 3), - _ => shuffle4!($a, $b, $c, 5), - } - } - } - macro_rules! shuffle2 { - ($a:expr, $b:expr) => { - match imm8 & 0x1 { - 0 => shuffle3!($a, $b, 2), - _ => shuffle3!($a, $b, 4), - } - } - } - macro_rules! shuffle1 { - ($a:expr) => { - match imm8 & 0x1 { - 0 => shuffle2!($a, 5), - _ => shuffle2!($a, 1), - } - } - } - match imm8 & 0x1 { - 0 => shuffle1!(4), - _ => shuffle1!(0), + match imm8 & 1 { + 0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]), + _ => simd_shuffle4(a, _mm256_castpd128_pd256(b), [0, 1, 4, 5]), } } @@ -1121,48 +1105,24 @@ pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 { #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 { - macro_rules! 
shuffle4 { - ($a:expr, $b:expr, $c:expr, $d:expr) => { - simd_shuffle4(a, _mm256_castsi128_si256(b), [$a, $b, $c, $d]); - } - } - macro_rules! shuffle3 { - ($a:expr, $b: expr, $c: expr) => { - match imm8 & 0x1 { - 0 => shuffle4!($a, $b, $c, 3), - _ => shuffle4!($a, $b, $c, 5), - } - } - } - macro_rules! shuffle2 { - ($a:expr, $b:expr) => { - match imm8 & 0x1 { - 0 => shuffle3!($a, $b, 2), - _ => shuffle3!($a, $b, 4), - } - } - } - macro_rules! shuffle1 { - ($a:expr) => { - match imm8 & 0x1 { - 0 => shuffle2!($a, 5), - _ => shuffle2!($a, 1), - } - } - } - match imm8 & 0x1 { - 0 => shuffle1!(4), - _ => shuffle1!(0), + match imm8 & 1 { + 0 => simd_shuffle4(a, _mm256_castsi128_si256(b), [4, 5, 2, 3]), + _ => simd_shuffle4(a, _mm256_castsi128_si256(b), [0, 1, 4, 5]), } } +pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { + // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) + simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) +} + /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. #[inline(always)] #[target_feature = "+avx"] pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 { // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) - simd_shuffle4(a, a, [0, 1, 1, 1]) + simd_shuffle4(a, a, [0, 1, 0, 0]) } /// Casts vector of type __m128i to type __m256i; @@ -1171,7 +1131,7 @@ pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 { #[target_feature = "+avx"] pub unsafe fn _mm256_castsi128_si256(a: i64x2) -> i64x4 { // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) - simd_shuffle4(a, a, [0, 1, 1, 1]) + simd_shuffle4(a, a, [0, 1, 0, 0]) } /// Return vector of type `f32x8` with undefined elements. 
@@ -1991,6 +1951,15 @@ mod tests { assert_eq!(r, e); } + #[simd_test = "avx"] + unsafe fn _mm256_insertf128_ps() { + let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0); + let b = f32x4::new(4.0, 9.0, 16.0, 25.0); + let r = avx::_mm256_insertf128_ps(a, b, 0); + let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 8.0, 9.0, 64.0, 50.0); + assert_eq!(r, e); + } + #[simd_test = "avx"] unsafe fn _mm256_insertf128_pd() { let a = f64x4::new(1.0, 2.0, 3.0, 4.0); From 21fb6e10717aa5f94a7ca0d86362ece68bc1136e Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:41:28 +0200 Subject: [PATCH 23/29] avx: _mm256_insert_epi8 --- src/x86/avx.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 721c1cc5e3..635c9ff057 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1111,6 +1111,15 @@ pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 { } } +/// Copy `a` to result, and insert the 8-bit integer `i` into result +/// at the location specified by `index`. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_insert_epi8(a: i8x32, i: i8, index: i32) -> i8x32 { + let c = a; + c.replace(index as u32, i) +} + pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) @@ -1977,4 +1986,20 @@ mod tests { let e = i64x4::new(5, 6, 3, 4); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insert_epi8() { + let a = i8x32::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32); + let r = avx::_mm256_insert_epi8(a, 0, 31); + let e = i8x32::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 0); + assert_eq!(r, e); + } } From 197a38a1be61b0271380f80dc774ec0296980691 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:48:18 +0200 Subject: [PATCH 24/29] avx: _mm256_insert_epi16 --- src/x86/avx.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 635c9ff057..6b3759bc45 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1117,7 +1117,16 @@ pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 { #[target_feature = "+avx"] pub unsafe fn _mm256_insert_epi8(a: i8x32, i: i8, index: i32) -> i8x32 { let c = a; - c.replace(index as u32, i) + c.replace(index as u32 & 31, i) +} + +/// Copy `a` to result, and insert the 16-bit integer `i` into result +/// at the location specified by `index`. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_insert_epi16(a: i16x16, i: i16, index: i32) -> i16x16 { + let c = a; + c.replace(index as u32 & 15, i) } pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { @@ -2002,4 +2011,16 @@ mod tests { 25, 26, 27, 28, 29, 30, 31, 0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insert_epi16() { + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let r = avx::_mm256_insert_epi16(a, 0, 15); + let e = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 0); + assert_eq!(r, e); + } } From d1a92688bd2051c75cf531f9a679e54e7c69c9f4 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:55:20 +0200 Subject: [PATCH 25/29] avx: _mm256_insert_epi32 --- src/x86/avx.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 6b3759bc45..eb7394161e 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1129,6 +1129,15 @@ pub unsafe fn _mm256_insert_epi16(a: i16x16, i: i16, index: i32) -> i16x16 { c.replace(index as u32 & 15, i) } +/// Copy `a` to result, and insert the 32-bit integer `i` into result +/// at the location specified by `index`. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_insert_epi32(a: i32x8, i: i32, index: i32) -> i32x8 { + let c = a; + c.replace(index as u32 & 7, i) +} + pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) @@ -2023,4 +2032,12 @@ mod tests { 8, 9, 10, 11, 12, 13, 14, 0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insert_epi32() { + let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx::_mm256_insert_epi32(a, 0, 7); + let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 0); + assert_eq!(r, e); + } } From 56b4f9f37da3aada1deefd6fca029bce7f5e0b5a Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 16:59:33 +0200 Subject: [PATCH 26/29] avx: _mm256_insert_epi64 --- src/x86/avx.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index eb7394161e..019656f9c1 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1138,6 +1138,15 @@ pub unsafe fn _mm256_insert_epi32(a: i32x8, i: i32, index: i32) -> i32x8 { c.replace(index as u32 & 7, i) } +/// Copy `a` to result, and insert the 64-bit integer `i` into result +/// at the location specified by `index`. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_insert_epi64(a: i64x4, i: i64, index: i32) -> i64x4 { + let c = a; + c.replace(index as u32 & 3, i) +} + pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) @@ -2040,4 +2049,12 @@ mod tests { let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 0); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_insert_epi64() { + let a = i64x4::new(1, 2, 3, 4); + let r = avx::_mm256_insert_epi64(a, 0, 3); + let e = i64x4::new(1, 2, 3, 0); + assert_eq!(r, e); + } } From 4f998dcca42ac49e5089028302efb08e7637b272 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 19:42:28 +0200 Subject: [PATCH 27/29] Try to fix i586 build --- src/x86/avx.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 019656f9c1..2bec009517 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -7,9 +7,6 @@ use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8}; use v128::{f32x4, f64x2, i32x4, i64x2}; use v256::*; -use x86::sse::_mm_undefined_ps; -use x86::sse2::_mm_undefined_pd; - /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. #[inline(always)] @@ -556,7 +553,7 @@ pub const _CMP_TRUE_US: u8 = 0x1f; /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. #[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse2"] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { macro_rules! call { @@ -582,7 +579,7 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 { /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. 
#[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse"] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { macro_rules! call { @@ -609,7 +606,7 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 { /// store the result in the lower element of returned vector, /// and copy the upper element from `a` to the upper element of returned vector. #[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse2"] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { macro_rules! call { @@ -624,7 +621,7 @@ pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 { /// and copy the upper 3 packed elements from `a` to the upper elements of /// returned vector. #[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse"] #[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 { macro_rules! call { @@ -859,9 +856,11 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `imm8`. #[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse"] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 { + use x86::sse::_mm_undefined_ps; + let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ($a:expr, $b:expr, $c:expr, $d:expr) => { @@ -969,9 +968,11 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `imm8`. 
#[inline(always)] -#[target_feature = "+avx"] +#[target_feature = "+avx,+sse2"] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 { + use x86::sse2::_mm_undefined_pd; + let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle2 { ($a:expr, $b:expr) => { From c18e3d340a1aebcd6d1ce457ed240b42654a53c3 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 8 Oct 2017 19:56:19 +0200 Subject: [PATCH 28/29] Fix missing inline and target_feature --- src/x86/avx.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 2bec009517..163bc8a0cb 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1148,6 +1148,10 @@ pub unsafe fn _mm256_insert_epi64(a: i64x4, i: i64, index: i32) -> i64x4 { c.replace(index as u32 & 3, i) } +/// Casts vector of type __m128 to type __m256; +/// the upper 128 bits of the result are undefined. +#[inline(always)] +#[target_feature = "+avx"] pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) From 5744c9590ebd8b9741efa274ddd933eeacc6bc6d Mon Sep 17 00:00:00 2001 From: gwenn Date: Mon, 9 Oct 2017 18:28:16 +0200 Subject: [PATCH 29/29] sse: fix _mm_undefined_ps --- src/x86/sse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x86/sse.rs b/src/x86/sse.rs index 22a6dd75bb..26127fe2c3 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -872,7 +872,7 @@ pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) { #[inline(always)] #[target_feature = "+sse"] pub unsafe fn _mm_undefined_ps() -> f32x4 { - mem::uninitialized() + f32x4::splat(mem::uninitialized()) } #[allow(improper_ctypes)]