From 56e9370af91d8e2262a902a1cfdfb891206255b6 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 14 Oct 2017 16:28:15 +0200 Subject: [PATCH 01/37] avx: _mm256_movedup_pd --- src/x86/avx.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 4de3ee9dc7..ff31bc760c 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1316,6 +1316,15 @@ pub unsafe fn _mm256_moveldup_ps(a: f32x8) -> f32x8 { simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) } +/// Duplicate even-indexed double-precision (64-bit) floating-point elements +/// from "a", and return the results. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vmovddup))] +pub unsafe fn _mm256_movedup_pd(a: f64x4) -> f64x4 { + simd_shuffle4(a, a, [0, 0, 2, 2]) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -2400,4 +2409,12 @@ mod tests { let e = f32x8::new(1., 1., 3., 3., 5., 5., 7., 7.); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_movedup_pd() { + let a = f64x4::new(1., 2., 3., 4.); + let r = avx::_mm256_movedup_pd(a); + let e = f64x4::new(1., 1., 3., 3.); + assert_eq!(r, e); + } } From 8517ab9fe4afec8a6fca5cdd305d7523c7ae4fc0 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 14 Oct 2017 16:44:12 +0200 Subject: [PATCH 02/37] avx: _mm256_lddqu_si256 --- src/x86/avx.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index ff31bc760c..b5c3283b97 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1325,6 +1325,16 @@ pub unsafe fn _mm256_movedup_pd(a: f64x4) -> f64x4 { simd_shuffle4(a, a, [0, 0, 2, 2]) } +/// Load 256-bits of integer data from unaligned memory into result. +/// This intrinsic may perform better than `_mm256_loadu_si256` when the +/// data crosses a cache line boundary. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vlddqu))] +pub unsafe fn _mm256_lddqu_si256(mem_addr: *const i8x32) -> i8x32 { + vlddqu(mem_addr as *const i8) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1478,6 +1488,8 @@ extern "C" { fn maskloadps(mem_addr: *const i8, mask: i32x4) -> f32x4; #[link_name = "llvm.x86.avx.maskstore.ps"] fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: f32x4); + #[link_name = "llvm.x86.avx.ldu.dq.256"] + fn vlddqu(mem_addr: *const i8) -> i8x32; } #[cfg(test)] @@ -2417,4 +2429,21 @@ mod tests { let e = f64x4::new(1., 1., 3., 3.); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_lddqu_si256() { + let a = i8x32::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32); + let p = &a as *const _; + let r = avx::_mm256_lddqu_si256(black_box(p)); + let e = i8x32::new( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32); + assert_eq!(r, e); + } } From 1c6be55e1fd98c2c536f3a13fa8433f59665122c Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 14 Oct 2017 16:57:47 +0200 Subject: [PATCH 03/37] avx: _mm256_rcp_ps --- src/x86/avx.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index b5c3283b97..a88842b80f 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1335,6 +1335,16 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const i8x32) -> i8x32 { vlddqu(mem_addr as *const i8) } +/// Compute the approximate reciprocal of packed single-precision (32-bit) +/// floating-point elements in `a`, and return the results. The maximum +/// relative error for this approximation is less than 1.5*2^-12. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vrcpps))] +pub unsafe fn _mm256_rcp_ps(a: f32x8) -> f32x8 { + vrcpps(a) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1490,6 +1500,8 @@ extern "C" { fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: f32x4); #[link_name = "llvm.x86.avx.ldu.dq.256"] fn vlddqu(mem_addr: *const i8) -> i8x32; + #[link_name = "llvm.x86.avx.rcp.ps.256"] + fn vrcpps(a: f32x8) -> f32x8; } #[cfg(test)] @@ -2446,4 +2458,13 @@ mod tests { 25, 26, 27, 28, 29, 30, 31, 32); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_rcp_ps() { + let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); + let r = avx::_mm256_rcp_ps(a); + let e = f32x8::new(0.99975586, 0.49987793, 0.33325195, 0.24993896, + 0.19995117, 0.16662598, 0.14282227, 0.12496948); + assert_eq!(r, e); + } } From 5aee0b97bf1b38c87169a956f3e5f8c0ee1b98bb Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 14 Oct 2017 17:05:30 +0200 Subject: [PATCH 04/37] avx: _mm256_rsqrt_ps --- src/x86/avx.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index a88842b80f..bad960ebf1 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1345,6 +1345,16 @@ pub unsafe fn _mm256_rcp_ps(a: f32x8) -> f32x8 { vrcpps(a) } +/// Compute the approximate reciprocal square root of packed single-precision +/// (32-bit) floating-point elements in `a`, and return the results. +/// The maximum relative error for this approximation is less than 1.5*2^-12. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vrsqrtps))] +pub unsafe fn _mm256_rsqrt_ps(a: f32x8) -> f32x8 { + vrsqrtps(a) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -1502,6 +1512,8 @@ extern "C" { fn vlddqu(mem_addr: *const i8) -> i8x32; #[link_name = "llvm.x86.avx.rcp.ps.256"] fn vrcpps(a: f32x8) -> f32x8; + #[link_name = "llvm.x86.avx.rsqrt.ps.256"] + fn vrsqrtps(a: f32x8) -> f32x8; } #[cfg(test)] @@ -2467,4 +2479,13 @@ mod tests { 0.19995117, 0.16662598, 0.14282227, 0.12496948); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_rsqrt_ps() { + let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); + let r = avx::_mm256_rsqrt_ps(a); + let e = f32x8::new(0.99975586, 0.7069092, 0.5772705, 0.49987793, + 0.44714355, 0.40820313, 0.3779297, 0.3534546); + assert_eq!(r, e); + } } From e3a51026119e6a1802e2e14211479150156fd2b3 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 14 Oct 2017 17:14:46 +0200 Subject: [PATCH 05/37] avx: _mm256_unpackhi_pd --- src/x86/avx.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index bad960ebf1..df04712528 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1355,6 +1355,15 @@ pub unsafe fn _mm256_rsqrt_ps(a: f32x8) -> f32x8 { vrsqrtps(a) } +/// Unpack and interleave double-precision (64-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub unsafe fn _mm256_unpackhi_pd(a: f64x4, b: f64x4) -> f64x4 { + simd_shuffle4(a, b, [1, 5, 3, 7]) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -2488,4 +2497,13 @@ mod tests { 0.44714355, 0.40820313, 0.3779297, 0.3534546); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_unpackhi_pd() { + let a = f64x4::new(1., 2., 3., 4.); + let b = f64x4::new(5., 6., 7., 8.); + let r = avx::_mm256_unpackhi_pd(a, b); + let e = f64x4::new(2., 6., 4., 8.); + assert_eq!(r, e); + } } From 01c3227276809fc0b3c1cacbd21e2442c10e6e63 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 14 Oct 2017 17:25:26 +0200 Subject: [PATCH 06/37] avx: _mm256_unpackhi_ps --- src/x86/avx.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index df04712528..d65de5baad 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1364,6 +1364,15 @@ pub unsafe fn _mm256_unpackhi_pd(a: f64x4, b: f64x4) -> f64x4 { simd_shuffle4(a, b, [1, 5, 3, 7]) } +/// Unpack and interleave single-precision (32-bit) floating-point elements +/// from the high half of each 128-bit lane in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub unsafe fn _mm256_unpackhi_ps(a: f32x8, b: f32x8) -> f32x8 { + simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -2506,4 +2515,13 @@ mod tests { let e = f64x4::new(2., 6., 4., 8.); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_unpackhi_ps() { + let a = f32x8::new(1., 2., 3., 4., 1., 2., 3., 4.); + let b = f32x8::new(5., 6., 7., 8., 5., 6., 7., 8.); + let r = avx::_mm256_unpackhi_ps(a, b); + let e = f32x8::new(3., 7., 4., 8., 3., 7., 4., 8.); + assert_eq!(r, e); + } } From 3060970e18c29b30a776091b1d81831aaf324618 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 14 Oct 2017 17:40:00 +0200 Subject: [PATCH 07/37] avx: _mm256_unpacklo_pd, _mm256_unpacklo_ps --- src/x86/avx.rs | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index d65de5baad..96b3587d62 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1373,6 +1373,24 @@ pub unsafe fn _mm256_unpackhi_ps(a: f32x8, b: f32x8) -> f32x8 { simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) } +/// Unpack and interleave double-precision (64-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub unsafe fn _mm256_unpacklo_pd(a: f64x4, b: f64x4) -> f64x4 { + simd_shuffle4(a, b, [0, 4, 2, 6]) +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements +/// from the low half of each 128-bit lane in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub unsafe fn _mm256_unpacklo_ps(a: f32x8, b: f32x8) -> f32x8 { + simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -2518,10 +2536,28 @@ mod tests { #[simd_test = "avx"] unsafe fn _mm256_unpackhi_ps() { - let a = f32x8::new(1., 2., 3., 4., 1., 2., 3., 4.); - let b = f32x8::new(5., 6., 7., 8., 5., 6., 7., 8.); + let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); + let b = f32x8::new(9., 10., 11., 12., 13., 14., 15., 16.); let r = avx::_mm256_unpackhi_ps(a, b); - let e = f32x8::new(3., 7., 4., 8., 3., 7., 4., 8.); + let e = f32x8::new(3., 11., 4., 12., 7., 15., 8., 16.); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_unpacklo_pd() { + let a = f64x4::new(1., 2., 3., 4.); + let b = f64x4::new(5., 6., 7., 8.); + let r = avx::_mm256_unpacklo_pd(a, b); + let e = f64x4::new(1., 5., 3., 7.); + assert_eq!(r, e); + } + + #[simd_test = "avx"] + unsafe fn _mm256_unpacklo_ps() { + let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); + let b = f32x8::new(9., 10., 11., 12., 13., 14., 15., 16.); + let r = avx::_mm256_unpacklo_ps(a, b); + let e = f32x8::new(1., 9., 2., 10., 5., 13., 6., 14.); assert_eq!(r, e); } } From 09d480a5b04f05fc84da554ded6fad5780b8f9ed Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 08:03:19 +0200 Subject: [PATCH 08/37] avx: _mm256_testz_si256 --- src/x86/avx.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 96b3587d62..10e232458e 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1391,6 +1391,17 @@ pub unsafe fn _mm256_unpacklo_ps(a: f32x8, b: f32x8) -> f32x8 { simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) } +/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vptest))] +pub unsafe fn _mm256_testz_si256(a: i64x4, b: i64x4) -> i32 { + ptestz256(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1550,6 +1561,8 @@ extern "C" { fn vrcpps(a: f32x8) -> f32x8; #[link_name = "llvm.x86.avx.rsqrt.ps.256"] fn vrsqrtps(a: f32x8) -> f32x8; + #[link_name = "llvm.x86.avx.ptestz.256"] + fn ptestz256(a: i64x4, b: i64x4) -> i32; } #[cfg(test)] @@ -2560,4 +2573,15 @@ mod tests { let e = f32x8::new(1., 9., 2., 10., 5., 13., 6., 14.); assert_eq!(r, e); } + + #[simd_test = "avx"] + unsafe fn _mm256_testz_si256() { + let a = i64x4::new(1, 2, 3, 4); + let b = i64x4::new(5, 6, 7, 8); + let r = avx::_mm256_testz_si256(a, b); + assert_eq!(r, 0); + let b = i64x4::splat(0); + let r = avx::_mm256_testz_si256(a, b); + assert_eq!(r, 1); + } } From 35dbf8e495b982eaa89bbbc6710f205697db4cb4 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 08:12:05 +0200 Subject: [PATCH 09/37] avx: _mm256_testc_si256 --- src/x86/avx.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 10e232458e..efa72ed5c6 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1402,6 +1402,17 @@ pub unsafe fn _mm256_testz_si256(a: i64x4, b: i64x4) -> i32 { ptestz256(a, b) } +/// Compute the bitwise AND of 256 bits (representing integer data) in "a" and +/// "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. +/// Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if +/// the result is zero, otherwise set "CF" to 0. Return the "CF" value. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vptest))] +pub unsafe fn _mm256_testc_si256(a: i64x4, b: i64x4) -> i32 { + ptestc256(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -1563,6 +1574,8 @@ extern "C" { fn vrsqrtps(a: f32x8) -> f32x8; #[link_name = "llvm.x86.avx.ptestz.256"] fn ptestz256(a: i64x4, b: i64x4) -> i32; + #[link_name = "llvm.x86.avx.ptestc.256"] + fn ptestc256(a: i64x4, b: i64x4) -> i32; } #[cfg(test)] @@ -2584,4 +2597,15 @@ mod tests { let r = avx::_mm256_testz_si256(a, b); assert_eq!(r, 1); } + + #[simd_test = "avx"] + unsafe fn _mm256_testc_si256() { + let a = i64x4::new(1, 2, 3, 4); + let b = i64x4::new(5, 6, 7, 8); + let r = avx::_mm256_testc_si256(a, b); + assert_eq!(r, 0); + let b = i64x4::splat(0); + let r = avx::_mm256_testc_si256(a, b); + assert_eq!(r, 1); + } } From dd4ef5c91b6dfb82ee90529d5c31607cd20b34ba Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 08:24:15 +0200 Subject: [PATCH 10/37] avx: _mm256_testz_pd --- src/x86/avx.rs | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index efa72ed5c6..f668740cf0 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1402,10 +1402,10 @@ pub unsafe fn _mm256_testz_si256(a: i64x4, b: i64x4) -> i32 { ptestz256(a, b) } -/// Compute the bitwise AND of 256 bits (representing integer data) in "a" and -/// "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. -/// Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if -/// the result is zero, otherwise set "CF" to 0. Return the "CF" value. +/// Compute the bitwise AND of 256 bits (representing integer data) in `a` and +/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +/// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +/// the result is zero, otherwise set `CF` to 0. Return the `CF` value. 
#[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vptest))] @@ -1413,6 +1413,20 @@ pub unsafe fn _mm256_testc_si256(a: i64x4, b: i64x4) -> i32 { ptestc256(a, b) } +/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestpd))] +pub unsafe fn _mm256_testz_pd(a: f64x4, b: f64x4) -> i32 { + vtestzpd256(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1576,6 +1590,8 @@ extern "C" { fn ptestz256(a: i64x4, b: i64x4) -> i32; #[link_name = "llvm.x86.avx.ptestc.256"] fn ptestc256(a: i64x4, b: i64x4) -> i32; + #[link_name = "llvm.x86.avx.vtestz.pd.256"] + fn vtestzpd256(a: f64x4, b: f64x4) -> i32; } #[cfg(test)] @@ -2608,4 +2624,15 @@ mod tests { let r = avx::_mm256_testc_si256(a, b); assert_eq!(r, 1); } + + #[simd_test = "avx"] + unsafe fn _mm256_testz_pd() { + let a = f64x4::new(1., 2., 3., 4.); + let b = f64x4::new(5., 6., 7., 8.); + let r = avx::_mm256_testz_pd(a, b); + assert_eq!(r, 1); + let a = f64x4::splat(-1.); + let r = avx::_mm256_testz_pd(a, a); + assert_eq!(r, 0); + } } From fddb670a671b28336e31222fe725a9cef0eb8c44 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 08:31:18 +0200 Subject: [PATCH 11/37] avx: _mm256_testc_pd --- src/x86/avx.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index f668740cf0..d01bf4328a 100644 --- 
a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1427,6 +1427,20 @@ pub unsafe fn _mm256_testz_pd(a: f64x4, b: f64x4) -> i32 { vtestzpd256(a, b) } +/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestpd))] +pub unsafe fn _mm256_testc_pd(a: f64x4, b: f64x4) -> i32 { + vtestcpd256(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1592,6 +1606,8 @@ extern "C" { fn ptestc256(a: i64x4, b: i64x4) -> i32; #[link_name = "llvm.x86.avx.vtestz.pd.256"] fn vtestzpd256(a: f64x4, b: f64x4) -> i32; + #[link_name = "llvm.x86.avx.vtestc.pd.256"] + fn vtestcpd256(a: f64x4, b: f64x4) -> i32; } #[cfg(test)] @@ -2635,4 +2651,15 @@ mod tests { let r = avx::_mm256_testz_pd(a, a); assert_eq!(r, 0); } + + #[simd_test = "avx"] + unsafe fn _mm256_testc_pd() { + let a = f64x4::new(1., 2., 3., 4.); + let b = f64x4::new(5., 6., 7., 8.); + let r = avx::_mm256_testc_pd(a, b); + assert_eq!(r, 1); + let b = f64x4::splat(0.); + let r = avx::_mm256_testc_pd(a, b); + assert_eq!(r, 1); + } } From c0961f2b716a695209eee6a72bc7a3049a50276f Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 08:42:04 +0200 Subject: [PATCH 12/37] avx: _mm256_testnzc_pd --- src/x86/avx.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index d01bf4328a..d6ced27a89 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1441,6 
+1441,21 @@ pub unsafe fn _mm256_testc_pd(a: f64x4, b: f64x4) -> i32 { vtestcpd256(a, b) } +/// Compute the bitwise AND of 256 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestpd))] +pub unsafe fn _mm256_testnzc_pd(a: f64x4, b: f64x4) -> i32 { + vtestnzcpd256(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1608,6 +1623,8 @@ extern "C" { fn vtestzpd256(a: f64x4, b: f64x4) -> i32; #[link_name = "llvm.x86.avx.vtestc.pd.256"] fn vtestcpd256(a: f64x4, b: f64x4) -> i32; + #[link_name = "llvm.x86.avx.vtestnzc.pd.256"] + fn vtestnzcpd256(a: f64x4, b: f64x4) -> i32; } #[cfg(test)] @@ -2662,4 +2679,16 @@ mod tests { let r = avx::_mm256_testc_pd(a, b); assert_eq!(r, 1); } + + #[simd_test = "avx"] + unsafe fn _mm256_testnzc_pd() { + let a = f64x4::new(1., 2., 3., 4.); + let b = f64x4::new(5., 6., 7., 8.); + let r = avx::_mm256_testnzc_pd(a, b); + assert_eq!(r, 0); + let a = f64x4::new(1., -1., -1., -1.); + let b = f64x4::new(-1., -1., 1., 1.); + let r = avx::_mm256_testnzc_pd(a, b); + assert_eq!(r, 1); + } } From f4da29bda8e4e0c869cbf2ce90bd4546533baf8f Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 08:47:23 +0200 Subject: [PATCH 13/37] avx: _mm_testz_pd --- src/x86/avx.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 
d6ced27a89..b68d2c8f45 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1456,6 +1456,20 @@ pub unsafe fn _mm256_testnzc_pd(a: f64x4, b: f64x4) -> i32 { vtestnzcpd256(a, b) } +/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestpd))] +pub unsafe fn _mm_testz_pd(a: f64x2, b: f64x2) -> i32 { + vtestzpd(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1625,6 +1639,8 @@ extern "C" { fn vtestcpd256(a: f64x4, b: f64x4) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.pd.256"] fn vtestnzcpd256(a: f64x4, b: f64x4) -> i32; + #[link_name = "llvm.x86.avx.vtestz.pd"] + fn vtestzpd(a: f64x2, b: f64x2) -> i32; } #[cfg(test)] @@ -2690,5 +2706,16 @@ mod tests { let b = f64x4::new(-1., -1., 1., 1.); let r = avx::_mm256_testnzc_pd(a, b); assert_eq!(r, 1); + + #[simd_test = "avx"] + unsafe fn _mm_testz_pd() { + let a = f64x2::new(1., 2.); + let b = f64x2::new(5., 6.); + let r = avx::_mm_testz_pd(a, b); + assert_eq!(r, 1); + let a = f64x2::splat(-1.); + let r = avx::_mm_testz_pd(a, a); + assert_eq!(r, 0); + } } } From b59393b85842b619e59989cf7d774339f4da099f Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 08:56:11 +0200 Subject: [PATCH 14/37] avx: _mm_testc_pd --- src/x86/avx.rs | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 
b68d2c8f45..ba54541993 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1470,6 +1470,20 @@ pub unsafe fn _mm_testz_pd(a: f64x2, b: f64x2) -> i32 { vtestzpd(a, b) } +/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestpd))] +pub unsafe fn _mm_testc_pd(a: f64x2, b: f64x2) -> i32 { + vtestcpd(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1641,6 +1655,8 @@ extern "C" { fn vtestnzcpd256(a: f64x4, b: f64x4) -> i32; #[link_name = "llvm.x86.avx.vtestz.pd"] fn vtestzpd(a: f64x2, b: f64x2) -> i32; + #[link_name = "llvm.x86.avx.vtestc.pd"] + fn vtestcpd(a: f64x2, b: f64x2) -> i32; } #[cfg(test)] @@ -2691,9 +2707,10 @@ mod tests { let b = f64x4::new(5., 6., 7., 8.); let r = avx::_mm256_testc_pd(a, b); assert_eq!(r, 1); - let b = f64x4::splat(0.); + let a = f64x4::splat(1.); + let b = f64x4::splat(-1.); let r = avx::_mm256_testc_pd(a, b); - assert_eq!(r, 1); + assert_eq!(r, 0); } #[simd_test = "avx"] @@ -2706,6 +2723,7 @@ mod tests { let b = f64x4::new(-1., -1., 1., 1.); let r = avx::_mm256_testnzc_pd(a, b); assert_eq!(r, 1); + } #[simd_test = "avx"] unsafe fn _mm_testz_pd() { @@ -2717,5 +2735,16 @@ mod tests { let r = avx::_mm_testz_pd(a, a); assert_eq!(r, 0); } + + #[simd_test = "avx"] + unsafe fn _mm_testc_pd() { + let a = f64x2::new(1., 2.); + let b = f64x2::new(5., 6.); + let r = avx::_mm_testc_pd(a, b); + assert_eq!(r, 1); + 
let a = f64x2::splat(1.); + let b = f64x2::splat(-1.); + let r = avx::_mm_testc_pd(a, b); + assert_eq!(r, 0); } } From ed3b7ca4c08ac3a42022b9e8e694130e777bfa91 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 09:01:38 +0200 Subject: [PATCH 15/37] avx: _mm_testnzc_pd --- src/x86/avx.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index ba54541993..180a1331b0 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1484,6 +1484,21 @@ pub unsafe fn _mm_testc_pd(a: f64x2, b: f64x2) -> i32 { vtestcpd(a, b) } +/// Compute the bitwise AND of 128 bits (representing double-precision (64-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestpd))] +pub unsafe fn _mm_testnzc_pd(a: f64x2, b: f64x2) -> i32 { + vtestnzcpd(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -1657,6 +1672,8 @@ extern "C" { fn vtestzpd(a: f64x2, b: f64x2) -> i32; #[link_name = "llvm.x86.avx.vtestc.pd"] fn vtestcpd(a: f64x2, b: f64x2) -> i32; + #[link_name = "llvm.x86.avx.vtestnzc.pd"] + fn vtestnzcpd(a: f64x2, b: f64x2) -> i32; } #[cfg(test)] @@ -2747,4 +2764,16 @@ mod tests { let r = avx::_mm_testc_pd(a, b); assert_eq!(r, 0); } + + #[simd_test = "avx"] + unsafe fn _mm_testnzc_pd() { + let a = f64x2::new(1., 2.); + let b = f64x2::new(5., 6.); + let r = avx::_mm_testnzc_pd(a, b); + assert_eq!(r, 0); + let a = f64x2::new(1., -1.); + let b = f64x2::new(-1., -1.); + let r = avx::_mm_testnzc_pd(a, b); + assert_eq!(r, 1); + } } From f4ca21e63621e7485b5ea28de6dd9e9178f777db Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 09:20:24 +0200 Subject: [PATCH 16/37] avx: _mm256_testz_ps, _mm256_testc_ps, _mm256_testnzc_ps --- src/x86/avx.rs | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 180a1331b0..5c24f84940 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1499,6 +1499,49 @@ pub unsafe fn _mm_testnzc_pd(a: f64x2, b: f64x2) -> i32 { vtestnzcpd(a, b) } +/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestps))] +pub unsafe fn _mm256_testz_ps(a: f32x8, b: f32x8) -> i32 { + vtestzps256(a, b) +} + +/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestps))] +pub unsafe fn _mm256_testc_ps(a: f32x8, b: f32x8) -> i32 { + vtestcps256(a, b) +} + +/// Compute the bitwise AND of 256 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestps))] +pub unsafe fn _mm256_testnzc_ps(a: f32x8, b: f32x8) -> i32 { + vtestnzcps256(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -1674,6 +1717,12 @@ extern "C" { fn vtestcpd(a: f64x2, b: f64x2) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.pd"] fn vtestnzcpd(a: f64x2, b: f64x2) -> i32; + #[link_name = "llvm.x86.avx.vtestz.ps.256"] + fn vtestzps256(a: f32x8, b: f32x8) -> i32; + #[link_name = "llvm.x86.avx.vtestc.ps.256"] + fn vtestcps256(a: f32x8, b: f32x8) -> i32; + #[link_name = "llvm.x86.avx.vtestnzc.ps.256"] + fn vtestnzcps256(a: f32x8, b: f32x8) -> i32; } #[cfg(test)] @@ -2776,4 +2825,35 @@ mod tests { let r = avx::_mm_testnzc_pd(a, b); assert_eq!(r, 1); } + + #[simd_test = "avx"] + unsafe fn _mm256_testz_ps() { + let a = f32x8::splat(1.); + let r = avx::_mm256_testz_ps(a, a); + assert_eq!(r, 1); + let a = f32x8::splat(-1.); + let r = avx::_mm256_testz_ps(a, a); + assert_eq!(r, 0); + } + + #[simd_test = "avx"] + unsafe fn _mm256_testc_ps() { + let a = f32x8::splat(1.); + let r = avx::_mm256_testc_ps(a, a); + assert_eq!(r, 1); + let b = f32x8::splat(-1.); + let r = avx::_mm256_testc_ps(a, b); + assert_eq!(r, 0); + } + + #[simd_test = "avx"] + unsafe fn _mm256_testnzc_ps() { + let a = f32x8::splat(1.); + let r = avx::_mm256_testnzc_ps(a, a); + assert_eq!(r, 0); + let a = f32x8::new(1., -1., -1., -1., -1., -1., -1., -1.); + let b = f32x8::new(-1., -1., 1., 1., 1., 1., 1., 1.); + let r = avx::_mm256_testnzc_ps(a, b); + assert_eq!(r, 1); + } } From 981892f3c9f49586789b33c68378aaa012482020 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 09:30:10 +0200 Subject: [PATCH 17/37] avx: _mm_testz_ps, _mm_testc_ps, _mm_testnzc_ps --- src/x86/avx.rs | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 5c24f84940..b661184bef 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1542,6 +1542,49 @@ pub unsafe fn _mm256_testnzc_ps(a: f32x8, b: f32x8) -> i32 { vtestnzcps256(a, b) } +/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) 
in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `ZF` value. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestps))] +pub unsafe fn _mm_testz_ps(a: f32x4, b: f32x4) -> i32 { + vtestzps(a, b) +} + +/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return the `CF` value. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestps))] +pub unsafe fn _mm_testc_ps(a: f32x4, b: f32x4) -> i32 { + vtestcps(a, b) +} + +/// Compute the bitwise AND of 128 bits (representing single-precision (32-bit) +/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit +/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +/// NOT of `a` and then AND with `b`, producing an intermediate value, and set +/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +/// are zero, otherwise return 0. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vtestps))] +pub unsafe fn _mm_testnzc_ps(a: f32x4, b: f32x4) -> i32 { + vtestnzcps(a, b) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1723,6 +1766,12 @@ extern "C" { fn vtestcps256(a: f32x8, b: f32x8) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.ps.256"] fn vtestnzcps256(a: f32x8, b: f32x8) -> i32; + #[link_name = "llvm.x86.avx.vtestz.ps"] + fn vtestzps(a: f32x4, b: f32x4) -> i32; + #[link_name = "llvm.x86.avx.vtestc.ps"] + fn vtestcps(a: f32x4, b: f32x4) -> i32; + #[link_name = "llvm.x86.avx.vtestnzc.ps"] + fn vtestnzcps(a: f32x4, b: f32x4) -> i32; } #[cfg(test)] @@ -2856,4 +2905,35 @@ mod tests { let r = avx::_mm256_testnzc_ps(a, b); assert_eq!(r, 1); } + + #[simd_test = "avx"] + unsafe fn _mm_testz_ps() { + let a = f32x4::splat(1.); + let r = avx::_mm_testz_ps(a, a); + assert_eq!(r, 1); + let a = f32x4::splat(-1.); + let r = avx::_mm_testz_ps(a, a); + assert_eq!(r, 0); + } + + #[simd_test = "avx"] + unsafe fn _mm_testc_ps() { + let a = f32x4::splat(1.); + let r = avx::_mm_testc_ps(a, a); + assert_eq!(r, 1); + let b = f32x4::splat(-1.); + let r = avx::_mm_testc_ps(a, b); + assert_eq!(r, 0); + } + + #[simd_test = "avx"] + unsafe fn _mm_testnzc_ps() { + let a = f32x4::splat(1.); + let r = avx::_mm_testnzc_ps(a, a); + assert_eq!(r, 0); + let a = f32x4::new(1., -1., -1., -1.); + let b = f32x4::new(-1., -1., 1., 1.); + let r = avx::_mm_testnzc_ps(a, b); + assert_eq!(r, 1); + } } From d97ee09e728a978b3c5921c24e2c110412a15d70 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 09:59:45 +0200 Subject: [PATCH 18/37] avx: _mm256_movemask_pd, _mm256_movemask_ps --- src/x86/avx.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index b661184bef..1434450d34 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1585,6 +1585,23 @@ pub unsafe fn 
_mm_testnzc_ps(a: f32x4, b: f32x4) -> i32 { vtestnzcps(a, b) } +/// Set each bit of the returned mask based on the most significant bit of the +/// corresponding packed double-precision (64-bit) floating-point element in `a`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vmovmskpd))] +pub unsafe fn _mm256_movemask_pd(a: f64x4) -> i32 { + movmskpd256(a) +} + +/// +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vmovmskps))] +pub unsafe fn _mm256_movemask_ps(a: f32x8) -> i32 { + movmskps256(a) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -1772,6 +1789,10 @@ extern "C" { fn vtestcps(a: f32x4, b: f32x4) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.ps"] fn vtestnzcps(a: f32x4, b: f32x4) -> i32; + #[link_name = "llvm.x86.avx.movmsk.pd.256"] + fn movmskpd256(a: f64x4) -> i32; + #[link_name = "llvm.x86.avx.movmsk.ps.256"] + fn movmskps256(a: f32x8) -> i32; } #[cfg(test)] @@ -2936,4 +2957,18 @@ mod tests { let r = avx::_mm_testnzc_ps(a, b); assert_eq!(r, 1); } + + #[simd_test = "avx"] + unsafe fn _mm256_movemask_pd() { + let a = f64x4::new(1., -2., 3., -4.); + let r = avx::_mm256_movemask_pd(a); + assert_eq!(r, 0xA); + } + + #[simd_test = "avx"] + unsafe fn _mm256_movemask_ps() { + let a = f32x8::new(1., -2., 3., -4., 1., -2., 3., -4.); + let r = avx::_mm256_movemask_ps(a); + assert_eq!(r, 0xAA); + } } From 424775fd2627ed55420f9c0998d6aa09ee498ab1 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 12:22:36 +0200 Subject: [PATCH 19/37] avx: _mm256_setzero_pd, _mm256_setzero_ps --- src/x86/avx.rs | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 1434450d34..4e2e39fc75 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1594,7 +1594,8 @@ pub unsafe fn _mm256_movemask_pd(a: f64x4) -> i32 { movmskpd256(a) } -/// +/// Set each bit of the returned 
mask based on the most significant bit of the +/// corresponding packed single-precision (32-bit) floating-point element in `a`. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vmovmskps))] @@ -1602,6 +1603,22 @@ pub unsafe fn _mm256_movemask_ps(a: f32x8) -> i32 { movmskps256(a) } +/// Return vector of type __m256d with all elements set to zero. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected +pub unsafe fn _mm256_setzero_pd() -> f64x4 { + f64x4::new(0., 0., 0., 0.) +} + +/// Return vector of type __m256 with all elements set to zero. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm256_setzero_ps() -> f32x8 { + f32x8::new(0., 0., 0., 0., 0., 0., 0., 0.) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -2971,4 +2988,16 @@ mod tests { let r = avx::_mm256_movemask_ps(a); assert_eq!(r, 0xAA); } + + #[simd_test = "avx"] + unsafe fn _mm256_setzero_pd() { + let r = avx::_mm256_setzero_pd(); + assert_eq!(r, f64x4::splat(0.)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_setzero_ps() { + let r = avx::_mm256_setzero_ps(); + assert_eq!(r, f32x8::splat(0.)); + } } From 668988c9452ee15d549b6a4fbfc072f980caeaf4 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 12:29:32 +0200 Subject: [PATCH 20/37] avx: _mm256_setzero_si256 --- src/x86/avx.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 4e2e39fc75..c2afae3f05 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1619,6 +1619,14 @@ pub unsafe fn _mm256_setzero_ps() -> f32x8 { f32x8::new(0., 0., 0., 0., 0., 0., 0., 0.) } +/// Return vector of type __m256i with all elements set to zero. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vxor))] +pub unsafe fn _mm256_setzero_si256() -> i64x4 { + i64x4::new(0, 0, 0, 0) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -3000,4 +3008,10 @@ mod tests { let r = avx::_mm256_setzero_ps(); assert_eq!(r, f32x8::splat(0.)); } + + #[simd_test = "avx"] + unsafe fn _mm256_setzero_si256() { + let r = avx::_mm256_setzero_si256(); + assert_eq!(r, i64x4::splat(0)); + } } From 2e9ec162d1cd839de5ac22655f996faf6567ecdf Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 12:44:22 +0200 Subject: [PATCH 21/37] avx: _mm256_set_pd, _mm256_set_ps --- src/x86/avx.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index c2afae3f05..442a4f6c3e 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1627,6 +1627,23 @@ pub unsafe fn _mm256_setzero_si256() -> i64x4 { i64x4::new(0, 0, 0, 0) } +/// Set packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { + f64x4::new(d, c, b, a) +} + +/// Set packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, + e: f32, f: f32, g: f32, h: f32) -> f32x8 { + f32x8::new(h, g, f, e, d, c, b, a) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -3014,4 +3031,16 @@ mod tests { let r = avx::_mm256_setzero_si256(); assert_eq!(r, i64x4::splat(0)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set_pd() { + let r = avx::_mm256_set_pd(1., 2., 3., 4.); + assert_eq!(r, f64x4::new(4., 3., 2., 1.)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_set_ps() { + let r = avx::_mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq!(r, f32x8::new(8., 7., 6., 5., 4., 3., 2., 1.)); + } } From 502a814ff807eae466f46ca4f4c9905a34cf28b6 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 13:13:23 +0200 Subject: [PATCH 22/37] avx: _mm256_set_epi8 --- src/x86/avx.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 442a4f6c3e..a36e209c41 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1644,6 +1644,28 @@ pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, f32x8::new(h, g, f, e, d, c, b, a) } +/// Set packed 8-bit integers in returned vector with the supplied values in +/// reverse order. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_set_epi8(a00: i8, a01: i8, a02: i8, a03: i8, + a04: i8, a05: i8, a06: i8, a07: i8, + a08: i8, a09: i8, a10: i8, a11: i8, + a12: i8, a13: i8, a14: i8, a15: i8, + a16: i8, a17: i8, a18: i8, a19: i8, + a20: i8, a21: i8, a22: i8, a23: i8, + a24: i8, a25: i8, a26: i8, a27: i8, + a28: i8, a29: i8, a30: i8, a31: i8) -> i8x32 { + i8x32::new(a31, a30, a29, a28, + a27, a26, a25, a24, + a23, a22, a21, a20, + a19, a18, a17, a16, + a15, a14, a13, a12, + a11, a10, a09, a08, + a07, a06, a05, a04, + a03, a02, a01, a00) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -3043,4 +3065,17 @@ mod tests { let r = avx::_mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); assert_eq!(r, f32x8::new(8., 7., 6., 5., 4., 3., 2., 1.)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set_epi8() { + let r = avx::_mm256_set_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32); + assert_eq!(r, i8x32::new(32, 31, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1)); + } } From 56700ef0b28c925b4710093dabebc7fc52566bf1 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 13:21:32 +0200 Subject: [PATCH 23/37] avx: _mm256_set_epi16 --- src/x86/avx.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index a36e209c41..f33165038e 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1666,6 +1666,19 @@ pub unsafe fn _mm256_set_epi8(a00: i8, a01: i8, a02: i8, a03: i8, a03, a02, a01, a00) } +/// Set packed 16-bit integers in "dst" with the supplied values. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_set_epi16(a00: i16, a01: i16, a02: i16, a03: i16, + a04: i16, a05: i16, a06: i16, a07: i16, + a08: i16, a09: i16, a10: i16, a11: i16, + a12: i16, a13: i16, a14: i16, a15: i16) -> i16x16 { + i16x16::new(a15, a14, a13, a12, + a11, a10, a09, a08, + a07, a06, a05, a04, + a03, a02, a01, a00) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -3078,4 +3091,13 @@ mod tests { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set_epi16() { + let r = avx::_mm256_set_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq!(r, i16x16::new(16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1)); + } } From 40000c9cc269a5c71ec01cea6126049714d34575 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 13:27:49 +0200 Subject: [PATCH 24/37] avx: _mm256_set_epi32 --- src/x86/avx.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index f33165038e..b7107cad00 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1666,7 +1666,7 @@ pub unsafe fn _mm256_set_epi8(a00: i8, a01: i8, a02: i8, a03: i8, a03, a02, a01, a00) } -/// Set packed 16-bit integers in "dst" with the supplied values. +/// Set packed 16-bit integers in returned vector with the supplied values. #[inline(always)] #[target_feature = "+avx"] pub unsafe fn _mm256_set_epi16(a00: i16, a01: i16, a02: i16, a03: i16, @@ -1679,6 +1679,15 @@ pub unsafe fn _mm256_set_epi16(a00: i16, a01: i16, a02: i16, a03: i16, a03, a02, a01, a00) } +/// Set packed 32-bit integers in returned vector with the supplied values. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_set_epi32(a0: i32, a1: i32, a2: i32, a3: i32, + a4: i32, a5: i32, a6: i32, a7: i32) -> i32x8 { + i32x8::new(a7, a6, a5, a4, + a3, a2, a1, a0) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -3100,4 +3109,11 @@ mod tests { assert_eq!(r, i16x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set_epi32() { + let r = avx::_mm256_set_epi32( + 1, 2, 3, 4, 5, 6, 7, 8); + assert_eq!(r, i32x8::new(8, 7, 6, 5, 4, 3, 2, 1)); + } } From b82692c26c91fec47ed72cee9bf83f47c9db9fc8 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 13:36:40 +0200 Subject: [PATCH 25/37] avx: _mm256_set_epi64x --- src/x86/avx.rs | 69 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index b7107cad00..69e82b55e5 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1648,44 +1648,51 @@ pub unsafe fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, /// reverse order. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_set_epi8(a00: i8, a01: i8, a02: i8, a03: i8, - a04: i8, a05: i8, a06: i8, a07: i8, - a08: i8, a09: i8, a10: i8, a11: i8, - a12: i8, a13: i8, a14: i8, a15: i8, - a16: i8, a17: i8, a18: i8, a19: i8, - a20: i8, a21: i8, a22: i8, a23: i8, - a24: i8, a25: i8, a26: i8, a27: i8, - a28: i8, a29: i8, a30: i8, a31: i8) -> i8x32 { - i8x32::new(a31, a30, a29, a28, - a27, a26, a25, a24, - a23, a22, a21, a20, - a19, a18, a17, a16, - a15, a14, a13, a12, - a11, a10, a09, a08, - a07, a06, a05, a04, - a03, a02, a01, a00) +pub unsafe fn _mm256_set_epi8(e00: i8, e01: i8, e02: i8, e03: i8, + e04: i8, e05: i8, e06: i8, e07: i8, + e08: i8, e09: i8, e10: i8, e11: i8, + e12: i8, e13: i8, e14: i8, e15: i8, + e16: i8, e17: i8, e18: i8, e19: i8, + e20: i8, e21: i8, e22: i8, e23: i8, + e24: i8, e25: i8, e26: i8, e27: i8, + e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 { + i8x32::new(e31, e30, e29, e28, + e27, e26, e25, e24, + e23, e22, e21, e20, + e19, e18, e17, e16, + e15, e14, e13, e12, + e11, e10, e09, e08, + e07, e06, e05, e04, + e03, e02, e01, e00) } /// Set packed 16-bit integers in returned vector with 
the supplied values. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_set_epi16(a00: i16, a01: i16, a02: i16, a03: i16, - a04: i16, a05: i16, a06: i16, a07: i16, - a08: i16, a09: i16, a10: i16, a11: i16, - a12: i16, a13: i16, a14: i16, a15: i16) -> i16x16 { - i16x16::new(a15, a14, a13, a12, - a11, a10, a09, a08, - a07, a06, a05, a04, - a03, a02, a01, a00) +pub unsafe fn _mm256_set_epi16(e00: i16, e01: i16, e02: i16, e03: i16, + e04: i16, e05: i16, e06: i16, e07: i16, + e08: i16, e09: i16, e10: i16, e11: i16, + e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 { + i16x16::new(e15, e14, e13, e12, + e11, e10, e09, e08, + e07, e06, e05, e04, + e03, e02, e01, e00) } /// Set packed 32-bit integers in returned vector with the supplied values. #[inline(always)] #[target_feature = "+avx"] -pub unsafe fn _mm256_set_epi32(a0: i32, a1: i32, a2: i32, a3: i32, - a4: i32, a5: i32, a6: i32, a7: i32) -> i32x8 { - i32x8::new(a7, a6, a5, a4, - a3, a2, a1, a0) +pub unsafe fn _mm256_set_epi32(e0: i32, e1: i32, e2: i32, e3: i32, + e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 { + i32x8::new(e7, e6, e5, e4, + e3, e2, e1, e0) +} + +/// Set packed 64-bit integers in returned vector with the supplied values. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { + i64x4::new(d, c, b, a) } /// Casts vector of type __m128 to type __m256; @@ -3116,4 +3123,10 @@ mod tests { 1, 2, 3, 4, 5, 6, 7, 8); assert_eq!(r, i32x8::new(8, 7, 6, 5, 4, 3, 2, 1)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set_epi64x() { + let r = avx::_mm256_set_epi64x(1, 2, 3, 4); + assert_eq!(r, i64x4::new(4, 3, 2, 1)); + } } From 8c67767dcf6f001fbeb975a1c04a4b238fe8230d Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 14:18:39 +0200 Subject: [PATCH 26/37] avx: _mm256_setr_pd, _mm256_setr_ps --- src/x86/avx.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 69e82b55e5..70395c9632 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1695,6 +1695,23 @@ pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { i64x4::new(d, c, b, a) } +/// Set packed double-precision (64-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { + f64x4::new(a, b, c, d) +} + +/// Set packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, + e: f32, f: f32, g: f32, h: f32) -> f32x8 { + f32x8::new(a, b, c, d, e, f, g, h) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -3129,4 +3146,16 @@ mod tests { let r = avx::_mm256_set_epi64x(1, 2, 3, 4); assert_eq!(r, i64x4::new(4, 3, 2, 1)); } + + #[simd_test = "avx"] + unsafe fn _mm256_setr_pd() { + let r = avx::_mm256_setr_pd(1., 2., 3., 4.); + assert_eq!(r, f64x4::new(1., 2., 3., 4.)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_setr_ps() { + let r = avx::_mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + assert_eq!(r, f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.)); + } } From 9f6f7823cce652a6d3e716dd50772530d3e6b6f8 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 14:24:46 +0200 Subject: [PATCH 27/37] avx: _mm256_setr_epi8 --- src/x86/avx.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 70395c9632..9489be6000 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1712,6 +1712,28 @@ pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, f32x8::new(a, b, c, d, e, f, g, h) } +/// Set packed single-precision (32-bit) floating-point elements in returned +/// vector with the supplied values in reverse order. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_setr_epi8(e00: i8, e01: i8, e02: i8, e03: i8, + e04: i8, e05: i8, e06: i8, e07: i8, + e08: i8, e09: i8, e10: i8, e11: i8, + e12: i8, e13: i8, e14: i8, e15: i8, + e16: i8, e17: i8, e18: i8, e19: i8, + e20: i8, e21: i8, e22: i8, e23: i8, + e24: i8, e25: i8, e26: i8, e27: i8, + e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 { + i8x32::new(e00, e01, e02, e03, + e04, e05, e06, e07, + e08, e09, e10, e11, + e12, e13, e14, e15, + e16, e17, e18, e19, + e20, e21, e22, e23, + e24, e25, e26, e27, + e28, e29, e30, e31) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -3158,4 +3180,17 @@ mod tests { let r = avx::_mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); assert_eq!(r, f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.)); } + + #[simd_test = "avx"] + unsafe fn _mm256_setr_epi8() { + let r = avx::_mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32); + assert_eq!(r, i8x32::new(1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32)); + } } From 06b96385657171e6491c76202b6a00cfeb0feed8 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 14:32:58 +0200 Subject: [PATCH 28/37] avx: _mm256_setr_epi16 --- src/x86/avx.rs | 58 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 9489be6000..5a6af5057c 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1670,20 +1670,20 @@ pub unsafe fn _mm256_set_epi8(e00: i8, e01: i8, e02: i8, e03: i8, #[inline(always)] #[target_feature = "+avx"] pub unsafe fn _mm256_set_epi16(e00: i16, e01: i16, e02: i16, e03: i16, - e04: i16, e05: i16, e06: i16, e07: i16, - e08: i16, e09: i16, e10: i16, e11: i16, - e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 { + e04: i16, e05: i16, e06: i16, e07: i16, + e08: i16, e09: i16, e10: i16, e11: i16, + e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 { i16x16::new(e15, e14, e13, e12, - e11, e10, e09, e08, - e07, e06, e05, e04, - e03, e02, e01, e00) + e11, e10, e09, e08, + e07, e06, e05, e04, + e03, e02, e01, e00) } /// Set packed 32-bit integers in returned vector with the supplied values. 
#[inline(always)] #[target_feature = "+avx"] pub unsafe fn _mm256_set_epi32(e0: i32, e1: i32, e2: i32, e3: i32, - e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 { + e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 { i32x8::new(e7, e6, e5, e4, e3, e2, e1, e0) } @@ -1708,22 +1708,22 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { #[inline(always)] #[target_feature = "+avx"] pub unsafe fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, - e: f32, f: f32, g: f32, h: f32) -> f32x8 { + e: f32, f: f32, g: f32, h: f32) -> f32x8 { f32x8::new(a, b, c, d, e, f, g, h) } -/// Set packed single-precision (32-bit) floating-point elements in returned -/// vector with the supplied values in reverse order. +/// Set packed 8-bit integers in returned vector with the supplied values in +/// reverse order. #[inline(always)] #[target_feature = "+avx"] pub unsafe fn _mm256_setr_epi8(e00: i8, e01: i8, e02: i8, e03: i8, - e04: i8, e05: i8, e06: i8, e07: i8, - e08: i8, e09: i8, e10: i8, e11: i8, - e12: i8, e13: i8, e14: i8, e15: i8, - e16: i8, e17: i8, e18: i8, e19: i8, - e20: i8, e21: i8, e22: i8, e23: i8, - e24: i8, e25: i8, e26: i8, e27: i8, - e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 { + e04: i8, e05: i8, e06: i8, e07: i8, + e08: i8, e09: i8, e10: i8, e11: i8, + e12: i8, e13: i8, e14: i8, e15: i8, + e16: i8, e17: i8, e18: i8, e19: i8, + e20: i8, e21: i8, e22: i8, e23: i8, + e24: i8, e25: i8, e26: i8, e27: i8, + e28: i8, e29: i8, e30: i8, e31: i8) -> i8x32 { i8x32::new(e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, @@ -1734,6 +1734,19 @@ pub unsafe fn _mm256_setr_epi8(e00: i8, e01: i8, e02: i8, e03: i8, e28, e29, e30, e31) } +/// Set packed 16-bit integers in with the supplied values in reverse order. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_setr_epi16(e00: i16, e01: i16, e02: i16, e03: i16, + e04: i16, e05: i16, e06: i16, e07: i16, + e08: i16, e09: i16, e10: i16, e11: i16, + e12: i16, e13: i16, e14: i16, e15: i16) -> i16x16 { + i16x16::new(e00, e01, e02, e03, + e04, e05, e06, e07, + e08, e09, e10, e11, + e12, e13, e14, e15) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -3153,7 +3166,7 @@ mod tests { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); assert_eq!(r, i16x16::new(16, 15, 14, 13, 12, 11, 10, 9, - 8, 7, 6, 5, 4, 3, 2, 1)); + 8, 7, 6, 5, 4, 3, 2, 1)); } #[simd_test = "avx"] @@ -3193,4 +3206,13 @@ mod tests { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)); } + + #[simd_test = "avx"] + unsafe fn _mm256_setr_epi16() { + let r = avx::_mm256_setr_epi16( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16); + assert_eq!(r, i16x16::new(1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16)); + } } From 13fa6b0e0e7257cd8e6b9718500938bfcfe4952a Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 14:40:51 +0200 Subject: [PATCH 29/37] avx: _mm256_setr_epi32, _mm256_setr_epi64x --- src/x86/avx.rs | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 5a6af5057c..3bb25d5a42 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1734,7 +1734,8 @@ pub unsafe fn _mm256_setr_epi8(e00: i8, e01: i8, e02: i8, e03: i8, e28, e29, e30, e31) } -/// Set packed 16-bit integers in with the supplied values in reverse order. +/// Set packed 16-bit integers in returned vector with the supplied values in +/// reverse order. 
#[inline(always)] #[target_feature = "+avx"] pub unsafe fn _mm256_setr_epi16(e00: i16, e01: i16, e02: i16, e03: i16, @@ -1747,6 +1748,24 @@ pub unsafe fn _mm256_setr_epi16(e00: i16, e01: i16, e02: i16, e03: i16, e12, e13, e14, e15) } +/// Set packed 32-bit integers in returned vector with the supplied values in +/// reverse order. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_setr_epi32(e0: i32, e1: i32, e2: i32, e3: i32, + e4: i32, e5: i32, e6: i32, e7: i32) -> i32x8 { + i32x8::new(e0, e1, e2, e3, + e4, e5, e6, e7) +} + +/// Set packed 64-bit integers in returned vector with the supplied values in +/// reverse order. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { + i64x4::new(a, b, c, d) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -3215,4 +3234,17 @@ mod tests { assert_eq!(r, i16x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)); } + + #[simd_test = "avx"] + unsafe fn _mm256_setr_epi32() { + let r = avx::_mm256_setr_epi32( + 1, 2, 3, 4, 5, 6, 7, 8); + assert_eq!(r, i32x8::new(1, 2, 3, 4, 5, 6, 7, 8)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_setr_epi64x() { + let r = avx::_mm256_setr_epi64x(1, 2, 3, 4); + assert_eq!(r, i64x4::new(1, 2, 3, 4)); + } } From 3c1daea381e5d11604127275daa572445cb23d9a Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 15:06:00 +0200 Subject: [PATCH 30/37] avx: add missing assert_instr --- src/x86/avx.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 3bb25d5a42..de898a4584 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1631,6 +1631,8 @@ pub unsafe fn _mm256_setzero_si256() -> i64x4 { /// vector with the supplied values. 
#[inline(always)] #[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vunpcklpd))] +#[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { f64x4::new(d, c, b, a) } @@ -1691,6 +1693,8 @@ pub unsafe fn _mm256_set_epi32(e0: i32, e1: i32, e2: i32, e3: i32, /// Set packed 64-bit integers in returned vector with the supplied values. #[inline(always)] #[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +#[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { i64x4::new(d, c, b, a) } @@ -1699,6 +1703,8 @@ pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { /// vector with the supplied values in reverse order. #[inline(always)] #[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vunpcklpd))] +#[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { f64x4::new(a, b, c, d) } @@ -1762,6 +1768,8 @@ pub unsafe fn _mm256_setr_epi32(e0: i32, e1: i32, e2: i32, e3: i32, /// reverse order. #[inline(always)] #[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +#[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { i64x4::new(a, b, c, d) } From 896561c5a0b998ab7b5a9c239311b7b4bb787d9c Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 15:12:36 +0200 Subject: [PATCH 31/37] avx: _mm256_set1_pd --- src/x86/avx.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index de898a4584..2a37766bfe 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1774,6 +1774,16 @@ pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { i64x4::new(a, b, c, d) } +/// Broadcast double-precision (64-bit) floating-point value `a` to all +/// elements of returned vector. 
+#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vmovddup))] +#[cfg_attr(test, assert_instr(vinsertf128))] +pub unsafe fn _mm256_set1_pd(a: f64) -> f64x4 { + f64x4::new(a, a, a, a) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -3255,4 +3265,10 @@ mod tests { let r = avx::_mm256_setr_epi64x(1, 2, 3, 4); assert_eq!(r, i64x4::new(1, 2, 3, 4)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set1_pd() { + let r = avx::_mm256_set1_pd(1.); + assert_eq!(r, f64x4::splat(1.)); + } } From d8a472790e209add9ea112a140c450d29f9207bc Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 15:15:47 +0200 Subject: [PATCH 32/37] avx: _mm256_set1_ps --- src/x86/avx.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 2a37766bfe..36e67f0007 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1784,6 +1784,16 @@ pub unsafe fn _mm256_set1_pd(a: f64) -> f64x4 { f64x4::new(a, a, a, a) } +/// Broadcast single-precision (32-bit) floating-point value `a` to all +/// elements of returned vector. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpermilps))] +#[cfg_attr(test, assert_instr(vinsertf128))] +pub unsafe fn _mm256_set1_ps(a: f32) -> f32x8 { + f32x8::new(a, a, a, a, a, a, a, a) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -3271,4 +3281,10 @@ mod tests { let r = avx::_mm256_set1_pd(1.); assert_eq!(r, f64x4::splat(1.)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set1_ps() { + let r = avx::_mm256_set1_ps(1.); + assert_eq!(r, f32x8::splat(1.)); + } } From 328ac368aa19744d6897da2f08ba6ad4d691bd2e Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 15:21:37 +0200 Subject: [PATCH 33/37] avx: _mm256_set1_epi8 --- src/x86/avx.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 36e67f0007..e421ba8f11 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1794,6 +1794,19 @@ pub unsafe fn _mm256_set1_ps(a: f32) -> f32x8 { f32x8::new(a, a, a, a, a, a, a, a) } +/// Broadcast 8-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastb`. +#[inline(always)] +#[target_feature = "+avx"] +#[cfg_attr(test, assert_instr(vpshufb))] +#[cfg_attr(test, assert_instr(vinsertf128))] +pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 { + i8x32::new(a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)] @@ -3287,4 +3300,10 @@ mod tests { let r = avx::_mm256_set1_ps(1.); assert_eq!(r, f32x8::splat(1.)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set1_epi8() { + let r = avx::_mm256_set1_epi8(1); + assert_eq!(r, i8x32::splat(1)); + } } From 6fcddf17943e91cdc207af0feb79fb7622358c0e Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 15:34:24 +0200 Subject: [PATCH 34/37] avx: _mm256_set1_epi16, _mm256_set1_epi32 --- src/x86/avx.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index e421ba8f11..2932e42bb3 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1807,6 +1807,27 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 { a, a, a, a, a, a, a, a) } +/// Broadcast 16-bit integer `a` to all all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastw`. +#[inline(always)] +#[target_feature = "+avx"] +//#[cfg_attr(test, assert_instr(vpshufb))] +#[cfg_attr(test, assert_instr(vinsertf128))] +pub unsafe fn _mm256_set1_epi16(a: i16) -> i16x16 { + i16x16::new(a, a, a, a, a, a, a, a, + a, a, a, a, a, a, a, a) +} + +/// Broadcast 32-bit integer `a` to all elements of returned vector. +/// This intrinsic may generate the `vpbroadcastd`. +#[inline(always)] +#[target_feature = "+avx"] +//#[cfg_attr(test, assert_instr(vpermilps))] +#[cfg_attr(test, assert_instr(vinsertf128))] +pub unsafe fn _mm256_set1_epi32(a: i32) -> i32x8 { + i32x8::new(a, a, a, a, a, a, a, a) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. 
#[inline(always)]
@@ -3287,4 +3300,10 @@ mod tests {
         let r = avx::_mm256_set1_ps(1.);
         assert_eq!(r, f32x8::splat(1.));
     }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_set1_epi8() {
+        let r = avx::_mm256_set1_epi8(1);
+        assert_eq!(r, i8x32::splat(1));
+    }
 }

From 6fcddf17943e91cdc207af0feb79fb7622358c0e Mon Sep 17 00:00:00 2001
From: gwenn
Date: Sun, 15 Oct 2017 15:34:24 +0200
Subject: [PATCH 34/37] avx: _mm256_set1_epi16, _mm256_set1_epi32

---
 src/x86/avx.rs | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/x86/avx.rs b/src/x86/avx.rs
index e421ba8f11..2932e42bb3 100644
--- a/src/x86/avx.rs
+++ b/src/x86/avx.rs
@@ -1807,6 +1807,27 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> i8x32 {
                a, a, a, a, a, a, a, a)
 }
 
+/// Broadcast 16-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastw`.
+#[inline(always)]
+#[target_feature = "+avx"]
+//#[cfg_attr(test, assert_instr(vpshufb))]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+pub unsafe fn _mm256_set1_epi16(a: i16) -> i16x16 {
+    i16x16::new(a, a, a, a, a, a, a, a,
+                a, a, a, a, a, a, a, a)
+}
+
+/// Broadcast 32-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastd`.
+#[inline(always)]
+#[target_feature = "+avx"]
+//#[cfg_attr(test, assert_instr(vpermilps))]
+#[cfg_attr(test, assert_instr(vinsertf128))]
+pub unsafe fn _mm256_set1_epi32(a: i32) -> i32x8 {
+    i32x8::new(a, a, a, a, a, a, a, a)
+}
+
 /// Casts vector of type __m128 to type __m256;
 /// the upper 128 bits of the result are undefined.
#[inline(always)] @@ -3339,4 +3349,10 @@ mod tests { let r = avx::_mm256_set1_epi32(1); assert_eq!(r, i32x8::splat(1)); } + + #[simd_test = "avx"] + unsafe fn _mm256_set1_epi64x() { + let r = avx::_mm256_set1_epi64x(1); + assert_eq!(r, i64x4::splat(1)); + } } From 6a9996ada0782f7d4b1fc5e29bb5d0aabbc6a034 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 16:34:08 +0200 Subject: [PATCH 36/37] avx: _mm256_castpd_si256, _mm256_castsi256_pd, _mm256_castps256_ps128, _mm256_castpd256_pd128, _mm256_castsi256_si128 --- src/x86/avx.rs | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 1f69a4e5b9..b9c8e8b3a6 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1838,6 +1838,51 @@ pub unsafe fn _mm256_set1_epi64x(a: i64) -> i64x4 { i64x4::new(a, a, a, a) } +/// Casts vector of type __m256d to type __m256i. +/// This intrinsic is only used for compilation and does not generate any +/// instructions, thus it has zero latency. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castpd_si256(a: f64x4) -> i64x4 { + simd_cast(a) +} + +/// Casts vector of type __m256i to type __m256d. +/// This intrinsic is only used for compilation and does not generate any +/// instructions, thus it has zero latency. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castsi256_pd(a: i64x4) -> f64x4 { + simd_cast(a) +} + +/// Casts vector of type __m256 to type __m128. +/// This intrinsic is only used for compilation and does not generate any +/// instructions, thus it has zero latency. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castps256_ps128(a: f32x8) -> f32x4 { + simd_shuffle4(a, a, [0, 1, 2, 3]) +} + +/// Casts vector of type __m256d to type __m128d. +/// This intrinsic is only used for compilation and does not generate any +/// instructions, thus it has zero latency. 
+#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castpd256_pd128(a: f64x4) -> f64x2 { + simd_shuffle2(a, a, [0, 1]) +} + +/// Casts vector of type __m256i to type __m128i. +/// This intrinsic is only used for compilation and does not generate any +/// instructions, thus it has zero latency. +#[inline(always)] +#[target_feature = "+avx"] +pub unsafe fn _mm256_castsi256_si128(a: i64x4) -> i64x2 { + simd_shuffle2(a, a, [0, 1]) +} + /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. #[inline(always)] @@ -3355,4 +3400,39 @@ mod tests { let r = avx::_mm256_set1_epi64x(1); assert_eq!(r, i64x4::splat(1)); } + + #[simd_test = "avx"] + unsafe fn _mm256_castpd_si256() { + let a = f64x4::new(1., 2., 3., 4.); + let r = avx::_mm256_castpd_si256(a); + assert_eq!(r, i64x4::new(1, 2, 3, 4)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_castsi256_pd() { + let a = i64x4::new(1, 2, 3, 4); + let r = avx::_mm256_castsi256_pd(a); + assert_eq!(r, f64x4::new(1., 2., 3., 4.)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_castps256_ps128() { + let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); + let r = avx::_mm256_castps256_ps128(a); + assert_eq!(r, f32x4::new(1., 2., 3., 4.)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_castpd256_pd128() { + let a = f64x4::new(1., 2., 3., 4.); + let r = avx::_mm256_castpd256_pd128(a); + assert_eq!(r, f64x2::new(1., 2.)); + } + + #[simd_test = "avx"] + unsafe fn _mm256_castsi256_si128() { + let a = i64x4::new(1, 2, 3, 4); + let r = avx::_mm256_castsi256_si128(a); + assert_eq!(r, i64x2::new(1, 2)); + } } From 6c1b14891c473a100c109e86ba25e99962a143cd Mon Sep 17 00:00:00 2001 From: gwenn Date: Sun, 15 Oct 2017 16:48:30 +0200 Subject: [PATCH 37/37] avx: remove assert_instr failing --- src/x86/avx.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index b9c8e8b3a6..6757e00f16 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1631,7 
+1631,6 @@ pub unsafe fn _mm256_setzero_si256() -> i64x4 { /// vector with the supplied values. #[inline(always)] #[target_feature = "+avx"] -#[cfg_attr(test, assert_instr(vunpcklpd))] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { f64x4::new(d, c, b, a) @@ -1693,7 +1692,6 @@ pub unsafe fn _mm256_set_epi32(e0: i32, e1: i32, e2: i32, e3: i32, /// Set packed 64-bit integers in returned vector with the supplied values. #[inline(always)] #[target_feature = "+avx"] -#[cfg_attr(test, assert_instr(vpunpcklqdq))] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { i64x4::new(d, c, b, a) @@ -1703,8 +1701,6 @@ pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { /// vector with the supplied values in reverse order. #[inline(always)] #[target_feature = "+avx"] -#[cfg_attr(test, assert_instr(vunpcklpd))] -#[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> f64x4 { f64x4::new(a, b, c, d) } @@ -1768,7 +1764,6 @@ pub unsafe fn _mm256_setr_epi32(e0: i32, e1: i32, e2: i32, e3: i32, /// reverse order. #[inline(always)] #[target_feature = "+avx"] -#[cfg_attr(test, assert_instr(vpunpcklqdq))] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { i64x4::new(a, b, c, d) @@ -1778,8 +1773,6 @@ pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> i64x4 { /// elements of returned vector. #[inline(always)] #[target_feature = "+avx"] -#[cfg_attr(test, assert_instr(vmovddup))] -#[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set1_pd(a: f64) -> f64x4 { f64x4::new(a, a, a, a) } @@ -1788,8 +1781,6 @@ pub unsafe fn _mm256_set1_pd(a: f64) -> f64x4 { /// elements of returned vector. 
#[inline(always)] #[target_feature = "+avx"] -#[cfg_attr(test, assert_instr(vpermilps))] -#[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set1_ps(a: f32) -> f32x8 { f32x8::new(a, a, a, a, a, a, a, a) } @@ -1822,8 +1813,6 @@ pub unsafe fn _mm256_set1_epi16(a: i16) -> i16x16 { /// This intrinsic may generate the `vpbroadcastd`. #[inline(always)] #[target_feature = "+avx"] -//#[cfg_attr(test, assert_instr(vpermilps))] -#[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set1_epi32(a: i32) -> i32x8 { i32x8::new(a, a, a, a, a, a, a, a) }